In [8]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import defaultdict

# Read the CSV file
df = pd.read_csv('Tg_OOD_MD.csv')

# Define the polymer groups and their characteristic substructures
polymer_groups = {
    'Polyethylene (PE)': '[CH2][CH2][CH2]',
    'Polypropylene (PP)': '[CH2][CH](C)[CH2]',
    'Polyvinyl Chloride (PVC)': '[CH2][CH](Cl)[CH2]',
    'Polystyrene (PS)': '[CH2][CH](c1ccccc1)[CH2]',
    'Polyamide (Nylon)': '[NX3][C](=[O])[C][NX3]',
    'Polyester': '[C](=[O])[O][CH2][CH2][O][C](=[O])',
    'Polyimide (PI)': '[NR0]1[C](=[O])[c]2[cX3][cX3][cX3][cX3][c]2[C](=[O])[N]1',
    'Polyurethane (PU)': '[NX3][C](=[O])[O][CH2]',
    'Polytetrafluoroethylene (PTFE)': '[C](F)(F)[C](F)(F)',
    'Polycarbonate (PC)': '[O][C](=[O])[O][c]1[cX3][cX3][c]([C]([CH3])([CH3]))[cX3][cX3]1',
    'Polyether Ether Ketone (PEEK)': '[O][c]1[cX3][cX3][c]([C](=[O])[c]2[cX3][cX3][c]([O])[cX3][cX3]2)[cX3][cX3]1',
    'Polydimethylsiloxane (PDMS)': '[Si]([CH3])([CH3])[O][Si]([CH3])([CH3])'
}

# Function to count polymer types in a SMILES string
def count_polymer_types(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return defaultdict(int)
    
    counts = defaultdict(int)
    for group, substructure in polymer_groups.items():
        pattern = Chem.MolFromSmarts(substructure)
        if mol.HasSubstructMatch(pattern):
            counts[group] += 1
    return counts

# Apply the function to each SMILES string in the dataframe
polymer_counts = df['Smiles'].apply(count_polymer_types)

# Convert the results to a dataframe
result_df = pd.DataFrame(polymer_counts.tolist(), index=df.index)

# Fill NaN values with 0
result_df = result_df.fillna(0)

# Count the total number for each group
total_counts = result_df.sum().sort_values(ascending=False)

# Print the total counts for each group
print("Total counts for each polymer group:")
for group, count in total_counts.items():
    print(f"{group}: {int(count)}")

# Calculate the percentage of each group
total_polymers = total_counts.sum()
percentages = (total_counts / total_polymers * 100).round(2)

print("\nPercentage of each polymer group:")
for group, percentage in percentages.items():
    print(f"{group}: {percentage}%")

# Save the results to a CSV file
results = pd.DataFrame({
    'Polymer Group': total_counts.index,
    'Count': total_counts.values,
    'Percentage': percentages.values
})
# results.to_csv('polymer_groups_summary.csv', index=False)

# print("\nSummary saved to 'polymer_groups_summary.csv'")

Total counts for each polymer group:
Polyethylene (PE): 232
Polyurethane (PU): 24
Polypropylene (PP): 20
Polyamide (Nylon): 19
Polytetrafluoroethylene (PTFE): 11
Polyester: 3
Polystyrene (PS): 2
Polyether Ether Ketone (PEEK): 1
Polydimethylsiloxane (PDMS): 1

Percentage of each polymer group:
Polyethylene (PE): 74.12%
Polyurethane (PU): 7.67%
Polypropylene (PP): 6.39%
Polyamide (Nylon): 6.07%
Polytetrafluoroethylene (PTFE): 3.51%
Polyester: 0.96%
Polystyrene (PS): 0.64%
Polyether Ether Ketone (PEEK): 0.32%
Polydimethylsiloxane (PDMS): 0.32%


In [12]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Read the CSV file
df = pd.read_csv('Tg_OOD_MD.csv')

# Define the polymer groups and their characteristic substructures
polymer_groups = {
    'Polyethylene (PE)': '[CH2][CH2][CH2]',
    'Polypropylene (PP)': '[CH2][CH](C)[CH2]'
}

# Function to check if a SMILES string matches a given polymer type
def is_polymer_type(smiles, substructure):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False
    pattern = Chem.MolFromSmarts(substructure)
    return mol.HasSubstructMatch(pattern)

# Create dataframes for PE and PP
pe_df = df[df['Smiles'].apply(lambda x: is_polymer_type(x, polymer_groups['Polyethylene (PE)']))]
pp_df = df[df['Smiles'].apply(lambda x: is_polymer_type(x, polymer_groups['Polypropylene (PP)']))]

# Save PE data to CSV
pe_df.to_csv('polyethylene_data_MD.csv', index=False)
print(f"Saved {len(pe_df)} Polyethylene (PE) entries to 'polyethylene_data_MD.csv'")

# Save PP data to CSV
pp_df.to_csv('polypropylene_data_MD.csv', index=False)
print(f"Saved {len(pp_df)} Polypropylene (PP) entries to 'polypropylene_data_MD.csv'")

# Print some statistics
print("\nStatistics:")
print(f"Total entries in original dataset: {len(df)}")
print(f"Entries classified as PE: {len(pe_df)} ({len(pe_df)/len(df)*100:.2f}%)")
print(f"Entries classified as PP: {len(pp_df)} ({len(pp_df)/len(df)*100:.2f}%)")

# Check for overlap
overlap = set(pe_df.index) & set(pp_df.index)
if overlap:
    print(f"\nWarning: {len(overlap)} entries are classified as both PE and PP.")
    print("This could be due to complex polymer structures or limitations in the classification method.")
else:
    print("\nNo entries are classified as both PE and PP.")

Saved 232 Polyethylene (PE) entries to 'polyethylene_data_MD.csv'
Saved 20 Polypropylene (PP) entries to 'polypropylene_data_MD.csv'

Statistics:
Total entries in original dataset: 566
Entries classified as PE: 232 (40.99%)
Entries classified as PP: 20 (3.53%)

This could be due to complex polymer structures or limitations in the classification method.
