# Preparation

In [None]:
# Mount Google Drive at /content/drive; the dataset loaded below is read from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd, numpy as np
from plotly import express as px
#  pip install -U kaleido

In [None]:
# Location of the pickled classification table on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/CMG - Crystal Prediction Project/Ternary Materials Point Group Prediction/Data/NOMAD_2/Classification_Data_8.pkl"

# Load the table and discard every row that contains a missing value.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle(DATA_PATH).dropna()
print(df.shape)
df.head()

(1381099, 24)


Unnamed: 0,Atomic Number 1,Atomic Number 2,Atomic Number 3,Element_1,Element_2,Element_3,Coefficient 1,Coefficient 2,Coefficient 3,formula_reduced,bravais_lattice,crystal_system,space_group_number,point_group,lattice_parameters,Oxidation 1,Oxidation 2,Oxidation 3,IonicRadius_1,IonicRadius_2,IonicRadius_3,IonizationPot1st_1,IonizationPot1st_2,IonizationPot1st_3
0,3,5,41,Li,B,Nb,4,1,1,BLi4Nb,cF,cubic,216,-43m,"{'a': 6.71860936e-10, 'b': 6.71860936e-10, 'c'...",1,-1,-3,0.75544,1.048943,1.842691,5.3917,8.298,6.7589
1,4,5,41,Be,B,Nb,4,1,1,BBe4Nb,cF,cubic,216,-43m,"{'a': 6.119642980000002e-10, 'b': 6.1196429800...",1,-1,-3,0.643658,1.048943,1.842691,9.3226,8.298,6.7589
2,4,5,41,Be,B,Nb,2,1,1,BBe2Nb,cF,cubic,216,-43m,"{'a': 5.458027e-10, 'b': 5.458027e-10, 'c': 5....",2,-1,-3,0.45208,1.048943,1.842691,9.3226,8.298,6.7589
3,4,5,41,Be,B,Nb,2,1,1,BBe2Nb,cF,cubic,225,m-3m,"{'a': 5.420156379999999e-10, 'b': 5.4201563799...",2,-1,-3,0.45208,1.048943,1.842691,9.3226,8.298,6.7589
4,4,5,41,Be,B,Nb,2,1,1,BBe2Nb,mS,monoclinic,12,2/m,"{'a': 4.660283620000524e-10, 'b': 8.41781306e-...",2,-1,-3,0.45208,1.048943,1.842691,9.3226,8.298,6.7589


**Note:** Relabel `trigonal` entries of `crystal_system` as `hexagonal`.

In [None]:
# Relabel "trigonal" as "hexagonal". The result is assigned back to the column
# instead of calling `replace(..., inplace=True)` on `df['crystal_system']`:
# that chained in-place call acts on a temporary Series, which warns on recent
# pandas and leaves `df` unchanged once Copy-on-Write is enabled.
df['crystal_system'] = df['crystal_system'].replace(to_replace="trigonal", value="hexagonal")

In [None]:
# Column names grouped by property; every group has three columns (suffix 1-3).
_ATOMIC_NUMBER_COLS = ["Atomic Number 1", "Atomic Number 2", "Atomic Number 3"]
_COEFFICIENT_COLS = ["Coefficient 1", "Coefficient 2", "Coefficient 3"]
_IONIZATION_COLS = ["IonizationPot1st_1", "IonizationPot1st_2", "IonizationPot1st_3"]
_OXIDATION_COLS = ["Oxidation 1", "Oxidation 2", "Oxidation 3"]  # Used to be electronegativity
_IONIC_RADIUS_COLS = ["IonicRadius_1", "IonicRadius_2", "IonicRadius_3"]

# Model input columns. The atomic-number columns are not part of the feature set.
FEATURE_NAMES = _COEFFICIENT_COLS + _IONIZATION_COLS + _OXIDATION_COLS + _IONIC_RADIUS_COLS

# Columns made of the atomic numbers plus the coefficients.
GROUPERS = _ATOMIC_NUMBER_COLS + _COEFFICIENT_COLS

# Label column to predict. Alternatives tried earlier: 'crystal_system', 'bravais_lattice'.
Y_NAME = 'space_group_number'

Deduplicate rows on the feature columns plus the label column.

In [None]:
# Keep one row per distinct (features, label) combination.
dedup_columns = FEATURE_NAMES + [Y_NAME]
df_dedup = df[dedup_columns].drop_duplicates().copy()

In [None]:
# Number of distinct label values left after deduplication.
distinct_labels = df_dedup[Y_NAME].unique()
distinct_labels.shape

(207,)

Group by the feature columns only, collecting every label seen for a feature combination into a list.

In [None]:
# One row per distinct feature combination, with all of its labels collected in a list.
# This statement must run: the label-encoding and feature-matrix cells below read
# `df_grouped`, and with the line commented out a fresh kernel fails with NameError.
df_grouped = df_dedup.groupby(FEATURE_NAMES)[Y_NAME].apply(list).reset_index()

Count Distribution

In [None]:
# Bar chart of how many deduplicated rows carry each label value.
value_counts = df_dedup[Y_NAME].value_counts()
value_counts.index = value_counts.index.astype(str)  # string labels for the categorical axis

fig = px.bar(x=value_counts.index, y=value_counts.values)
fig.update_layout(
    width=3000,
    xaxis={'title': 'Space Group', 'tickmode': 'linear', 'tickangle': 60, 'type': 'category'},
    yaxis={'title': 'Count'},
)

Encode y variable

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each row's list of labels into a 0/1 indicator row (one column per label).
ml_binner = MultiLabelBinarizer()
y = ml_binner.fit_transform(df_grouped[Y_NAME])

# Training

In [None]:
# Feature matrix as a plain NumPy array, columns in FEATURE_NAMES order.
X = df_grouped[FEATURE_NAMES].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training split; y_train is the binarized label matrix.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [None]:
# Predict the label indicator matrix for the held-out rows.
y_pred = clf.predict(X_test)

In [None]:
# from sklearn.metrics import classification_report
# target_names = ml_binner.classes_
# print(classification_report(y_test, y_pred, target_names=[str(i) for i in target_names]))

# SG + Filtered Classes

In [None]:
# Keep only the labels that occur more than MIN_CLASS_COUNT times in the deduplicated data.
# A boolean mask on the counts selects them directly, with no need to turn False into NaN
# and drop it afterwards.
MIN_CLASS_COUNT = 300
class_counts = df_dedup[Y_NAME].value_counts()
accepted_classes = class_counts[class_counts > MIN_CLASS_COUNT].index.tolist()
df_filtered = df_dedup[df_dedup[Y_NAME].isin(accepted_classes)].copy()

print("Previous Data Size:", df_dedup.shape[0])
print("New Data Size:", df_filtered.shape[0])
print("Number of Classes Left:", len(accepted_classes))

Previous Data Size: 1381099
New Data Size: 1372271
Number of Classes Left: 50


In [None]:

# Number of rows removed by the class-frequency filter.
len(df_dedup) - len(df_filtered)

8828

In [None]:
# Bar chart of row counts per label after the rare labels were filtered out.
value_counts = df_filtered[Y_NAME].value_counts()
value_counts.index = value_counts.index.astype(str)  # string labels for the categorical axis

fig = px.bar(x=value_counts.index, y=value_counts.values)
fig.update_layout(
    width=1000,
    xaxis={'title': 'Space Group', 'tickmode': 'linear', 'tickangle': 60, 'type': 'category'},
    yaxis={'title': 'Count'},
)

fig.show()

**Training**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# One row per distinct feature combination, with all of its labels collected in a list.
df_grouped2 = (
    df_filtered
    .groupby(FEATURE_NAMES)[Y_NAME]
    .apply(list)
    .reset_index()
)

In [None]:
# Binarize the label lists, then split features and labels 80/20 with a fixed seed.
ml_binner = MultiLabelBinarizer()
y = ml_binner.fit_transform(df_grouped2[Y_NAME])
X = df_grouped2[FEATURE_NAMES].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Fit a 100-tree random forest on the training split of the filtered data.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [None]:
# Predict the label indicator matrix for the held-out rows.
y_pred = clf.predict(X_test)

In [None]:
# from sklearn.metrics import classification_report
# target_names = ml_binner.classes_
# print(classification_report(y_test, y_pred, target_names=[str(i) for i in target_names]))

# SG Filtered + Edge


In [None]:
Y_NAME = 'space_group_number'
# NOTE: this overwrites the column on `df` itself, so cells that read `df[Y_NAME]`
# see string labels if they are re-run after this one.
df[Y_NAME] = df[Y_NAME].astype(int).astype(str)

# Keep only the space groups that occur more than MIN_CLASS_COUNT times in the full table.
# A boolean mask on the counts selects them directly, with no need to turn False into NaN
# and drop it afterwards.
MIN_CLASS_COUNT = 300
class_counts = df[Y_NAME].value_counts()
accepted_classes = class_counts[class_counts > MIN_CLASS_COUNT].index.tolist()
df_filtered = df[df[Y_NAME].isin(accepted_classes)].copy()

print("Previous Data Size:", df.shape[0])
print("New Data Size:", df_filtered.shape[0])
print("Number of Classes Left:", len(accepted_classes))

Previous Data Size: 1381099
New Data Size: 1372271
Number of Classes Left: 50


In [None]:
# Bravais lattices present in the full table but absent after the space-group filter.
all_lattices = set(df['bravais_lattice'].unique())
kept_lattices = set(df_filtered['bravais_lattice'].unique())
print(all_lattices - kept_lattices)

# Row count of the 'cI' lattice in the full table.
lattice_counts = df['bravais_lattice'].value_counts()
lattice_counts['cI']

{'cI'}


328

In [None]:
# Distinct space groups and Bravais lattices left after filtering. Both counts are
# produced by a single expression so neither is lost (a cell displays only its last
# expression, so a bare first line would be discarded).
df_filtered[['space_group_number', 'bravais_lattice']].nunique()

(13,)

In [None]:
# From here on the label is the Bravais lattice joined to the space group with "-AND-".
Y_NAME = 'bl_sg'

lattice_text = df_filtered['bravais_lattice'].astype(str)
space_group_text = df_filtered['space_group_number'].astype(str)
df_filtered[Y_NAME] = lattice_text + '-AND-' + space_group_text

# Rebuild df_dedup for the combined label: one row per distinct (features, label) combination.
label_columns = FEATURE_NAMES + [Y_NAME]
df_dedup = df_filtered[label_columns].drop_duplicates().copy()

In [None]:
# Count the distinct space-group parts (the text after "-AND-") of the combined label.
label_parts = df_dedup[Y_NAME].str.split("-AND-")
space_group_part = label_parts.apply(lambda parts: parts[1])
space_group_part.unique().shape

(50,)

In [None]:
# Select combined labels by frequency using a boolean mask on the counts.
# With a threshold of 0 every label that appears in df_dedup passes, so this
# currently keeps all rows; raise the threshold to actually drop rare labels.
class_counts = df_dedup[Y_NAME].value_counts()
accepted_classes2 = class_counts[class_counts > 0].index.tolist()
df_filtered_bl_sg = df_dedup[df_dedup[Y_NAME].isin(accepted_classes2)].copy()
df_filtered_bl_sg[Y_NAME].unique().shape

(50,)

In [None]:
# Bar chart of row counts per combined Bravais-lattice / space-group label.
value_counts = df_filtered_bl_sg[Y_NAME].value_counts()
value_counts.index = value_counts.index.astype(str)  # string labels for the categorical axis

fig = px.bar(x=value_counts.index, y=value_counts.values)
fig.update_layout(
    width=1000,
    xaxis={
        'title': 'Bravais Lattice + Space Group',
        'tickmode': 'linear',
        'tickangle': 60,
        'type': 'category',
    },
    yaxis={'title': 'Count'},
)

fig.show()

In [None]:
# df_filtered_bl_sg['space_group_number'].unique().shape

In [None]:
# Show both the shape and the first rows. Only a cell's last expression is displayed,
# so the shape is printed and head() is left as the final expression.
print(df_filtered_bl_sg.shape)
df_filtered_bl_sg.head()

(1372271, 13)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# One row per distinct feature combination, with all of its combined labels collected in a list.
df_grouped2 = (
    df_filtered_bl_sg
    .groupby(FEATURE_NAMES)[Y_NAME]
    .apply(list)
    .reset_index()
)

In [None]:
# Binarize the label lists, then split features and labels 80/20 with a fixed seed.
ml_binner = MultiLabelBinarizer()
y = ml_binner.fit_transform(df_grouped2[Y_NAME])
X = df_grouped2[FEATURE_NAMES].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Fit a 100-tree random forest on the training split of the combined-label data.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [None]:
# Predict the label indicator matrix for the held-out rows.
y_pred = clf.predict(X_test)


In [None]:
from sklearn.metrics import classification_report

# Per-label precision / recall / F1 on the held-out rows.
# zero_division=0 states explicitly that an ill-defined score (e.g. a sample with no
# predicted labels) is reported as 0. That is the value the default setting already
# uses, so the numbers are unchanged and the "ill-defined" warning is no longer emitted.
target_names = ml_binner.classes_
print(classification_report(
    y_test,
    y_pred,
    target_names=[str(i) for i in target_names],
    zero_division=0,
))

              precision    recall  f1-score   support

    aP-AND-1       0.65      0.23      0.34       223
    aP-AND-2       0.70      0.27      0.39       275
  cF-AND-216       1.00      1.00      1.00     30627
  cF-AND-225       1.00      1.00      1.00     21313
  cF-AND-227       0.91      0.88      0.89       403
  cP-AND-221       1.00      1.00      1.00     20476
  hP-AND-156       0.96      0.97      0.97      4550
  hP-AND-164       0.81      0.36      0.50       195
  hP-AND-186       0.81      0.39      0.53        90
  hP-AND-187       0.72      0.60      0.66      1703
  hP-AND-189       0.86      0.36      0.51       176
  hP-AND-191       0.92      0.55      0.69       102
  hP-AND-193       0.96      0.68      0.80        78
  hP-AND-194       0.77      0.35      0.48       313
  hR-AND-148       0.77      0.70      0.74       274
  hR-AND-160       0.37      0.15      0.21      3823
  hR-AND-166       0.99      0.99      0.99     13762
   mP-AND-10       0.93    


Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.

