In [None]:
# Mount Google Drive (Colab only) so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the imbalanced-algorithms repository into the working directory.
# NOTE(review): nothing in the visible cells imports from this clone — confirm it is still needed.
!git clone https://github.com/dialnd/imbalanced-algorithms.git

Cloning into 'imbalanced-algorithms'...
remote: Enumerating objects: 428, done.[K
remote: Total 428 (delta 0), reused 0 (delta 0), pack-reused 428[K
Receiving objects: 100% (428/428), 28.65 MiB | 21.88 MiB/s, done.
Resolving deltas: 100% (211/211), done.


In [None]:
!pip install imbalanced-learn
!git clone https://github.com/diliadis/mlsol.git

Cloning into 'mlsol'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 21 (delta 0), reused 2 (delta 0), pack-reused 18[K
Unpacking objects: 100% (21/21), done.


# Preparation

In [None]:
import pandas as pd, numpy as np
from plotly import express as px
#  pip install -U kaleido

In [None]:
# Location of the pickled classification table on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/CMG - Crystal Prediction Project/Ternary Materials Point Group Prediction/Data/NOMAD_2/Classification_Data_8.pkl"

# Load the table and discard every row containing a missing value.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle(DATA_PATH).dropna()
print(df.shape)
df.head()

(1381099, 24)


Unnamed: 0,Atomic Number 1,Atomic Number 2,Atomic Number 3,Element_1,Element_2,Element_3,Coefficient 1,Coefficient 2,Coefficient 3,formula_reduced,bravais_lattice,crystal_system,space_group_number,point_group,lattice_parameters,Oxidation 1,Oxidation 2,Oxidation 3,IonicRadius_1,IonicRadius_2,IonicRadius_3,IonizationPot1st_1,IonizationPot1st_2,IonizationPot1st_3
0,3,5,41,Li,B,Nb,4,1,1,BLi4Nb,cF,cubic,216,-43m,"{'a': 6.71860936e-10, 'b': 6.71860936e-10, 'c'...",1,-1,-3,0.75544,1.048943,1.842691,5.3917,8.298,6.7589
1,4,5,41,Be,B,Nb,4,1,1,BBe4Nb,cF,cubic,216,-43m,"{'a': 6.119642980000002e-10, 'b': 6.1196429800...",1,-1,-3,0.643658,1.048943,1.842691,9.3226,8.298,6.7589
2,4,5,41,Be,B,Nb,2,1,1,BBe2Nb,cF,cubic,216,-43m,"{'a': 5.458027e-10, 'b': 5.458027e-10, 'c': 5....",2,-1,-3,0.45208,1.048943,1.842691,9.3226,8.298,6.7589
3,4,5,41,Be,B,Nb,2,1,1,BBe2Nb,cF,cubic,225,m-3m,"{'a': 5.420156379999999e-10, 'b': 5.4201563799...",2,-1,-3,0.45208,1.048943,1.842691,9.3226,8.298,6.7589
4,4,5,41,Be,B,Nb,2,1,1,BBe2Nb,mS,monoclinic,12,2/m,"{'a': 4.660283620000524e-10, 'b': 8.41781306e-...",2,-1,-3,0.45208,1.048943,1.842691,9.3226,8.298,6.7589


**Note:** Replace `trigonal` with `hexagonal` in the `crystal_system` column.

In [None]:
# Relabel trigonal entries as hexagonal and assign the result back to the column.
# Calling replace(..., inplace=True) on the Series returned by df[...] is a
# chained in-place mutation: pandas deprecates it, and with Copy-on-Write
# enabled it no longer updates `df` at all.
df['crystal_system'] = df['crystal_system'].replace(to_replace="trigonal", value="hexagonal")

## Configure feature columns, grouping keys, and target

In [None]:
# Column groups that are shared between the lists below.
_COEFFICIENT_COLS = ["Coefficient 1", "Coefficient 2", "Coefficient 3"]
_ATOMIC_NUMBER_COLS = ["Atomic Number 1", "Atomic Number 2", "Atomic Number 3"]

# Model input columns. The atomic-number columns are currently excluded.
FEATURE_NAMES = _COEFFICIENT_COLS + [
    "IonizationPot1st_1", "IonizationPot1st_2", "IonizationPot1st_3",
    "Oxidation 1", "Oxidation 2", "Oxidation 3",  # Used to be electronegativity
    "IonicRadius_1", "IonicRadius_2", "IonicRadius_3",
]

# Grouping keys: atomic numbers followed by stoichiometric coefficients.
# NOTE(review): not referenced by any visible cell — confirm it is still used.
GROUPERS = _ATOMIC_NUMBER_COLS + _COEFFICIENT_COLS

# Target column. Alternatives kept for reference: 'crystal_system', 'bravais_lattice'.
Y_NAME = 'space_group_number'

## Filter out rare classes

In [None]:
# The integer cast below only works for a numeric label column, so the target
# is pinned to the space-group number in this cell as well.
Y_NAME = 'space_group_number'
# A class is kept only if it occurs strictly more than this many times.
MIN_CLASS_COUNT = 300

# Normalise labels to integer-valued strings (e.g. 216.0 -> "216").
df[Y_NAME] = df[Y_NAME].astype(int).astype(str)

# Pick the frequent classes with a boolean mask rather than the
# replace(False, NaN) -> dropna() detour; the value_counts ordering
# (most frequent first) is preserved.
class_counts = df[Y_NAME].value_counts()
accepted_classes = class_counts[class_counts > MIN_CLASS_COUNT].index.tolist()
df_filtered = df[df[Y_NAME].isin(accepted_classes)].copy()

print("Previous Data Size:", df.shape[0])
print("New Data Size:", df_filtered.shape[0])
print("Number of Classes Left:", len(accepted_classes))

Previous Data Size: 1381099
New Data Size: 1372271
Number of Classes Left: 50


In [None]:
# Keep only the model inputs plus the target, then drop rows that are exact
# duplicates across those columns.
selected_columns = [*FEATURE_NAMES, Y_NAME]
df_main = df_filtered.loc[:, selected_columns].drop_duplicates().copy()

In [None]:
# Preview the de-duplicated feature/target table.
df_main.head()

Unnamed: 0,Coefficient 1,Coefficient 2,Coefficient 3,IonizationPot1st_1,IonizationPot1st_2,IonizationPot1st_3,Oxidation 1,Oxidation 2,Oxidation 3,IonicRadius_1,IonicRadius_2,IonicRadius_3,space_group_number
0,4,1,1,5.3917,8.298,6.7589,1,-1,-3,0.75544,1.048943,1.842691,216
1,4,1,1,9.3226,8.298,6.7589,1,-1,-3,0.643658,1.048943,1.842691,216
2,2,1,1,9.3226,8.298,6.7589,2,-1,-3,0.45208,1.048943,1.842691,216
3,2,1,1,9.3226,8.298,6.7589,2,-1,-3,0.45208,1.048943,1.842691,225
4,2,1,1,9.3226,8.298,6.7589,2,-1,-3,0.45208,1.048943,1.842691,12


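# MLSOL+RFC
Oversample the multi-label training data with MLSOL, then fit a random forest classifier (RFC). MLSOL paper: https://arxiv.org/abs/1905.00609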

In [None]:
from mlsol.MLSOL import MLSOL
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Collapse rows that share an identical feature vector into one sample whose
# target is the list of every label observed for that vector.
df_grouped = (
    df_main
    .groupby(FEATURE_NAMES)[Y_NAME]
    .apply(list)
    .reset_index()
)

# Encode each label list as a multi-hot indicator row (one column per class).
label_lists = df_grouped[Y_NAME]
ml_binner = MultiLabelBinarizer().fit(label_lists)
y = ml_binner.transform(label_lists)
X = df_grouped[FEATURE_NAMES].values

# Hold out 20% of the grouped samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Oversampler applied to the training split in a later cell.
# NOTE(review): presumably perc_gen_instances is the share of synthetic samples
# to generate and k the neighbourhood size — confirm against the mlsol package.
mlsol = MLSOL(perc_gen_instances=0.2, k=5)

In [None]:
# Number of label columns in the multi-hot training target matrix.
y_train.shape[1]

50

In [None]:
# Oversample the training split only; the held-out split is left untouched.
# The recorded run of this cell took about 1 h 24 min.
X_aug, y_aug = mlsol.fit_resample(X_train, y_train)


100%|██████████| 51302/51302 [1:24:07<00:00, 10.16it/s]


# Training & Eval

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the oversampled training data.
# fit() returns the estimator itself, so it can be bound directly to `clf`;
# the trailing bare name displays the fitted model as the cell output.
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_aug, y_aug)
clf

RandomForestClassifier(random_state=0)

In [None]:
# Predict multi-hot labels for the held-out split with the model bound to `clf`.
# NOTE(review): a later cell rebinds `clf` and recomputes `y_pred` before any
# metric is computed, so these predictions are never evaluated when the
# notebook is run top to bottom.
y_pred = clf.predict(X_test)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit a seeded 100-tree extra-trees ensemble on the oversampled training data.
# NOTE(review): this rebinds `clf`, replacing whichever model it held before.
clf = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X_aug, y_aug)
clf

In [None]:
# Predict multi-hot labels for the held-out split with the model currently
# bound to `clf`; the classification report below reads this `y_pred`.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split, labelled with the
# class names learned by the binarizer.
# zero_division=0 yields the same scores as the default (ill-defined values
# become 0.0) but without emitting the "ill-defined" warning.
target_names = ml_binner.classes_
print(
    classification_report(
        y_test,
        y_pred,
        target_names=[str(i) for i in target_names],
        zero_division=0,
    )
)

              precision    recall  f1-score   support

           1       0.61      0.27      0.37       223
          10       0.93      0.98      0.96     12553
         107       0.99      1.00      0.99      4688
          11       0.55      0.24      0.33       114
         119       0.93      0.97      0.95     12325
          12       0.90      0.96      0.93     12619
         123       0.99      0.99      0.99     13785
         127       0.85      0.51      0.64       102
         129       0.71      0.29      0.41       162
         139       0.87      0.93      0.90     11762
          14       0.64      0.33      0.44       427
         140       0.74      0.43      0.54        79
         141       0.61      0.42      0.50       110
         148       0.74      0.73      0.73       274
          15       0.74      0.43      0.55       290
         156       0.96      0.97      0.97      4550
         160       0.37      0.16      0.22      3823
         164       0.72    


Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.

