In [None]:
from google.colab import drive
import pandas as pd, numpy as np
from plotly import express as px
import plotly.io as pio

# Make 'plotly_white' the default template for every figure created below.
pio.templates.default = 'plotly_white'

#  pip install -U kaleido
# NOTE(review): presumably kaleido is the backend fig.write_image() needs for the
# SVG exports further down — confirm and install it before running those cells.

# Mount the user's Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

# Helper Methods

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, matthews_corrcoef, confusion_matrix

def prepare_train_test(df):
    """Binarize the label lists of ``df`` and split features/labels 80/20.

    Reads two module-level names: ``Y_NAME`` (the column holding one list of
    labels per row) and ``FEATURE_NAMES`` (the feature columns).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, ml_binner)`` where
        ``ml_binner`` is the fitted ``MultiLabelBinarizer``.
    """
    label_sets = df[Y_NAME]

    # Fit on the full label column, then encode that same column.
    binarizer = MultiLabelBinarizer()
    binarizer.fit(label_sets)
    targets = binarizer.transform(df[Y_NAME])

    features = df[FEATURE_NAMES].values

    # Fixed seed keeps the split reproducible between runs.
    split = train_test_split(features, targets, test_size=0.2, random_state=0)
    return (*split, binarizer)

def get_confusion(y_true, y_pred):
    """Return the binary confusion counts in the order ``(TN, TP, FN, FP)``.

    ``labels=[0, 1]`` forces a 2x2 matrix even when one of the two values is
    absent from the inputs.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Row 0 is true label 0, row 1 is true label 1; columns follow the predictions.
    (true_neg, false_pos), (false_neg, true_pos) = matrix
    return true_neg, true_pos, false_neg, false_pos


def calc_wba_wmcc_conf(y_true, y_pred):
    """Compute the support-weighted balanced accuracy and Matthews correlation coefficient.

    Every column of ``y_true`` / ``y_pred`` is scored as an independent binary
    problem. The per-column balanced accuracy and MCC are then averaged with
    weights equal to the column sum of ``y_true`` (the number of positives
    when the entries are 0/1 indicators). Confusion counts are pooled by
    summing them over all columns.

    Args:
        y_true: 2-D array of true indicators, indexed as ``y_true[:, idx]``.
        y_pred: 2-D array of predicted indicators with the same column layout.

    Returns:
        tuple: ``(weighted_balanced_acc, weighted_mcc, sum_tn, sum_tp, sum_fn, sum_fp)``.
        Both weighted scores are NaN when the total support is zero (no
        columns, or no positive entry anywhere in ``y_true``).
    """
    n_lst = []
    ba_lst = []
    mcc_lst = []
    tn_lst = []
    tp_lst = []
    fn_lst = []
    fp_lst = []
    for idx in range(y_true.shape[1]):
        class_true = y_true[:, idx]
        class_preds = y_pred[:, idx]

        # Support of this class: used as its weight in both averages.
        n_lst.append(class_true.sum())
        ba_lst.append(balanced_accuracy_score(class_true, class_preds, adjusted=False))
        mcc_lst.append(matthews_corrcoef(class_true, class_preds))

        TN, TP, FN, FP = get_confusion(class_true, class_preds)
        tn_lst.append(TN)
        tp_lst.append(TP)
        fn_lst.append(FN)
        fp_lst.append(FP)

    # The per-class supports add up to the grand total of y_true, so the
    # matrix does not have to be traversed again (the original did so twice).
    total_support = sum(n_lst)

    if total_support == 0:
        # Nothing to weight by: report NaN instead of dividing by zero.
        weighted_balanced_acc = float("nan")
        weighted_mcc = float("nan")
    else:
        weighted_balanced_acc = sum(n * ba for n, ba in zip(n_lst, ba_lst)) / total_support
        weighted_mcc = sum(n * mcc for n, mcc in zip(n_lst, mcc_lst)) / total_support

    sum_tn = sum(tn_lst)
    sum_tp = sum(tp_lst)
    sum_fn = sum(fn_lst)
    sum_fp = sum(fp_lst)

    return weighted_balanced_acc, weighted_mcc, sum_tn, sum_tp, sum_fn, sum_fp

def evaluate(y_true, y_pred):
    """Score multilabel predictions and return the metrics as a dict.

    Keys: ``WMCC``, ``micro_f1``, ``macro_f1``, ``WBA`` and the pooled
    confusion counts ``TN``, ``TP``, ``FN``, ``FP``.
    """
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')

    (weighted_ba, weighted_mcc,
     true_neg, true_pos, false_neg, false_pos) = calc_wba_wmcc_conf(y_true, y_pred)

    return dict(
        WMCC=weighted_mcc,
        micro_f1=f1_micro,
        macro_f1=f1_macro,
        WBA=weighted_ba,
        TN=true_neg,
        TP=true_pos,
        FN=false_neg,
        FP=false_pos,
    )

def filter_y(df, min_class_size, y_name=None):
    """Keep only the rows whose label occurs more than ``min_class_size`` times.

    Args:
        df: DataFrame containing the label column.
        min_class_size: Exclusive lower bound on the number of rows a label
            needs in order to be kept (a label with exactly this many rows
            is dropped).
        y_name: Name of the label column. When omitted, the module-level
            ``Y_NAME`` is used, which preserves the original calling style.

    Returns:
        A copy of the matching rows; ``df`` itself is left untouched.
    """
    if y_name is None:
        y_name = Y_NAME

    class_sizes = df[y_name].value_counts()
    # A boolean mask selects the large-enough classes directly.
    accepted_classes = class_sizes[class_sizes > min_class_size].index.tolist()
    return df[df[y_name].isin(accepted_classes)].copy()

# Preprocessing


In [9]:
# !unzip -u "/content/drive/MyDrive/CMG - Crystal Prediction Project/Binary Data/BinaryData.zip" -d "/content/"

def _read_clean_excel(path):
    """Read one feature workbook, dropping rows with missing values and exact duplicates."""
    raw = pd.read_excel(path)
    return raw.dropna().drop_duplicates()


df_sg = _read_clean_excel("/content/BinaryData/Binary_Features_for_SG.xlsx")
df_cs = _read_clean_excel("/content/BinaryData/Binary_Features_for_CS.xlsx")
df_pg = _read_clean_excel("/content/BinaryData/Binary_Features_for_PG.xlsx")
df_bl = _read_clean_excel("/content/BinaryData/Binary_Features_for_BL.xlsx")

# Report (rows, columns) of each cleaned table, in the order SG, CS, PG, BL.
for _loaded in (df_sg, df_cs, df_pg, df_bl):
    print(_loaded.shape)

(29966, 53)
(21552, 17)
(25558, 43)
(27440, 25)


In [10]:
# The columns at positions 1-10 of the CS table are used as features for every target.
FEATURE_NAMES = df_cs.columns[1:11].tolist()


def _name_hot_cells(frame):
    """Return the columns from position 11 onward with every 1 replaced by its column name."""
    own_names = pd.Series(frame.columns, frame.columns)
    return frame.iloc[:, 11:].replace(1, own_names)


def _first_nonzero(row):
    """Return the first value of ``row`` that differs from 0 (IndexError if there is none)."""
    return [cell for cell in list(row) if cell != 0][0]


def _group_labels(frame, label):
    """Gather the ``label`` values of rows with identical FEATURE_NAMES into one list per group."""
    return frame.groupby(FEATURE_NAMES)[label].apply(list).reset_index()


# SG (the only target whose decoded label is cast to str)
df_sg_med = _name_hot_cells(df_sg)
df_sg['SG'] = df_sg_med.agg(_first_nonzero, axis=1).astype(str)
df_sg = df_sg.loc[:, FEATURE_NAMES + ['SG']].copy()

# CS
df_cs_med = _name_hot_cells(df_cs)
df_cs['CS'] = df_cs_med.agg(_first_nonzero, axis=1)
df_cs = df_cs.loc[:, FEATURE_NAMES + ['CS']].copy()

# BL
df_bl_med = _name_hot_cells(df_bl)
df_bl['BL'] = df_bl_med.agg(_first_nonzero, axis=1)
df_bl = df_bl.loc[:, FEATURE_NAMES + ['BL']].copy()

# PG
df_pg_med = _name_hot_cells(df_pg)
df_pg['PG'] = df_pg_med.agg(_first_nonzero, axis=1)
df_pg = df_pg.loc[:, FEATURE_NAMES + ['PG']].copy()


# filter_y reads the global Y_NAME, so it is set before each call.
# Classes with 50 rows or fewer are dropped, then labels are grouped per feature vector.
Y_NAME = "SG"
df_filtered_sg = filter_y(df_sg, 50)
df_agg_sg = _group_labels(df_filtered_sg, Y_NAME)

Y_NAME = "PG"
df_filtered_pg = filter_y(df_pg, 50)
df_agg_pg = _group_labels(df_filtered_pg, Y_NAME)

Y_NAME = "BL"
df_filtered_bl = filter_y(df_bl, 50)
df_agg_bl = _group_labels(df_filtered_bl, Y_NAME)

Y_NAME = "CS"
df_filtered_cs = filter_y(df_cs, 50)
df_agg_cs = _group_labels(df_filtered_cs, Y_NAME)

In [None]:
def _draw_class_distribution(counts, title, bar_color, svg_name):
    """Draw one class-size bar chart, write it to ``svg_name``, display it and return the figure."""
    figure = px.bar(counts, title=title, width=1700)
    figure.update_xaxes(type='category', title='Label')
    figure.update_yaxes(title='Size')
    figure.update_traces(marker_color=bar_color)
    figure.update_layout(showlegend=False)
    figure.write_image(svg_name)
    figure.show()
    return figure


# Occurrences of each label after un-nesting the per-row label lists.
sg_count = df_agg_sg["SG"].explode().value_counts()
pg_count = df_agg_pg["PG"].explode().value_counts()
bl_count = df_agg_bl["BL"].explode().value_counts()
cs_count = df_agg_cs["CS"].explode().value_counts()


fig = _draw_class_distribution(
    sg_count, 'Space Group Class Distribution', '#29335C', "sg_class_distrib.svg")

fig = _draw_class_distribution(
    pg_count, 'Point Group Class Distribution', '#DB2B39', "pg_class_distrib.svg")

fig = _draw_class_distribution(
    bl_count, 'Bravais Lattice Class Distribution', '#539987', "bl_class_distrib.svg")

fig = _draw_class_distribution(
    cs_count, 'Crystal System Class Distribution', '#F3A712', "cs_class_distrib.svg")

# Train

In [None]:
# One random forest per target, run in the order CS, BL, PG, SG.
# The loop variable is the module-level Y_NAME itself, because
# prepare_train_test reads that global to find the label column.
# After the loop, X_train / X_test / y_train / y_test / ml_binner / clf / y_pred
# hold the values of the last (SG) run.
_eval_by_target = {}
for Y_NAME, _agg_frame in (
    ("CS", df_agg_cs),
    ("BL", df_agg_bl),
    ("PG", df_agg_pg),
    ("SG", df_agg_sg),
):
    # TRAIN
    X_train, X_test, y_train, y_test, ml_binner = prepare_train_test(_agg_frame)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # EVAL
    _scores = evaluate(y_test, y_pred)
    # Rescale the weighted MCC from [-1, 1] to [0, 1].
    _scores["WMCC"] = (_scores["WMCC"] + 1) / 2
    _eval_by_target[Y_NAME] = _scores
    # --------------------------

cs_eval_dict = _eval_by_target["CS"]
bl_eval_dict = _eval_by_target["BL"]
pg_eval_dict = _eval_by_target["PG"]
sg_eval_dict = _eval_by_target["SG"]

In [None]:
# For each target: number of distinct labels, then its normalised WMCC and WBA.
for _target, _agg, _metrics in (
    ("CS", df_agg_cs, cs_eval_dict),
    ("BL", df_agg_bl, bl_eval_dict),
    ("PG", df_agg_pg, pg_eval_dict),
    ("SG", df_agg_sg, sg_eval_dict),
):
    _distinct_labels = _agg[_target].explode().unique()
    print(_distinct_labels.shape[0])
    _summary = pd.DataFrame(_metrics, index=[_target])
    print(_summary[['WMCC', "WBA"]])
    print()

5
        WMCC       WBA
CS  0.829054  0.826354

13
        WMCC      WBA
BL  0.802032  0.79754

12
        WMCC       WBA
PG  0.810062  0.809006

23
        WMCC       WBA
SG  0.806043  0.803483



In [7]:
# pd.concat([df_agg_cs, df_agg_bl, df_agg_pg, df_agg_sg]).iloc[:, 0:10].drop_duplicates().shape[0]/50

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score, matthews_corrcoef
# Per-class metrics for whichever target was trained last: y_test, y_pred,
# y_train and ml_binner are the globals left behind by the most recent
# prepare_train_test / fit / predict run.
#
# Class names come from the fitted binarizer because column j of y_test / y_pred
# corresponds to ml_binner.classes_[j]. Deriving them from
# df_agg_cs[Y_NAME].explode().unique() returned labels in order of appearance
# (not the binarizer's column order) and raised KeyError whenever Y_NAME was
# not "CS".
classes = ml_binner.classes_
frame_dict = {
    "Class": [],
    "BA": [],
    "nMCC": [],
    "F1": [],
    "Size": []
}

for idx in range(y_test.shape[1]):
    class_test = y_test[:, idx]
    class_pred = y_pred[:, idx]
    class_name = classes[idx]
    ba = balanced_accuracy_score(class_test, class_pred)
    f1 = f1_score(class_test, class_pred)
    # MCC rescaled from [-1, 1] to [0, 1].
    mathew = (matthews_corrcoef(class_test, class_pred) + 1) / 2
    # "Size" is the class's column sum in the training split, not the test split.
    support = sum(y_train[:, idx])
    frame_dict['Class'].append(class_name)
    frame_dict['BA'].append(ba)
    frame_dict['F1'].append(f1)
    frame_dict['nMCC'].append(mathew)
    frame_dict['Size'].append(support)