In [1]:
# Requires scikit-learn 0.24; install it from the Anaconda Prompt with: conda install scikit-learn=0.24
import numpy as np
import pandas as pd
import sklearn
from inspect import getmembers, isfunction
from rdkit import Chem
from rdkit.Chem import Descriptors

from tqdm.notebook import tqdm
# Add more if needed

# Import Data

In [100]:
X = np.loadtxt("X_train_cleaned.txt")
X_T = np.loadtxt("X_test_cleaned.txt")

# Classification

In [None]:
y = pd.read_csv("train_crystals.csv")["is_centrosymmetric"].astype(int)
y = y.to_numpy()

In [None]:
# Split Data Train and Test set
# train_test_split is imported here because this is its first use when the
# notebook is run top to bottom.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [28]:
# Evaluation Metric
from sklearn.metrics import f1_score

# Different Classifiers
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV

In [29]:
# General classifier train-and-evaluate helpers (plain and probability-calibrated)
def _make_classifier(classifier):
    """Instantiate `classifier`, asking for 4 parallel jobs when it supports that."""
    try:
        return classifier(n_jobs=4)
    except TypeError:
        # The constructor rejected the n_jobs keyword; use its defaults instead.
        # Only TypeError is caught so that unrelated failures are not hidden.
        return classifier()


def _fit_and_report_f1(cl, X_train, X_test, y_train, y_test):
    """Fit `cl`, print the macro-averaged F1 on the held-out split, return `cl`."""
    cl.fit(X_train, y_train)

    pred = cl.predict(X_test)
    f1 = f1_score(y_test, pred, average="macro")
    print("F1: {}".format(f1))

    return cl


def train_classifier(classifier, X_train, X_test, y_train, y_test):
    """Train a classifier and print its macro F1 score on the test split.

    Parameters
    ----------
    classifier : callable
        Estimator class (not an instance); it is called to build the model.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Data used for ``predict`` and for the F1 computation.

    Returns
    -------
    The fitted estimator.
    """
    return _fit_and_report_f1(
        _make_classifier(classifier), X_train, X_test, y_train, y_test
    )


def train_calibrated_classifier(classifier, X_train, X_test, y_train, y_test):
    """Like ``train_classifier``, but wraps the model in ``CalibratedClassifierCV``.

    NOTE(review): the cells below call this function, but its original
    definition is not present in the notebook. This is a reconstruction with
    default calibration settings -- confirm that it matches the version that
    produced the recorded F1 scores.

    Returns
    -------
    The fitted ``CalibratedClassifierCV`` wrapper.
    """
    # The base estimator is passed positionally so the call does not depend on
    # the keyword name of that parameter.
    calibrated = CalibratedClassifierCV(_make_classifier(classifier))
    return _fit_and_report_f1(calibrated, X_train, X_test, y_train, y_test)

In [56]:
MLP = train_calibrated_classifier(MLPClassifier, X_train, X_test, y_train, y_test)

F1: 0.6759848493771133


In [59]:
SupportVC = train_calibrated_classifier(SVC, X_train, X_test, y_train, y_test)

F1: 0.5937028518633188


In [55]:
RFF = train_calibrated_classifier(RandomForestClassifier, X_train, X_test, y_train, y_test)

F1: 0.9661170989548981


In [57]:
QDA = train_classifier(QuadraticDiscriminantAnalysis, X_train, X_test, y_train, y_test)



F1: 0.6300146408787296


In [58]:
XGB = train_classifier(XGBClassifier, X_train, X_test, y_train.ravel(), y_test.ravel())



F1: 0.6557498042938756


In [123]:
results = RFF.predict(X_T)
results_df = pd.DataFrame(results)

In [128]:
results_df[0].to_csv("task_2_predictions.csv", index=False, header = False)

# Task 4: Van der Waals

In [158]:
# Wide Outliers => Restrict to 95% quantile
y = pd.read_csv("train_distances.csv")

In [142]:
y["n_vdw_contacts"].describe()

count    13449.000000
mean        12.325154
std          9.013866
min          0.000000
25%          6.000000
50%         11.000000
75%         16.000000
max        152.000000
Name: n_vdw_contacts, dtype: float64

In [143]:
upper_lim = y.quantile(0.95)
y = y["n_vdw_contacts"].to_numpy()

In [149]:
for i, value in enumerate(y):
    if value > 27:
        y[i] = 27.0

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [153]:
# Evaluation Metric
from sklearn.metrics import mean_squared_error

# Import all Regression models
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [154]:
# General regressor train and evaluate function
def train_classifier(classifier, X_train, X_test, y_train, y_test):
    """Train a regressor and print its mean squared error on the test split.

    NOTE: the name ``train_classifier`` is kept because the cells below call it
    under that name. From this cell onward it replaces the F1-based helper of
    the same name defined in the classification section.

    Parameters
    ----------
    classifier : callable
        Estimator class (not an instance); it is called to build the model.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Data used for ``predict`` and for the MSE computation.

    Returns
    -------
    The fitted estimator.
    """
    try:
        cl = classifier(n_jobs=-1, verbose=1)
    except TypeError:
        # The constructor rejected n_jobs and/or verbose. The fallback drops
        # both keywords, so the model is built with its default settings.
        # Only TypeError is caught so that unrelated failures are not hidden.
        cl = classifier()
    cl.fit(X_train, y_train)

    pred = cl.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    print("MSE: {}".format(mse))

    return cl

In [68]:
KNN = train_classifier(KNeighborsRegressor, X_train, X_test, y_train, y_test)

MSE: 48.99594409753196


In [155]:
RFF = train_classifier(RandomForestRegressor, X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


MSE: 18.677428998323578


In [70]:
MLP = train_classifier(MLPRegressor, X_train, X_test, y_train, y_test)

MSE: 45.78143026631334


In [71]:
XGB = train_classifier(XGBRegressor, X_train, X_test, y_train, y_test)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


MSE: 38.04880208337785


In [156]:
results = RFF.predict(X_T)
results_df = pd.DataFrame(results)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [157]:
results_df[0].to_csv("task_4_predictions.csv", index=False, header=False)

# Task 1: Density

In [75]:
y = pd.read_csv("train_crystals.csv")["calculated_density"].to_numpy()

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [77]:
KNN = train_classifier(KNeighborsRegressor, X_train, X_test, y_train, y_test)

MSE: 0.014823041190540436


In [78]:
RFF = train_classifier(RandomForestRegressor, X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s


MSE: 0.004814517260200323


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [79]:
MLP = train_classifier(MLPRegressor, X_train, X_test, y_train, y_test)

MSE: 0.009630009848725406


In [80]:
XGB = train_classifier(XGBRegressor, X_train, X_test, y_train, y_test)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


MSE: 0.005037749191410411


In [82]:
results = RFF.predict(X_T)
results_df = pd.DataFrame(results)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [83]:
results_df[0].to_csv("task_1_predictions.csv", index=False, header=False)

# Task 3: Centroid distance distribution

In [84]:
y = pd.read_csv("train_centroid_distances.csv")["mean"].to_numpy()

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [86]:
KNN = train_classifier(KNeighborsRegressor, X_train, X_test, y_train, y_test)

MSE: 0.1603578904202768


In [87]:
RFF = train_classifier(RandomForestRegressor, X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


MSE: 0.12030304558746986


In [88]:
MLP = train_classifier(MLPRegressor, X_train, X_test, y_train, y_test)

MSE: 0.23947246427570817


In [89]:
XGB = train_classifier(XGBRegressor, X_train, X_test, y_train, y_test)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


MSE: 0.13278540255366358


In [91]:
results = RFF.predict(X_T)
results_df = pd.DataFrame(results)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [92]:
results_df[0].to_csv("task_3_predictions.csv", index=False, header=False)

# Bonus: Packing Coefficient

In [37]:
X = np.loadtxt("X_train_cleaned.txt")

array([0.66776931, 0.68927154, 0.72906373, ..., 0.71213622, 0.70870525,
       0.73043984])

In [38]:
y = pd.read_csv("train_crystals.csv")["packing_coefficient"].to_numpy()
y

array([0.66776931, 0.68927154, 0.72906373, ..., 0.71213622, 0.70870525,
       0.73043984])

In [39]:
# Split Data Train and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [40]:
KNN = train_classifier(KNeighborsRegressor, X_train, X_test, y_train, y_test)

MSE: 0.0008748744174802632


In [41]:
RFF = train_classifier(RandomForestRegressor, X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


MSE: 0.0006902875132228912


In [42]:
MLP = train_classifier(MLPRegressor, X_train, X_test, y_train, y_test)

MSE: 0.00877791799645195


In [43]:
XGB = train_classifier(XGBRegressor, X_train, X_test, y_train, y_test)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


MSE: 0.0007595619766306949


In [45]:
results = RFF.predict(X_T)
results_df = pd.DataFrame(results)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [46]:
results_df[0].to_csv("bonus_1_predictions.csv", index=False, header=False)

# Bonus: Cell Volume

In [93]:
y = pd.read_csv("train_crystals.csv")["cell_volume"].to_numpy()
y = y/1000

In [94]:
# Split Data Train and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [95]:
KNN = train_classifier(KNeighborsRegressor, X_train, X_test, y_train, y_test)

MSE: 0.25002193147636087


In [96]:
RFF = train_classifier(RandomForestRegressor, X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s


MSE: 0.21413465720305755


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [97]:
MLP = train_classifier(MLPRegressor, X_train, X_test, y_train, y_test)

MSE: 0.3299056541973046


In [98]:
XGB = train_classifier(XGBRegressor, X_train, X_test, y_train, y_test)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


MSE: 0.23135986317868082


In [103]:
results = RFF.predict(X_T)
results = results*1000
results_df = pd.DataFrame(results)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [105]:
results_df[0].to_csv("bonus_2_predictions.csv", index=False, header=False)