In [1]:
import os
import pickle

import pandas as pd

# Load the training data from the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved row index — confirm whether the file should be read with index_col=0.
train = pd.read_csv('train.csv')

# Preview the first rows; as the last expression it is rendered as the cell output.
train.head()

Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,...,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2,EC1,EC2,EC3,EC4,EC5,EC6
0,0,323.390782,9.879918,5.875576,5.875576,4.304757,4.304757,2.754513,1.749203,0.0,...,4.794537,35.527357,0,0,1,1,0,0,0,0
1,1,273.723798,7.259037,4.441467,5.834958,3.285046,4.485235,2.201375,1.289775,45.135471,...,13.825658,44.70731,0,0,0,1,1,0,0,0
2,2,521.643822,10.911303,8.527859,11.050864,6.665291,9.519706,5.824822,1.770579,15.645394,...,17.964475,45.66012,0,0,1,1,0,0,1,0
3,3,567.431166,12.453343,7.089119,12.833709,6.478023,10.978151,7.914542,3.067181,95.639554,...,31.961948,87.509997,0,0,1,1,0,0,0,0
4,4,112.770735,4.414719,2.866236,2.866236,1.875634,1.875634,1.03645,0.727664,17.980451,...,9.589074,33.333333,2,2,1,0,1,1,1,0


In [12]:
# Columns of `train` used as model inputs when predicting the EC1 target.
featuresEC1 = [
    'BertzCT',
    'EState_VSA1',
    'EState_VSA2',
    'ExactMolWt',
    'HeavyAtomMolWt',
    'MinEStateIndex',
    'NumHeteroatoms',
    'PEOE_VSA14',
    'SMR_VSA10',
    'SMR_VSA5',
    'SlogP_VSA3',
]

# Columns of `train` used as model inputs when predicting the EC2 target.
featuresEC2 = [
    'BertzCT',
    'Chi1',
    'Chi2n',
    'ExactMolWt',
    'FpDensityMorgan3',
    'MinEStateIndex',
    'PEOE_VSA14',
    'SMR_VSA10',
    'fr_COO',
    'fr_COO2',
]

# One feature matrix per target, selected from the training frame.
X1, X2 = train[featuresEC1], train[featuresEC2]


In [3]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
from sklearn.exceptions import DataConversionWarning
# Globally silence scikit-learn's DataConversionWarning.
# NOTE(review): presumably raised because the targets are passed to the estimators
# as single-column frames rather than 1-D arrays — confirm, and prefer fixing the
# input shape over hiding the warning.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Registry of candidate models, keyed by the display name that is also used for
# the pickle file name. Every estimator that accepts a seed is given
# random_state=0 so that repeated runs produce the same ranking.
# Note: these are shared instances; fitting one mutates the object stored here.
classifiers = {
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=0),
    "Bagging": BaggingClassifier(n_estimators=10, random_state=0),
    "ExtraTrees (Gini)": ExtraTreesClassifier(criterion="gini", n_estimators=100, random_state=0),
    "ExtraTrees (Entropy)": ExtraTreesClassifier(criterion="entropy", n_estimators=100, random_state=0),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    "RandomForest": RandomForestClassifier(max_depth=2, random_state=0),
    # Seeded like the other estimators: without it the internal subsampling /
    # early-stopping split makes the score and the saved model vary per run.
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=0),
    "PassiveAggressive": PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3),
    "Ridge": RidgeClassifier(),
    "BernoulliNaiveBayes": BernoulliNB(),
    "GaussianNaiveBayes": GaussianNB(),
    "KNeighbors": KNeighborsClassifier(n_neighbors=3),
    "NearestCentroid": NearestCentroid(),
    "DecisionTree": DecisionTreeClassifier(random_state=0),
    "ExtraTree": ExtraTreeClassifier(random_state=0)
}

In [13]:
def test_classifier(classifier_type, X_train, X_test, y_train, y_test, t):
    """Fit one registered classifier, report its hold-out accuracy and pickle it.

    Args:
        classifier_type: Key into the module-level ``classifiers`` dict; also
            used as the pickle file name.
        X_train: Features used for fitting.
        X_test: Features used for evaluation.
        y_train: Targets used for fitting.
        y_test: Targets used for evaluation.
        t: Target label; used as the log prefix and as the suffix of the
            ``trainedmodels{t}`` output directory.

    Returns:
        ``[classifier_type, accuracy]`` where ``accuracy`` is a percentage.

    Note:
        The estimator stored in ``classifiers`` is fitted in place, so a later
        call with the same key refits that same object.
    """
    print(f"[{t}] Classifier", classifier_type, "training...")
    clf = classifiers[classifier_type]
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predicted) * 100
    print(f"[{t}] Classifier", classifier_type, "completed with accuracy", accuracy, '%')

    # Create the output directory on demand so a fresh checkout does not fail,
    # and use a context manager so the file handle is always closed.
    model_dir = f'trainedmodels{t}'
    os.makedirs(model_dir, exist_ok=True)
    with open(f'{model_dir}/{classifier_type}.pkl', 'wb') as model_file:
        pickle.dump(clf, model_file)
    # to read the model use below code
    # with open('filename.pkl', 'rb') as f:
    #     clf = pickle.load(f)
    return [classifier_type, accuracy]


def test_classifiers(X_train, X_test, y_train, y_test, t):
    """Evaluate every registered classifier and build a ranked text report.

    Each key of the module-level ``classifiers`` dict is passed to
    ``test_classifier`` in turn (sequentially), and the resulting accuracies
    are sorted from highest to lowest.

    Args:
        X_train: Features used for fitting.
        X_test: Features used for evaluation.
        y_train: Targets used for fitting.
        y_test: Targets used for evaluation.
        t: Target label forwarded to ``test_classifier`` and shown in the
            report header.

    Returns:
        A multi-line string: a separator line, a header, one
        ``"NN name accuracy%"`` line per classifier, and a closing separator.
    """
    results = [
        test_classifier(key, X_train, X_test, y_train, y_test, t)
        for key in classifiers
    ]

    # sorted() is stable, so classifiers with equal accuracy keep their
    # registry order.
    ranked = sorted(results, key=lambda item: item[1], reverse=True)

    separator = "---" * 20
    lines = [separator, f"[{t}] Results (larger accuracy better): "]
    for rank, (name, score) in enumerate(ranked, start=1):
        lines.append(str(rank).zfill(2) + ' ' + name + ' ' + '{:.2f}'.format(score) + '%')
    lines.append(separator)
    return "\n".join(lines)

# Train and rank every classifier once per target. Each target has its own
# feature matrix and its own split seed; the report is written next to the
# pickled models in trainedmodels<target>/results.txt.
for target, features, split_seed in (("EC1", X1, 2023), ("EC2", X2, 2024)):
    y = train[[target]]
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=split_seed)

    # Make sure the output directory exists before any model or report is written.
    os.makedirs(f"trainedmodels{target}", exist_ok=True)

    output = test_classifiers(X_train, X_test, y_train, y_test, target)
    with open(f"trainedmodels{target}/results.txt", "w") as f:
        f.write(output)

[EC1] Classifier AdaBoost training...
[EC1] Classifier AdaBoost completed with accuracy 69.20485175202157 %
[EC1] Classifier Bagging training...
[EC1] Classifier Bagging completed with accuracy 64.72371967654986 %
[EC1] Classifier ExtraTrees (Gini) training...
[EC1] Classifier ExtraTrees (Gini) completed with accuracy 67.55390835579514 %
[EC1] Classifier ExtraTrees (Entropy) training...
[EC1] Classifier ExtraTrees (Entropy) completed with accuracy 67.52021563342318 %
[EC1] Classifier GradientBoosting training...
[EC1] Classifier GradientBoosting completed with accuracy 69.00269541778977 %
[EC1] Classifier RandomForest training...
[EC1] Classifier RandomForest completed with accuracy 65.76819407008087 %
[EC1] Classifier HistGradientBoosting training...
[EC1] Classifier HistGradientBoosting completed with accuracy 69.5754716981132 %
[EC1] Classifier PassiveAggressive training...
[EC1] Classifier PassiveAggressive completed with accuracy 65.06064690026953 %
[EC1] Classifier Ridge training

In [14]:
# Reload one pickled model per target.
# NOTE(review): presumably these are the top-ranked models from each
# results.txt — confirm against the reports before submitting.
# Caution: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
with open('trainedmodelsEC1/HistGradientBoosting.pkl', 'rb') as f:
    model1 = pickle.load(f)

with open('trainedmodelsEC2/AdaBoost.pkl', 'rb') as f:
    model2 = pickle.load(f)

test = pd.read_csv('test.csv', index_col='id')

# Each model is given the same feature columns it was trained on.
test['EC1'] = model1.predict(test[featuresEC1])
test['EC2'] = model2.predict(test[featuresEC2])

# save to submission.csv
# The index is named 'id', so the header is "id,EC1,EC2". Writing the frame
# directly avoids the row-by-row string building and keeps the predicted
# labels in their own dtype (iterrows() does not preserve dtypes across a row).
test[['EC1', 'EC2']].to_csv("submission.csv")