In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict, LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
import matplotlib.pyplot as plt

import random
from datetime import datetime

In [2]:
# Solvent descriptor table from the 'solvent' sheet, indexed by solvent name.
solvent_data = pd.read_excel(
    '/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name = 'solvent',
).set_index('solvent_name')
# Descriptor column names, in sheet order (used later to build per-reaction feature dicts).
solvent_columns = solvent_data.columns.tolist()

In [3]:
# Nucleophile descriptor table from the 'nucleophiles' sheet, indexed by nucleophile name.
nucleophile_data = pd.read_excel(
    '/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name = 'nucleophiles',
).set_index('nucleophile_name')
# Descriptor column names, in sheet order.
nucleophile_columns = nucleophile_data.columns.tolist()

In [4]:
# Load the 'model_catalyst' sheet; its unnamed second column holds the catalyst names.
raw_catalyst_data = pd.read_excel(
    '/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name = 'model_catalyst',
).rename(columns = {'Unnamed: 1': 'catalyst_name'})

# Make one copy of the table per enantiomer: in the first 17 rows the 'R/S'
# placeholder inside the catalyst name (column position 1) is replaced by the
# concrete label.
resolved_copies = {}
for handedness in ('R', 'S'):
    resolved = raw_catalyst_data.copy()
    for row in range(17):
        resolved.iloc[row, 1] = resolved.iloc[row, 1].replace('R/S', handedness)
    resolved_copies[handedness] = resolved
r_catalyst_data = resolved_copies['R']
s_catalyst_data = resolved_copies['S']

# Stack the R copy on top of the S copy and index the result by catalyst name.
catalyst_data = pd.concat([r_catalyst_data, s_catalyst_data]).set_index('catalyst_name')
catalyst_columns = catalyst_data.columns.tolist()

In [5]:
# Load the 'full_catalyst' sheet. The header of the name column is the note
# 'last catalyst only S used'; rename it to 'catalyst_name'.
raw_catalyst_data = pd.read_excel('/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx', sheet_name = 'full_catalyst')
raw_catalyst_data.rename(columns = {'last catalyst only S used': 'catalyst_name'}, inplace = True)

# Taking care of catalyst R/S: the first 17 names carry an 'R/S' placeholder,
# so build one copy with every placeholder resolved to 'R' and one resolved to 'S'.
# Rows after the first 17 are left exactly as they are in the sheet.
# NOTE(review): the column header suggests the remaining row is the S-only
# catalyst -- confirm against the spreadsheet.
r_catalyst_data = raw_catalyst_data.copy()
s_catalyst_data = raw_catalyst_data.copy()
for i in range(17):
    placeholder_name = raw_catalyst_data.iloc[i, 1]
    r_catalyst_data.iloc[i, 1] = placeholder_name.replace('R/S', 'R')
    s_catalyst_data.iloc[i, 1] = placeholder_name.replace('R/S', 'S')

catalyst_data = pd.concat([r_catalyst_data, s_catalyst_data])
catalyst_data.set_index('catalyst_name', inplace = True)

# Any row that was not rewritten above is present in both copies under the same
# name. Keep only its first occurrence so that catalyst_data.loc[name, column]
# yields a single value rather than a two-element Series.
catalyst_data = catalyst_data[~catalyst_data.index.duplicated(keep = 'first')]

catalyst_columns = list(catalyst_data.columns)

In [6]:
# Load the 'iminiums' sheet, normalise two headers ('imine' -> 'iminium_name',
# and the energy column loses its trailing space), index by iminium name and
# discard the unnamed helper column.
iminium_data = (
    pd.read_excel(
        '/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
        sheet_name = 'iminiums',
    )
    .rename(columns = {'imine': 'iminium_name', 'electronic energy difference (kcal/mol) ': 'electronic energy difference (kcal/mol)'})
    .set_index('iminium_name')
    .drop(labels = ['Unnamed: 1'], axis = 1)
)

# For iminiums 1..180, copy the (E) isomer's energy difference onto the matching (Z) row.
energy_column = 'electronic energy difference (kcal/mol)'
for number in range(1, 181):
    iminium_data.loc[f'(Z)-Iminium {number}', energy_column] = iminium_data.loc[f'(E)-Iminium {number}', energy_column]

iminium_columns = iminium_data.columns.tolist()

In [7]:
class Reaction():
    """One reaction entry together with the descriptor values of its components.

    The constructor stores every argument as an attribute of the same name and
    then looks up, column by column, the descriptor rows for the solvent,
    catalyst, nucleophile and both iminium isomers in the notebook-level tables
    (`solvent_data`, `catalyst_data`, `nucleophile_data`, `iminium_data`) using
    the matching `*_columns` lists.
    """

    def __init__(self, name, entry, catalyst, nucleophile, substrate, solvent, iminium_type, iminium, majorenantiomer, minorenantiomer, ee, G):
        # Plain bookkeeping: keep the raw spreadsheet values.
        self.name, self.entry = name, entry
        self.catalyst, self.nucleophile = catalyst, nucleophile
        self.substrate, self.solvent = substrate, solvent
        self.iminium_type, self.iminium = iminium_type, iminium
        self.majorenantiomer, self.minorenantiomer = majorenantiomer, minorenantiomer
        self.ee, self.G = ee, G

        # Descriptor dictionaries: {column name -> value for this component}.
        self.solvent_properties = {
            column: solvent_data.loc[solvent, column] for column in solvent_columns
        }
        self.catalyst_properties = {
            column: catalyst_data.loc[catalyst, column] for column in catalyst_columns
        }
        self.nucleophile_properties = {
            column: nucleophile_data.loc[nucleophile, column] for column in nucleophile_columns
        }

        # Row labels of the two isomers in iminium_data.
        self.e_iminium = '(E)-' + str(iminium)
        self.z_iminium = '(Z)-' + str(iminium)

        # Both isomers are looked up; callers choose one via iminium_type.
        self.e_iminium_properties = {}
        self.z_iminium_properties = {}
        for column in iminium_columns:
            self.e_iminium_properties[column] = iminium_data.loc[self.e_iminium, column]
            self.z_iminium_properties[column] = iminium_data.loc[self.z_iminium, column]

    def __repr__(self):
        """Short display form: 'Reaction - <name>'."""
        return "Reaction - {}".format(self.name)

In [8]:
# Registry of every parsed reaction, keyed by its generated name.
reactions = {}

def process_data(reaction_number, reaction, iminium_type, sheetname = None):
    """Read one reaction spreadsheet and register a Reaction for each entry.

    The workbook is '<reaction_number> <reaction>.xlsx' inside the
    'reaction info 5-23' folder. When `sheetname` is given that sheet is read,
    otherwise pandas' default sheet. Rows are addressed through the 'entry'
    column using the labels 1..len(sheet), and each one is stored in the
    module-level `reactions` dict under '<reaction> [<sheetname>] <entry>'.

    Parameters
    ----------
    reaction_number : number prefix of the workbook file name.
    reaction : reaction title; part of the file name and of every key.
    iminium_type : passed through unchanged to every Reaction.
    sheetname : optional sheet to read; also becomes part of every key.
    """
    workbook_path = f'/Users/jlisd/Downloads/AI Chem/AI Chem/reaction info 5-23/{reaction_number} {reaction}.xlsx'
    read_options = {} if sheetname is None else {'sheet_name': sheetname}
    data = pd.read_excel(workbook_path, **read_options).set_index('entry')

    # Everything in the key except the trailing entry number.
    key_prefix = reaction if sheetname is None else reaction + ' ' + sheetname

    # Spreadsheet column -> Reaction constructor argument, in lookup order.
    column_to_argument = {
        'Catalyst': 'catalyst',
        'Nucleophile': 'nucleophile',
        'Substrate': 'substrate',
        'Solvent': 'solvent',
        'Iminium': 'iminium',
        'Major Enantiomer': 'majorenantiomer',
        'Minor Enantiomer': 'minorenantiomer',
        'ee': 'ee',
        'ΔΔG‡': 'G',
    }

    for entry in range(1, len(data) + 1):
        reaction_name = key_prefix + ' ' + str(entry)
        looked_up = {
            argument: data.loc[entry, column]
            for column, argument in column_to_argument.items()
        }
        reactions[reaction_name] = Reaction(
            name = reaction_name,
            entry = entry,
            iminium_type = iminium_type,
            **looked_up,
        )


In [12]:
# Every workbook to ingest: (file number, reaction title, iminium geometry, sheets).
# Sheets are processed in the listed order, workbook by workbook.
reaction_workbooks = [
    (1, 'Addition of Alcohols', 'E',
     ['Scope']),
    (2, 'Addition of thiols', 'E',
     ['Catalyst & solvent screening da', 'Effect of catalyst loading', 'Imine scope', 'Thiol scope']),
    (3, 'Hydrophosphonylation of imines', 'E',
     ['Catalyst screening data', 'Scope']),
    (4, 'Addition of diazomethylphosphonates', 'E',
     ["Optimization of catalyst and re", "Imine scope"]),
    (5, 'Addition of diazoacetamides', 'E',
     ['Catalyst screening data', 'Solvent screening data', 'Substrate(s) scope']),
    (6, 'Strecker Reaction (with aldimines)', 'E',
     ['Catalyst screening data', 'Solvent screening data', 'Imine scope']),
    (7, 'Peroxidation of imines', 'E',
     ['Catalyst screening data', 'Solvent screening data', 'Substrate(s) scope']),
    (8, 'Transfer Hydrogenation of b,g-Alkynyl a-Imino Esters', 'E',
     ['Catalyst screening and reaction', 'Scope']),
    (9, 'Transfer Hydrogenation of Enamides', 'E',
     ['Scope']),
    (10, 'Transfer Hydrogenation of N-aryl imines (List)', 'Z',
     ['Catalyst screening data', 'Imine scope']),
    (11, 'Transfer Hydrogenation of N-aryl imines (Rueping)', 'Z',
     ['Catalyst screening data', 'Solvent screening data', 'Imine scope']),
    (12, 'Reductive amination of N-aryl imines (Macmillan)', 'Z',
     ['Reaction optimization']),
    (13, 'Transfer Hydrogenation of trifluoromethyl ketimines', 'Z',
     ['Benzothiazoline screening data', 'Imine scope']),
    (14, 'Transfer Hydrogenation of N-aryl imines by benzothiazoline', 'Z',
     ['Catalyst screening data', 'Benzothiazoline screening data', 'Imine scope']),
    (15, 'Reductive amination of aliphatic ketones by benzothiazoline', 'Z',
     ['Catalyst screening data', 'Imine scope']),
    (16, 'Transfer Hydrogenation of ethyl ketimines', 'Z',
     ['Benzothiazoline screening data', 'Scope 1 benzothiazoline', 'Scope 2 dihydropyridine']),
    (17, 'Strecker Reaction (with ketimines)', 'Z',
     ['Catalyst screening data', 'Solvent data', 'Imine scope']),
]

for rxn_number, rxn_title, geometry, sheet_names in reaction_workbooks:
    for sheet_name in sheet_names:
        process_data(rxn_number, rxn_title, geometry, sheet_name)

# Not ingested (left disabled):
#process_data(18, 'Addition of enecarbamates to benzoyl imines')
#process_data(19, 'Hydrogenation of fluorinated alkynyl ketimines')
#process_data(20, 'Addition of thiols to imines (Denmark)')

In [27]:
# Target table: one row per reaction (index 'Reaction') with the catalyst that
# was used. Built in a single construction instead of enlarging an empty frame
# one `.loc` row at a time; dtype is pinned to object so the labels stay
# plain Python strings.
reaction_names = list(reactions)
catalyst = pd.DataFrame(
    {'Catalyst': [reactions[reaction_name].catalyst for reaction_name in reaction_names]},
    index = pd.Index(reaction_names, dtype = object, name = 'Reaction'),
    dtype = object,
)
        
#first column is numerical

In [29]:
# Prediction target: the per-reaction catalyst table (same object as `catalyst`, not a copy).
Y = catalyst

In [14]:
# Nucleophile descriptors per reaction: index 'Reaction', one column per entry
# of nucleophile_columns. The frame is assembled once from the per-reaction
# property dicts rather than written cell by cell into an empty frame, so
# pandas infers a dtype for each column from its values.
reaction_names = list(reactions)
nucleophile = pd.DataFrame(
    [reactions[reaction_name].nucleophile_properties for reaction_name in reaction_names],
    index = pd.Index(reaction_names, name = 'Reaction'),
    columns = nucleophile_columns,
)
        
#first column is numerical

In [15]:
# Solvent descriptors per reaction: index 'Reaction', one column per entry of
# solvent_columns. The frame is assembled once from the per-reaction property
# dicts rather than written cell by cell into an empty frame, so pandas infers
# a dtype for each column from its values.
reaction_names = list(reactions)
solvent = pd.DataFrame(
    [reactions[reaction_name].solvent_properties for reaction_name in reaction_names],
    index = pd.Index(reaction_names, name = 'Reaction'),
    columns = solvent_columns,
)
        
#first column is numerical

In [16]:
# Iminium descriptors per reaction: index 'Reaction', an 'iminium_type' tag
# column ('E' or 'Z') followed by one column per entry of iminium_columns.
# Each reaction contributes the property dict of the isomer named by its
# iminium_type. The frame is assembled once from those records rather than
# written cell by cell into an empty frame.
properties_attribute = {'E': 'e_iminium_properties', 'Z': 'z_iminium_properties'}

iminium_records = {}
for reaction_name, reaction_entry in reactions.items():
    attribute = properties_attribute.get(reaction_entry.iminium_type)
    if attribute is None:
        # A reaction tagged with anything other than 'E'/'Z' gets no row.
        continue
    iminium_records[reaction_name] = {
        'iminium_type': reaction_entry.iminium_type,
        **getattr(reaction_entry, attribute),
    }

iminium = pd.DataFrame(
    list(iminium_records.values()),
    index = pd.Index(list(iminium_records), name = 'Reaction'),
    columns = ['iminium_type'] + iminium_columns,
)
            

In [119]:
# Sanity check: (rows, columns) of the three per-reaction tables and of the raw catalyst sheet.
frame_shapes = [frame.shape for frame in (iminium, solvent, nucleophile, raw_catalyst_data)]
print(*frame_shapes)

(381, 22) (381, 160) (381, 15) (18, 86)


In [56]:
# Feature matrix "INS": nucleophile, solvent and iminium descriptors side by side.
# The 'nucleophile', 'solvent' and 'iminium_type' columns are dropped before joining.
ins_blocks = [
    nucleophile.drop(columns = ['nucleophile']),
    solvent.drop(columns = ['solvent']),
    iminium.drop(columns = ['iminium_type']),
]
X_INS = pd.concat(ins_blocks, axis = 1)
X_INS.head()

Unnamed: 0_level_0,H (nucleophile),X,Nu,nXH,iXH,HOMO (nucleophile),LUMO (nucleophile),Polarizability (nucleophile),L,B1,...,SL,SB1,SB5,LL,LB1,LB5,HOMO (iminium),LUMO (iminium),Polarizability (iminium),electronic energy difference (kcal/mol)
Reaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Addition of Alcohols Scope 1,0.458,-0.707,-0.707,3899.4,40.1897,-0.35328,0.0626,18.27,3.99,1.4,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 2,0.464,-0.717,-0.717,3893.54,39.6036,-0.34808,0.06264,30.23,5.03,1.2,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 3,0.461,-0.717,-0.717,3877.1,28.7764,-0.34588,0.05777,42.01,5.03,1.13,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 4,0.459,-0.722,-0.722,3859.15,22.2253,-0.34415,0.05141,53.61,5.03,1.44,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 5,0.471,-0.704,-0.704,3851.9,43.4805,-0.3582,0.05955,40.75,3.98,1.4,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67


In [57]:
# Feature matrix "IN": iminium descriptors followed by nucleophile descriptors.
# The 'iminium_type' and 'nucleophile' columns are dropped before joining.
in_blocks = [
    iminium.drop(columns = ['iminium_type']),
    nucleophile.drop(columns = ['nucleophile']),
]
X_IN = pd.concat(in_blocks, axis = 1)
X_IN.head()

Unnamed: 0_level_0,nNH,i NH,N,H (iminium),C,SubL,SubS,PG,PGL,PGB1,...,iXH,HOMO (nucleophile),LUMO (nucleophile),Polarizability (nucleophile),L,B1,B5,bond distance H-X,H-X-Nu,H-X-CNu
Reaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Addition of Alcohols Scope 1,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,1.7,...,40.1897,-0.35328,0.0626,18.27,3.99,1.4,3.17,0.96,180,180
Addition of Alcohols Scope 2,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,1.7,...,39.6036,-0.34808,0.06264,30.23,5.03,1.2,3.41,0.96,180,180
Addition of Alcohols Scope 3,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,1.7,...,28.7764,-0.34588,0.05777,42.01,5.03,1.13,4.45,0.96,180,180
Addition of Alcohols Scope 4,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,1.7,...,22.2253,-0.34415,0.05141,53.61,5.03,1.44,4.44,0.96,180,180
Addition of Alcohols Scope 5,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,1.7,...,43.4805,-0.3582,0.05955,40.75,3.98,1.4,4.44,0.96,180,180


In [78]:
# Feature matrix "I": iminium descriptors only (the 'iminium_type' tag column is removed).
X_I = iminium.drop(columns = ['iminium_type'])

############################################################################################################################

End of Data Processing

############################################################################################################################

In [109]:
# Fraction of the reactions held out as the test set in every split below.
test_size = 0.5

# Re-seed Python's `random` module from the wall clock. The seed is a float
# timestamp because newer Python versions reject a datetime object here.
random.seed(datetime.now().timestamp())

# Seed handed to train_test_split. random.seed() returns None, so it cannot
# supply this value; None is spelled out instead. With None every
# train_test_split call draws a new shuffle, so each model cell below and each
# run_* iteration sees a different split.
# NOTE(review): set an integer here if a reproducible split is wanted.
random_state = None

X = X_IN
#X_INS, X_IN, or X_I
Y = catalyst

In [110]:
# k-nearest-neighbours classifier (scikit-learn defaults) predicting the catalyst label.
knn = KNeighborsClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Turn the 'Reaction' index into a column and discard it, leaving only features / labels.
train_features = X_train.reset_index().drop('Reaction', axis = 1)
test_features = X_test.reset_index().drop(['Reaction'], axis = 1)
train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1)

knn.fit(np.array(train_features), train_labels)
Y_pred = knn.predict(test_features)
Y_train_pred = knn.predict(train_features)

# Actual vs. predicted catalyst per reaction, for the test half and the training half.
results = pd.concat([Y_test.reset_index(), pd.DataFrame(Y_pred)], axis = 1).set_index('Reaction')
results.columns = ['Actual', 'Predicted']
train_results = pd.concat([Y_train.reset_index(), pd.DataFrame(Y_train_pred)], axis = 1).set_index('Reaction')
train_results.columns = ['Actual', 'Predicted']

# Number of rows where the prediction matches the recorded catalyst.
count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 140/191
test accuracy: 0.7329842931937173
# of correct train predictions: 157/190
train accuracy: 0.8263157894736842


In [111]:
results

Unnamed: 0_level_0,Actual,Predicted
Reaction,Unnamed: 1_level_1,Unnamed: 2_level_1
Transfer Hydrogenation of N-aryl imines by benzothiazoline Imine scope 3,"(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi...","(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi..."
Transfer Hydrogenation of N-aryl imines by benzothiazoline Catalyst screening data 2,"(R)-3,3′-Bis(4-nitrophenyl)-1,1′-binaphthyl-2,...","(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi..."
Strecker Reaction (with aldimines) Catalyst screening data 2,"(R)-3,3'-Bis(4-diphenyl)-1,1'-binaphthyl 2,2'-...","(R)-3,3'-Bis(9-phenanthryl)-1,1'-binaphthalene..."
Addition of thiols Effect of catalyst loading 1,"(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi...","(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi..."
Transfer Hydrogenation of ethyl ketimines Scope 1 benzothiazoline 6,"(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi...","(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi..."
...,...,...
Transfer Hydrogenation of ethyl ketimines Benzothiazoline screening data 2,"(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi...","(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi..."
Transfer Hydrogenation of N-aryl imines (Rueping) Imine scope 1,"(R)-3,3′-Bis[3,5-bis(trifluoromethyl)phenyl]-1...","(R)-3,3'-Bis(1-naphthalenyl)-1,1'-binaphthyl-2..."
Transfer Hydrogenation of N-aryl imines (Rueping) Catalyst screening data 2,"(R)-3,3-Bis(9-anthracenyl)-1,1-binaphthyl-2,2-...","(R)-3,3′-Bis[3,5-bis(trifluoromethyl)phenyl]-1..."
Addition of thiols Imine scope 4,"(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi...","(R)-3,3′-Bis(2,4,6-triisopropylphenyl)-1,1′-bi..."


In [112]:
# Decision-tree classifier (scikit-learn defaults) predicting the catalyst label.
decision_tree = DecisionTreeClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Turn the 'Reaction' index into a column and discard it, leaving only features / labels.
train_features = X_train.reset_index().drop('Reaction', axis = 1)
test_features = X_test.reset_index().drop(['Reaction'], axis = 1)
train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1)

decision_tree.fit(np.array(train_features), train_labels)
Y_pred = decision_tree.predict(test_features)
Y_train_pred = decision_tree.predict(train_features)

# Actual vs. predicted catalyst per reaction, for the test half and the training half.
results = pd.concat([Y_test.reset_index(), pd.DataFrame(Y_pred)], axis = 1).set_index('Reaction')
results.columns = ['Actual', 'Predicted']
train_results = pd.concat([Y_train.reset_index(), pd.DataFrame(Y_train_pred)], axis = 1).set_index('Reaction')
train_results.columns = ['Actual', 'Predicted']

# Number of rows where the prediction matches the recorded catalyst.
count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 142/191
test accuracy: 0.743455497382199
# of correct train predictions: 161/190
train accuracy: 0.8473684210526315


In [113]:
# Random-forest classifier (scikit-learn defaults) predicting the catalyst label.
random_forest = RandomForestClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Turn the 'Reaction' index into a column and discard it, leaving only features / labels.
train_features = X_train.reset_index().drop('Reaction', axis = 1)
test_features = X_test.reset_index().drop(['Reaction'], axis = 1)
train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1)

random_forest.fit(np.array(train_features), train_labels)
Y_pred = random_forest.predict(test_features)
Y_train_pred = random_forest.predict(train_features)

# Actual vs. predicted catalyst per reaction, for the test half and the training half.
results = pd.concat([Y_test.reset_index(), pd.DataFrame(Y_pred)], axis = 1).set_index('Reaction')
results.columns = ['Actual', 'Predicted']
train_results = pd.concat([Y_train.reset_index(), pd.DataFrame(Y_train_pred)], axis = 1).set_index('Reaction')
train_results.columns = ['Actual', 'Predicted']

# Number of rows where the prediction matches the recorded catalyst.
count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 146/191
test accuracy: 0.7643979057591623
# of correct train predictions: 159/190
train accuracy: 0.8368421052631579


In [114]:
# Logistic-regression classifier (scikit-learn defaults) predicting the catalyst label.
# NOTE(review): the recorded output of this cell ends with the solver's
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warning, which itself suggests
# raising max_iter or scaling the features.
log_reg = linear_model.LogisticRegression()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Turn the 'Reaction' index into a column and discard it, leaving only features / labels.
train_features = X_train.reset_index().drop('Reaction', axis = 1)
test_features = X_test.reset_index().drop(['Reaction'], axis = 1)
train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1)

log_reg.fit(np.array(train_features), train_labels)
Y_pred = log_reg.predict(test_features)
Y_train_pred = log_reg.predict(train_features)

# Actual vs. predicted catalyst per reaction, for the test half and the training half.
results = pd.concat([Y_test.reset_index(), pd.DataFrame(Y_pred)], axis = 1).set_index('Reaction')
results.columns = ['Actual', 'Predicted']
train_results = pd.concat([Y_train.reset_index(), pd.DataFrame(Y_train_pred)], axis = 1).set_index('Reaction')
train_results.columns = ['Actual', 'Predicted']

# Number of rows where the prediction matches the recorded catalyst.
count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 113/191
test accuracy: 0.5916230366492147
# of correct train predictions: 121/190
train accuracy: 0.6368421052631579


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [115]:
# Linear discriminant analysis (scikit-learn defaults) predicting the catalyst label.
lda = LinearDiscriminantAnalysis()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Turn the 'Reaction' index into a column and discard it, leaving only features / labels.
train_features = X_train.reset_index().drop('Reaction', axis = 1)
test_features = X_test.reset_index().drop(['Reaction'], axis = 1)
train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1)

lda.fit(np.array(train_features), train_labels)
Y_pred = lda.predict(test_features)
Y_train_pred = lda.predict(train_features)

# Actual vs. predicted catalyst per reaction, for the test half and the training half.
results = pd.concat([Y_test.reset_index(), pd.DataFrame(Y_pred)], axis = 1).set_index('Reaction')
results.columns = ['Actual', 'Predicted']
train_results = pd.concat([Y_train.reset_index(), pd.DataFrame(Y_train_pred)], axis = 1).set_index('Reaction')
train_results.columns = ['Actual', 'Predicted']

# Number of rows where the prediction matches the recorded catalyst.
count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 125/191
test accuracy: 0.6544502617801047
# of correct train predictions: 150/190
train accuracy: 0.7894736842105263


In [116]:
# Support-vector classifier (scikit-learn defaults) predicting the catalyst label.
clf = svm.SVC()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Turn the 'Reaction' index into a column and discard it, leaving only features / labels.
train_features = X_train.reset_index().drop('Reaction', axis = 1)
test_features = X_test.reset_index().drop(['Reaction'], axis = 1)
train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1)

clf.fit(np.array(train_features), train_labels)
Y_pred = clf.predict(test_features)
Y_train_pred = clf.predict(train_features)

# Actual vs. predicted catalyst per reaction, for the test half and the training half.
results = pd.concat([Y_test.reset_index(), pd.DataFrame(Y_pred)], axis = 1).set_index('Reaction')
results.columns = ['Actual', 'Predicted']
train_results = pd.concat([Y_train.reset_index(), pd.DataFrame(Y_train_pred)], axis = 1).set_index('Reaction')
train_results.columns = ['Actual', 'Predicted']

# Number of rows where the prediction matches the recorded catalyst.
count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 82/191
test accuracy: 0.4293193717277487
# of correct train predictions: 59/190
train accuracy: 0.3105263157894737


###############################################################################################################################

Run Model 100 times

###############################################################################################################################

In [86]:
def run_knn(iterations):
    """Repeat the k-nearest-neighbours train/test experiment and tabulate accuracies.

    Reads the notebook-level `X`, `Y`, `test_size` and `random_state`.

    Parameters
    ----------
    iterations : int
        Number of train/test splits to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc', 'total acc'. One row per
        split (numbered from 1), followed by a final row whose 'iteration' is
        the string 'average' and whose accuracies are the column means over
        the splits (NaN when `iterations` is 0). 'iteration' stays an ordinary
        column; it is not moved into the index.
    """
    columns = ['iteration', 'test acc', 'train acc', 'total acc']
    records = []

    for i in range(iterations):
        # None keeps scikit-learn's "new shuffle every call" behaviour. An
        # integer seed is offset by the iteration number, otherwise every
        # iteration would evaluate the very same split.
        split_seed = random_state + i if isinstance(random_state, int) else random_state

        knn = KNeighborsClassifier()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = split_seed)

        # Strip the 'Reaction' index so only feature columns / flat label vectors remain.
        train_features = X_train.reset_index().drop('Reaction', axis = 1)
        test_features = X_test.reset_index().drop('Reaction', axis = 1)
        train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1), dtype = object).reshape(-1)
        test_labels = np.array(Y_test.reset_index().drop('Reaction', axis = 1), dtype = object).reshape(-1)

        knn.fit(np.array(train_features), train_labels)

        # Count exact label matches on each half.
        count = int(np.sum(np.asarray(knn.predict(test_features), dtype = object) == test_labels))
        train_count = int(np.sum(np.asarray(knn.predict(train_features), dtype = object) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    # Column means over the per-split rows, appended as the closing 'average' row.
    # The table is built from the collected records in one go; DataFrame.append
    # no longer exists in current pandas.
    per_split = pd.DataFrame(records, columns = columns)
    averages = per_split[columns[1:]].astype(float).mean(axis = 0)
    records.append({'iteration': 'average',
                    'test acc': averages['test acc'],
                    'train acc': averages['train acc'],
                    'total acc': averages['total acc']})

    return pd.DataFrame(records, columns = columns)

In [87]:
def run_decision_tree(iterations):
    """Repeat the decision-tree train/test experiment and tabulate accuracies.

    Reads the notebook-level `X`, `Y`, `test_size` and `random_state`.

    Parameters
    ----------
    iterations : int
        Number of train/test splits to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc', 'total acc'. One row per
        split (numbered from 1), followed by a final row whose 'iteration' is
        the string 'average' and whose accuracies are the column means over
        the splits (NaN when `iterations` is 0). 'iteration' stays an ordinary
        column; it is not moved into the index.
    """
    columns = ['iteration', 'test acc', 'train acc', 'total acc']
    records = []

    for i in range(iterations):
        # None keeps scikit-learn's "new shuffle every call" behaviour. An
        # integer seed is offset by the iteration number, otherwise every
        # iteration would evaluate the very same split.
        split_seed = random_state + i if isinstance(random_state, int) else random_state

        decision_tree = DecisionTreeClassifier()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = split_seed)

        # Strip the 'Reaction' index so only feature columns / flat label vectors remain.
        train_features = X_train.reset_index().drop('Reaction', axis = 1)
        test_features = X_test.reset_index().drop('Reaction', axis = 1)
        train_labels = np.array(Y_train.reset_index().drop('Reaction', axis = 1), dtype = object).reshape(-1)
        test_labels = np.array(Y_test.reset_index().drop('Reaction', axis = 1), dtype = object).reshape(-1)

        decision_tree.fit(np.array(train_features), train_labels)

        # Count exact label matches on each half.
        count = int(np.sum(np.asarray(decision_tree.predict(test_features), dtype = object) == test_labels))
        train_count = int(np.sum(np.asarray(decision_tree.predict(train_features), dtype = object) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    # Column means over the per-split rows, appended as the closing 'average' row.
    # The table is built from the collected records in one go; DataFrame.append
    # no longer exists in current pandas.
    per_split = pd.DataFrame(records, columns = columns)
    averages = per_split[columns[1:]].astype(float).mean(axis = 0)
    records.append({'iteration': 'average',
                    'test acc': averages['test acc'],
                    'train acc': averages['train acc'],
                    'total acc': averages['total acc']})

    return pd.DataFrame(records, columns = columns)

In [88]:
def run_random_forest(iterations):
    """Score a RandomForestClassifier over repeated train/test splits.

    Every round draws a split of the notebook-level ``X`` / ``Y`` with
    ``train_test_split`` (using the globals ``test_size`` and
    ``random_state``), fits a fresh forest on the training part and records
    the fraction of correctly predicted labels.

    Parameters
    ----------
    iterations : int
        Number of split / fit / score rounds.

    Returns
    -------
    scores : pandas.DataFrame
        Columns ``iteration``, ``test acc``, ``train acc`` and ``total acc``
        (accuracy over train and test samples together); one row per round
        followed by a row labelled ``'average'`` holding the column means
        (NaN when ``iterations`` is 0).
    total_features : pandas.DataFrame
        Indexed by ``Feature`` with columns ``type`` and ``Importance``.
        ``Importance`` is the sum of ``feature_importances_`` over all rounds
        and the frame is sorted by it in descending order. ``type`` is NaN
        for a feature that appears in none of the descriptor column lists.
    """
    # NOTE(review): X, Y, test_size and random_state are defined outside this
    # cell. If random_state is a fixed integer every round reuses the same
    # split -- confirm it is None / a RandomState when variation is wanted.
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']

    def without_reaction(frame):
        # 'Reaction' only labels the rows; it must not reach the estimator.
        return frame.reset_index().drop('Reaction', axis = 1)

    records = []
    feature_names = []
    importance_sum = np.zeros(0)
    for i in range(iterations):
        #develop model & scores
        random_forest = RandomForestClassifier()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Fit and predict on plain arrays so both calls see the same input
        # type (mixing arrays and DataFrames makes scikit-learn warn about
        # missing feature names).
        train_frame = without_reaction(X_train)
        train_inputs = train_frame.to_numpy()
        test_inputs = without_reaction(X_test).to_numpy()
        train_labels = without_reaction(Y_train).to_numpy().reshape(-1)
        test_labels = without_reaction(Y_test).to_numpy().reshape(-1)

        random_forest.fit(train_inputs, train_labels)

        # Number of exact label matches on each part of the split
        count = int(np.sum(random_forest.predict(test_inputs) == test_labels))
        train_count = int(np.sum(random_forest.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

        #evaluating most important features
        if i == 0:
            # Names of the columns that were actually fed to the model
            feature_names = list(train_frame.columns)
            importance_sum = np.zeros(len(feature_names))
        importance_sum += random_forest.feature_importances_

    #add average for each row
    # Build the frame once instead of using DataFrame.append (removed in pandas 2.0).
    mean_acc = pd.DataFrame(records, columns = score_columns)[score_columns[1:]].astype(float).mean()
    records.append({'iteration': 'average', **mean_acc.to_dict()})
    scores = pd.DataFrame(records, columns = score_columns)

    #which stage each important feature is in
    # Checked in this order; the first list containing the feature wins.
    descriptor_groups = (('solvent', solvent_columns),
                         ('nucleophile', nucleophile_columns),
                         ('catalyst', catalyst_columns),
                         ('iminium', iminium_columns))

    def feature_type(name):
        for label, columns in descriptor_groups:
            if name in columns:
                return label
        return np.nan

    total_features = pd.DataFrame({'type': [feature_type(name) for name in feature_names],
                                   'Importance': importance_sum},
                                  index = pd.Index(feature_names, name = 'Feature'))
    total_features = total_features.sort_values(by = ['Importance'], ascending = False)

    return scores, total_features

In [89]:
def run_log_reg(iterations, max_iter = 1000):
    """Score a LogisticRegression classifier over repeated train/test splits.

    Every round draws a split of the notebook-level ``X`` / ``Y`` with
    ``train_test_split`` (using the globals ``test_size`` and
    ``random_state``), fits a fresh model on the training part and records
    the fraction of correctly predicted labels.

    Parameters
    ----------
    iterations : int
        Number of split / fit / score rounds.
    max_iter : int, optional
        Iteration cap handed to ``LogisticRegression``. The library default
        was hit on every fit in the recorded run ("TOTAL NO. of ITERATIONS
        REACHED LIMIT"), so a larger cap is used here.

    Returns
    -------
    pandas.DataFrame
        Columns ``iteration``, ``test acc``, ``train acc`` and ``total acc``
        (accuracy over train and test samples together); one row per round
        followed by a row labelled ``'average'`` holding the column means
        (NaN when ``iterations`` is 0).
    """
    # NOTE(review): X, Y, test_size and random_state are defined outside this
    # cell. If random_state is a fixed integer every round reuses the same
    # split -- confirm it is None / a RandomState when variation is wanted.
    # NOTE(review): if the solver still fails to converge, scaling the
    # features (as the scikit-learn warning suggests) is the next step.
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']

    def without_reaction(frame):
        # 'Reaction' only labels the rows; it must not reach the estimator.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    for i in range(iterations):
        #develop model & scores
        log_reg = linear_model.LogisticRegression(max_iter = max_iter)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Fit and predict on plain arrays so both calls see the same input
        # type (mixing arrays and DataFrames makes scikit-learn warn about
        # missing feature names).
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        log_reg.fit(train_inputs, train_labels)

        # Number of exact label matches on each part of the split
        count = int(np.sum(log_reg.predict(test_inputs) == test_labels))
        train_count = int(np.sum(log_reg.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    #add average for each row
    # Build the frame once instead of using DataFrame.append (removed in pandas 2.0).
    mean_acc = pd.DataFrame(records, columns = score_columns)[score_columns[1:]].astype(float).mean()
    records.append({'iteration': 'average', **mean_acc.to_dict()})

    return pd.DataFrame(records, columns = score_columns)

In [90]:
def run_lda(iterations):
    """Score LinearDiscriminantAnalysis over repeated train/test splits.

    Every round draws a split of the notebook-level ``X`` / ``Y`` with
    ``train_test_split`` (using the globals ``test_size`` and
    ``random_state``), fits a fresh model on the training part and records
    the fraction of correctly predicted labels.

    Parameters
    ----------
    iterations : int
        Number of split / fit / score rounds.

    Returns
    -------
    pandas.DataFrame
        Columns ``iteration``, ``test acc``, ``train acc`` and ``total acc``
        (accuracy over train and test samples together); one row per round
        followed by a row labelled ``'average'`` holding the column means
        (NaN when ``iterations`` is 0).
    """
    # NOTE(review): X, Y, test_size and random_state are defined outside this
    # cell. If random_state is a fixed integer every round reuses the same
    # split -- confirm it is None / a RandomState when variation is wanted.
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']

    def without_reaction(frame):
        # 'Reaction' only labels the rows; it must not reach the estimator.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    for i in range(iterations):
        #develop model & scores
        lda = LinearDiscriminantAnalysis()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Fit and predict on plain arrays so both calls see the same input
        # type (mixing arrays and DataFrames makes scikit-learn warn about
        # missing feature names).
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        lda.fit(train_inputs, train_labels)

        # Number of exact label matches on each part of the split
        count = int(np.sum(lda.predict(test_inputs) == test_labels))
        train_count = int(np.sum(lda.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    #add average for each row
    # Build the frame once instead of using DataFrame.append (removed in pandas 2.0).
    mean_acc = pd.DataFrame(records, columns = score_columns)[score_columns[1:]].astype(float).mean()
    records.append({'iteration': 'average', **mean_acc.to_dict()})

    return pd.DataFrame(records, columns = score_columns)

In [91]:
def run_clf(iterations):
    """Score a support-vector classifier (``svm.SVC``) over repeated splits.

    Every round draws a split of the notebook-level ``X`` / ``Y`` with
    ``train_test_split`` (using the globals ``test_size`` and
    ``random_state``), fits a fresh model on the training part and records
    the fraction of correctly predicted labels.

    Parameters
    ----------
    iterations : int
        Number of split / fit / score rounds.

    Returns
    -------
    pandas.DataFrame
        Columns ``iteration``, ``test acc``, ``train acc`` and ``total acc``
        (accuracy over train and test samples together); one row per round
        followed by a row labelled ``'average'`` holding the column means
        (NaN when ``iterations`` is 0).
    """
    # NOTE(review): X, Y, test_size and random_state are defined outside this
    # cell. If random_state is a fixed integer every round reuses the same
    # split -- confirm it is None / a RandomState when variation is wanted.
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']

    def without_reaction(frame):
        # 'Reaction' only labels the rows; it must not reach the estimator.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    for i in range(iterations):
        #develop model & scores
        clf = svm.SVC()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Fit and predict on plain arrays so both calls see the same input
        # type (mixing arrays and DataFrames makes scikit-learn warn about
        # missing feature names).
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        clf.fit(train_inputs, train_labels)

        #evaluating performance
        # Number of exact label matches on each part of the split
        count = int(np.sum(clf.predict(test_inputs) == test_labels))
        train_count = int(np.sum(clf.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    #add average for each row
    # Build the frame once instead of using DataFrame.append (removed in pandas 2.0).
    mean_acc = pd.DataFrame(records, columns = score_columns)[score_columns[1:]].astype(float).mean()
    records.append({'iteration': 'average', **mean_acc.to_dict()})

    return pd.DataFrame(records, columns = score_columns)

In [92]:
run_knn(100)

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.539267,0.605263,0.572178
1,2,0.513089,0.578947,0.545932
2,3,0.586387,0.547368,0.566929
3,4,0.560209,0.626316,0.593176
4,5,0.539267,0.621053,0.580052
...,...,...,...,...
96,97,0.518325,0.626316,0.572178
97,98,0.492147,0.610526,0.551181
98,99,0.518325,0.636842,0.577428
99,100,0.544503,0.621053,0.582677


In [93]:
run_decision_tree(100)

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.513089,0.757895,0.635171
1,2,0.560209,0.705263,0.632546
2,3,0.570681,0.747368,0.658793
3,4,0.513089,0.742105,0.627297
4,5,0.565445,0.757895,0.661417
...,...,...,...,...
96,97,0.534031,0.747368,0.640420
97,98,0.560209,0.726316,0.643045
98,99,0.565445,0.742105,0.653543
99,100,0.581152,0.752632,0.666667


In [94]:
# Evaluate the random forest over 100 train/test splits. `score` holds the
# per-iteration accuracies plus an 'average' row; `feature` holds the feature
# importances summed over all runs, tagged with their descriptor type.
score, feature = run_random_forest(100)
score

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.602094,0.742105,0.671916
1,2,0.607330,0.736842,0.671916
2,3,0.602094,0.757895,0.679790
3,4,0.596859,0.757895,0.677165
4,5,0.591623,0.757895,0.674541
...,...,...,...,...
96,97,0.570681,0.715789,0.643045
97,98,0.628272,0.742105,0.685039
98,99,0.612565,0.721053,0.666667
99,100,0.602094,0.726316,0.664042


In [95]:
# Export the random-forest feature ranking to features.xlsx (sheet
# 'rfclassifier'); the path is relative, so the file lands in the kernel's
# current working directory.
feature.to_excel('features.xlsx', sheet_name = 'rfclassifier')


In [96]:
run_log_reg(100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.476440,0.463158,0.469816
1,2,0.465969,0.457895,0.461942
2,3,0.418848,0.494737,0.456693
3,4,0.434555,0.473684,0.454068
4,5,0.471204,0.431579,0.451444
...,...,...,...,...
96,97,0.465969,0.463158,0.464567
97,98,0.460733,0.536842,0.498688
98,99,0.439791,0.484211,0.461942
99,100,0.408377,0.526316,0.467192


In [97]:
run_lda(100)

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.539267,0.636842,0.587927
1,2,0.575916,0.594737,0.585302
2,3,0.445026,0.673684,0.559055
3,4,0.507853,0.600000,0.553806
4,5,0.439791,0.621053,0.530184
...,...,...,...,...
96,97,0.544503,0.584211,0.564304
97,98,0.439791,0.626316,0.532808
98,99,0.502618,0.657895,0.580052
99,100,0.523560,0.631579,0.577428


In [98]:
run_clf(100)

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.418848,0.321053,0.370079
1,2,0.376963,0.363158,0.370079
2,3,0.376963,0.363158,0.370079
3,4,0.371728,0.368421,0.370079
4,5,0.356021,0.384211,0.370079
...,...,...,...,...
96,97,0.356021,0.384211,0.370079
97,98,0.345550,0.394737,0.370079
98,99,0.361257,0.378947,0.370079
99,100,0.382199,0.357895,0.370079
