In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict, LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
import matplotlib.pyplot as plt

import random
from datetime import datetime

In [2]:
# Solvent descriptor table read from the 'solvent' sheet, keyed by solvent name.
solvent_data = pd.read_excel(
    '/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name = 'solvent',
).set_index('solvent_name')
# Descriptor column names, reused when building per-reaction property dicts.
solvent_columns = solvent_data.columns.tolist()

In [3]:
# Nucleophile descriptor table read from the 'nucleophiles' sheet, keyed by nucleophile name.
nucleophile_data = pd.read_excel(
    '/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name = 'nucleophiles',
).set_index('nucleophile_name')
# Descriptor column names, reused when building per-reaction property dicts.
nucleophile_columns = nucleophile_data.columns.tolist()

In [4]:
# Load the 'model_catalyst' sheet and expand each "R/S" catalyst entry into
# separate R and S rows.
# NOTE(review): the following cell reassigns raw_catalyst_data, catalyst_data and
# catalyst_columns from the 'full_catalyst' sheet, so on a top-to-bottom run this
# cell's result is overwritten.
raw_catalyst_data = pd.read_excel('/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx', sheet_name = 'model_catalyst')
raw_catalyst_data.rename(columns = {'Unnamed: 1': 'catalyst_name'}, inplace = True)

# As in the original loop, only the first 17 rows have their name (column
# position 1) rewritten. Slicing (instead of indexing row by row) also copes with
# sheets that hold fewer than 17 rows.
N_PAIRED_CATALYSTS = 17

enantiomer_frames = {}
for label in ('R', 'S'):
    frame = raw_catalyst_data.copy()
    paired_names = frame.iloc[:N_PAIRED_CATALYSTS, 1].str.replace('R/S', label, regex = False)
    # Assign the raw values so the write is purely positional.
    frame.iloc[:N_PAIRED_CATALYSTS, 1] = paired_names.to_numpy()
    enantiomer_frames[label] = frame
r_catalyst_data = enantiomer_frames['R']
s_catalyst_data = enantiomer_frames['S']

catalyst_data = pd.concat([r_catalyst_data, s_catalyst_data])
catalyst_data.set_index('catalyst_name', inplace = True)
# Rows that were not renamed are present in both copies and would give duplicate
# index labels, which makes catalyst_data.loc[name, column] return a Series
# instead of a scalar. Keep a single copy of each name.
catalyst_data = catalyst_data[~catalyst_data.index.duplicated(keep = 'first')]
catalyst_columns = list(catalyst_data.columns)

In [5]:
# Load the 'full_catalyst' sheet and expand each "R/S" catalyst entry into
# separate R and S rows; the result is the catalyst lookup table used by Reaction.
raw_catalyst_data = pd.read_excel('/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx', sheet_name = 'full_catalyst')
# The name column's header in this sheet is the note 'last catalyst only S used'.
raw_catalyst_data.rename(columns = {'last catalyst only S used': 'catalyst_name'}, inplace = True)

# As in the original loop, only the first 17 rows have their name (column
# position 1) rewritten. Slicing (instead of indexing row by row) also copes with
# sheets that hold fewer than 17 rows.
N_PAIRED_CATALYSTS = 17

#Taking care of catalyst R/S
enantiomer_frames = {}
for label in ('R', 'S'):
    frame = raw_catalyst_data.copy()
    paired_names = frame.iloc[:N_PAIRED_CATALYSTS, 1].str.replace('R/S', label, regex = False)
    # Assign the raw values so the write is purely positional.
    frame.iloc[:N_PAIRED_CATALYSTS, 1] = paired_names.to_numpy()
    enantiomer_frames[label] = frame
r_catalyst_data = enantiomer_frames['R']
s_catalyst_data = enantiomer_frames['S']

catalyst_data = pd.concat([r_catalyst_data, s_catalyst_data])
catalyst_data.set_index('catalyst_name', inplace = True)
# Rows past the first 17 are not renamed, so they are present in both copies and
# would give duplicate index labels; catalyst_data.loc[name, column] would then
# return a Series instead of a scalar. Keep a single copy of each name.
catalyst_data = catalyst_data[~catalyst_data.index.duplicated(keep = 'first')]

catalyst_columns = list(catalyst_data.columns)

In [6]:
# Iminium descriptor table from the 'iminiums' sheet, keyed by iminium name.
# The rename also strips the trailing space from the energy-difference header.
energy_col = 'electronic energy difference (kcal/mol)'
iminium_data = (
    pd.read_excel('/Users/jlisd/Downloads/AI Chem/AI Chem/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx', sheet_name = 'iminiums')
    .rename(columns = {'imine': 'iminium_name', 'electronic energy difference (kcal/mol) ': energy_col})
    .set_index('iminium_name')
    .drop(labels = ['Unnamed: 1'], axis = 1)
)

# Copy each (E)-iminium's energy difference onto the matching (Z)-iminium row,
# for iminium numbers 1 through 180.
for number in range(1, 181):
    iminium_data.loc[f'(Z)-Iminium {number}', energy_col] = iminium_data.loc[f'(E)-Iminium {number}', energy_col]

iminium_columns = iminium_data.columns.tolist()

In [7]:
class Reaction:
    """One reaction entry together with the descriptors of its components.

    The constructor stores the raw fields as attributes and then copies, from
    the module-level lookup tables (``solvent_data``, ``catalyst_data``,
    ``nucleophile_data``, ``iminium_data``), every column listed in the matching
    ``*_columns`` list into a plain ``{column: value}`` dict.

    Both the ``'(E)-'`` and the ``'(Z)-'`` prefixed iminium rows are looked up,
    whatever ``iminium_type`` is; a name missing from a table raises ``KeyError``.
    """

    def __init__(self, name, entry, catalyst, nucleophile, substrate, solvent, iminium_type, iminium, majorenantiomer, minorenantiomer, ee, G):
        # Raw fields, stored exactly as passed in.
        self.name, self.entry = name, entry
        self.catalyst, self.nucleophile = catalyst, nucleophile
        self.substrate, self.solvent = substrate, solvent
        self.iminium_type, self.iminium = iminium_type, iminium
        self.majorenantiomer, self.minorenantiomer = majorenantiomer, minorenantiomer
        self.ee, self.G = ee, G

        # Descriptor dicts, one scalar lookup per column.
        self.solvent_properties = {col: solvent_data.loc[solvent, col] for col in solvent_columns}
        self.catalyst_properties = {col: catalyst_data.loc[catalyst, col] for col in catalyst_columns}
        self.nucleophile_properties = {col: nucleophile_data.loc[nucleophile, col] for col in nucleophile_columns}

        # Row labels of the two geometric isomers in iminium_data.
        self.e_iminium = f'(E)-{iminium!s}'
        self.z_iminium = f'(Z)-{iminium!s}'

        self.e_iminium_properties = {col: iminium_data.loc[self.e_iminium, col] for col in iminium_columns}
        self.z_iminium_properties = {col: iminium_data.loc[self.z_iminium, col] for col in iminium_columns}

    def __repr__(self):
        return f"Reaction - {self.name}"

In [8]:
# Registry of every loaded reaction, keyed by its generated name.
reactions = {}

def process_data(reaction_number, reaction, iminium_type, sheetname = None):
    """Load one reaction workbook sheet and register a Reaction per entry.

    The workbook path is built from ``reaction_number`` and ``reaction``. When
    ``sheetname`` is None the workbook's default sheet is read, otherwise the
    named sheet. Entries are looked up by the integer labels 1..len(sheet) in
    the 'entry' column, and each one is stored in the module-level ``reactions``
    dict under '<reaction> [<sheetname>] <entry>'.
    """
    reaction_file = f'/Users/jlisd/Downloads/AI Chem/AI Chem/reaction info 5-23/{reaction_number!s} {reaction}.xlsx'
    read_options = {} if sheetname is None else {'sheet_name': sheetname}
    data = pd.read_excel(reaction_file, **read_options).set_index('entry')

    # Common prefix of every reaction name produced from this sheet.
    name_prefix = reaction if sheetname is None else reaction + ' ' + sheetname

    for entry in range(1, len(data) + 1):
        reaction_name = name_prefix + ' ' + str(entry)
        reactions[reaction_name] = Reaction(
            name = reaction_name,
            entry = entry,
            catalyst = data.loc[entry, 'Catalyst'],
            nucleophile = data.loc[entry, 'Nucleophile'],
            substrate = data.loc[entry, 'Substrate'],
            solvent = data.loc[entry, 'Solvent'],
            iminium_type = iminium_type,
            iminium = data.loc[entry, 'Iminium'],
            majorenantiomer = data.loc[entry, 'Major Enantiomer'],
            minorenantiomer = data.loc[entry, 'Minor Enantiomer'],
            ee = data.loc[entry, 'ee'],
            G = data.loc[entry, 'ΔΔG‡'],
        )
        


In [9]:
# Every workbook to load: (file number, reaction name, iminium geometry, sheets).
# Sheets are processed in the listed order.
reaction_sheets = [
    (1, 'Addition of Alcohols', 'E', ['Scope']),
    (2, 'Addition of thiols', 'E', ['Catalyst & solvent screening da', 'Effect of catalyst loading', 'Imine scope', 'Thiol scope']),
    (3, 'Hydrophosphonylation of imines', 'E', ['Catalyst screening data', 'Scope']),
    (4, 'Addition of diazomethylphosphonates', 'E', ["Optimization of catalyst and re", "Imine scope"]),
    (5, 'Addition of diazoacetamides', 'E', ['Catalyst screening data', 'Solvent screening data', 'Substrate(s) scope']),
    (6, 'Strecker Reaction (with aldimines)', 'E', ['Catalyst screening data', 'Solvent screening data', 'Imine scope']),
    (7, 'Peroxidation of imines', 'E', ['Catalyst screening data', 'Solvent screening data', 'Substrate(s) scope']),
    (8, 'Transfer Hydrogenation of b,g-Alkynyl a-Imino Esters', 'E', ['Catalyst screening and reaction', 'Scope']),
    (9, 'Transfer Hydrogenation of Enamides', 'E', ['Scope']),
    (10, 'Transfer Hydrogenation of N-aryl imines (List)', 'Z', ['Catalyst screening data', 'Imine scope']),
    (11, 'Transfer Hydrogenation of N-aryl imines (Rueping)', 'Z', ['Catalyst screening data', 'Solvent screening data', 'Imine scope']),
    (12, 'Reductive amination of N-aryl imines (Macmillan)', 'Z', ['Reaction optimization']),
    (13, 'Transfer Hydrogenation of trifluoromethyl ketimines', 'Z', ['Benzothiazoline screening data', 'Imine scope']),
    (14, 'Transfer Hydrogenation of N-aryl imines by benzothiazoline', 'Z', ['Catalyst screening data', 'Benzothiazoline screening data', 'Imine scope']),
    (15, 'Reductive amination of aliphatic ketones by benzothiazoline', 'Z', ['Catalyst screening data', 'Imine scope']),
    (16, 'Transfer Hydrogenation of ethyl ketimines', 'Z', ['Benzothiazoline screening data', 'Scope 1 benzothiazoline', 'Scope 2 dihydropyridine']),
    (17, 'Strecker Reaction (with ketimines)', 'Z', ['Catalyst screening data', 'Solvent data', 'Imine scope']),
]

for number, title, geometry, sheets in reaction_sheets:
    for sheet in sheets:
        process_data(number, title, geometry, sheet)

# Left disabled in the notebook:
#process_data(18, 'Addition of enecarbamates to benzoyl imines')
#process_data(19, 'Hydrogenation of fluorinated alkynyl ketimines')
#process_data(20, 'Addition of thiols to imines (Denmark)')

In [10]:
# Per-reaction catalyst descriptors: one row per entry of `reactions` (in dict
# order), one column per catalyst descriptor, index named 'Reaction'.
# The frame is built in a single constructor call; assigning cell by cell with
# .loc on an initially empty frame reallocates it for every new row and leaves
# every column with object dtype.
catalyst = pd.DataFrame(
    [rxn.catalyst_properties for rxn in reactions.values()],
    index = pd.Index(list(reactions), name = 'Reaction'),
    columns = catalyst_columns,
)

#first column is numerical

In [11]:
# Per-reaction nucleophile descriptors: one row per entry of `reactions` (in
# dict order), one column per nucleophile descriptor, index named 'Reaction'.
# The frame is built in a single constructor call; assigning cell by cell with
# .loc on an initially empty frame reallocates it for every new row and leaves
# every column with object dtype.
nucleophile = pd.DataFrame(
    [rxn.nucleophile_properties for rxn in reactions.values()],
    index = pd.Index(list(reactions), name = 'Reaction'),
    columns = nucleophile_columns,
)

#first column is numerical

In [12]:
# Per-reaction solvent descriptors: one row per entry of `reactions` (in dict
# order), one column per solvent descriptor, index named 'Reaction'.
# The frame is built in a single constructor call; assigning cell by cell with
# .loc on an initially empty frame reallocates it for every new row and leaves
# every column with object dtype.
solvent = pd.DataFrame(
    [rxn.solvent_properties for rxn in reactions.values()],
    index = pd.Index(list(reactions), name = 'Reaction'),
    columns = solvent_columns,
)

#first column is numerical

In [13]:
# Per-reaction iminium descriptors. Each reaction contributes the descriptors of
# the isomer named by its iminium_type ('E' or 'Z'); reactions with any other
# iminium_type get no row, as before.
# Rows are collected first and the frame is built once, instead of enlarging an
# empty frame one cell at a time with .loc.
properties_attribute = {'E': 'e_iminium_properties', 'Z': 'z_iminium_properties'}

iminium_rows = []
iminium_row_names = []
for reaction_name, rxn in reactions.items():
    attribute = properties_attribute.get(rxn.iminium_type)
    if attribute is None:
        continue
    iminium_rows.append({'iminium_type': rxn.iminium_type, **getattr(rxn, attribute)})
    iminium_row_names.append(reaction_name)

iminium = pd.DataFrame(
    iminium_rows,
    index = pd.Index(iminium_row_names, name = 'Reaction'),
    columns = ['iminium_type'] + iminium_columns,
)
            

In [14]:
# Preview the first rows of the per-reaction iminium descriptor frame.
iminium.head()

Unnamed: 0_level_0,iminium_type,nNH,i NH,N,H (iminium),C,SubL,SubS,PG,PGL,...,SL,SB1,SB5,LL,LB1,LB5,HOMO (iminium),LUMO (iminium),Polarizability (iminium),electronic energy difference (kcal/mol)
Reaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Addition of Alcohols Scope 1,E,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 2,E,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 3,E,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 4,E,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67
Addition of Alcohols Scope 5,E,3569.31,83.7481,-0.533,0.427,0.306,-0.2,0.255,0.69,6.63,...,2.58,1.09,1.09,6.76,1.7,3.39,-0.44273,-0.23633,180.99,6.67


In [15]:
# Classification target: the 'iminium_type' label of every reaction, kept as a
# one-column frame indexed like `iminium`.
Y = iminium[['iminium_type']]

# Integer coding of the labels (0 for E, 1 for Z). It is defined here but not
# applied to Y, which keeps the string labels.
numerical = dict(E = 0, Z = 1)

In [16]:
# Display the label frame (one 'iminium_type' value per reaction).
Y

Unnamed: 0_level_0,iminium_type
Reaction,Unnamed: 1_level_1
Addition of Alcohols Scope 1,E
Addition of Alcohols Scope 2,E
Addition of Alcohols Scope 3,E
Addition of Alcohols Scope 4,E
Addition of Alcohols Scope 5,E
...,...
Strecker Reaction (with ketimines) Imine scope 6,Z
Strecker Reaction (with ketimines) Imine scope 7,Z
Strecker Reaction (with ketimines) Imine scope 8,Z
Strecker Reaction (with ketimines) Imine scope 9,Z


In [17]:
# Feature matrix: catalyst, nucleophile and solvent descriptors side by side,
# after removing each table's 'Ar group' / 'nucleophile' / 'solvent' column.
feature_blocks = [
    catalyst.drop(columns = ['Ar group']),
    nucleophile.drop(columns = ['nucleophile']),
    solvent.drop(columns = ['solvent']),
]
X = pd.concat(feature_blocks, axis = 1)
X.head()

Unnamed: 0_level_0,arhs,alhs,aavg,b,AREA(q),sin(AREA),Lwhole R,B1whole R,B5whole R,L2 R,...,CLOGP,MR,CMR,RMM,Density,no. of H bond donor,no. H bond acceptor,Polarizability (solvent),HOMO (solvent),LUMO (solvent)
Reaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Addition of Alcohols Scope 1,78.854,78.853,78.8535,55.161,61,-0.966118,6.94,1.73,5.72,9.12,...,0.711,22.2,2.2214,88.11,0.902,0,2,54.4,-0.35344,0.05588
Addition of Alcohols Scope 2,78.854,78.853,78.8535,55.161,61,-0.966118,6.94,1.73,5.72,9.12,...,0.711,22.2,2.2214,88.11,0.902,0,2,54.4,-0.35344,0.05588
Addition of Alcohols Scope 3,78.854,78.853,78.8535,55.161,61,-0.966118,6.94,1.73,5.72,9.12,...,0.711,22.2,2.2214,88.11,0.902,0,2,54.4,-0.35344,0.05588
Addition of Alcohols Scope 4,78.854,78.853,78.8535,55.161,61,-0.966118,6.94,1.73,5.72,9.12,...,0.711,22.2,2.2214,88.11,0.902,0,2,54.4,-0.35344,0.05588
Addition of Alcohols Scope 5,78.854,78.853,78.8535,55.161,61,-0.966118,6.94,1.73,5.72,9.12,...,0.711,22.2,2.2214,88.11,0.902,0,2,54.4,-0.35344,0.05588


In [18]:
# Out-of-sample feature and label tables, indexed by their 'Reaction' column.
# `index=` is not a read_excel parameter (current pandas raises TypeError for
# it); the index is set explicitly from the 'Reaction' column instead.
X_oos = pd.read_excel('X_oos.xlsx').set_index('Reaction')
Y_oos = pd.read_excel('Y_oos.xlsx').set_index('Reaction')

############################################################################################################################

End of Data Processing

############################################################################################################################

In [19]:
# Fraction of the reactions held out for testing in every train/test split.
test_size = 0.5

# random.seed() always returns None, so `random_state` has always been None:
# every train_test_split call draws a new, unseeded shuffle. That is now spelled
# out explicitly. Seeding with a datetime object is also rejected by newer
# Python versions, so the stdlib generator is reseeded from the default source.
random.seed()
random_state = None

In [21]:
# k-nearest-neighbours baseline: fit on one part of a random split, then report
# accuracy on the held-out part and on the training part.
knn = KNeighborsClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Fit on plain arrays (labels flattened to 1-D); predict on frames with the
# 'Reaction' index dropped.
knn.fit(X_train.to_numpy(), Y_train.to_numpy().ravel())
Y_pred = knn.predict(X_test.reset_index(drop = True))
Y_train_pred = knn.predict(X_train.reset_index(drop = True))

# Actual vs. predicted label per reaction.
results = pd.DataFrame({'Actual': Y_test.to_numpy().ravel(), 'Predicted': Y_pred}, index = Y_test.index)
train_results = pd.DataFrame({'Actual': Y_train.to_numpy().ravel(), 'Predicted': Y_train_pred}, index = Y_train.index)

count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 186/191
test accuracy: 0.9738219895287958
# of correct train predictions: 186/190
train accuracy: 0.9789473684210527


In [22]:
# Classify the out-of-sample reactions with the fitted k-NN model and show the labels.
OOS_pred = knn.predict(X_oos.reset_index(drop = True))

OOS_pred

array(['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'E', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z',
       'Z', 'Z', 'Z', 'Z', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E'],
      dtype=object)

In [20]:
# Decision-tree baseline: fit on one part of a random split, then report
# accuracy on the held-out part and on the training part.
decision_tree = DecisionTreeClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Fit on plain arrays (labels flattened to 1-D); predict on frames with the
# 'Reaction' index dropped.
decision_tree.fit(X_train.to_numpy(), Y_train.to_numpy().ravel())
Y_pred = decision_tree.predict(X_test.reset_index(drop = True))
Y_train_pred = decision_tree.predict(X_train.reset_index(drop = True))

# Actual vs. predicted label per reaction.
results = pd.DataFrame({'Actual': Y_test.to_numpy().ravel(), 'Predicted': Y_pred}, index = Y_test.index)
train_results = pd.DataFrame({'Actual': Y_train.to_numpy().ravel(), 'Predicted': Y_train_pred}, index = Y_train.index)

count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 185/191
test accuracy: 0.9685863874345549
# of correct train predictions: 190/190
train accuracy: 1.0


In [21]:
# Random-forest baseline: fit on one part of a random split, then report
# accuracy on the held-out part and on the training part.
random_forest = RandomForestClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Fit on plain arrays (labels flattened to 1-D); predict on frames with the
# 'Reaction' index dropped.
random_forest.fit(X_train.to_numpy(), Y_train.to_numpy().ravel())
Y_pred = random_forest.predict(X_test.reset_index(drop = True))
Y_train_pred = random_forest.predict(X_train.reset_index(drop = True))

# Actual vs. predicted label per reaction.
results = pd.DataFrame({'Actual': Y_test.to_numpy().ravel(), 'Predicted': Y_pred}, index = Y_test.index)
train_results = pd.DataFrame({'Actual': Y_train.to_numpy().ravel(), 'Predicted': Y_train_pred}, index = Y_train.index)

count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 185/191
test accuracy: 0.9685863874345549
# of correct train predictions: 188/190
train accuracy: 0.9894736842105263


In [22]:
# Logistic-regression baseline: fit on one part of a random split, then report
# accuracy on the held-out part and on the training part.
# NOTE(review): the recorded output below ends with a "TOTAL NO. of ITERATIONS
# REACHED LIMIT" notice, i.e. the default solver did not converge on these features.
log_reg = linear_model.LogisticRegression()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Fit on plain arrays (labels flattened to 1-D); predict on frames with the
# 'Reaction' index dropped.
log_reg.fit(X_train.to_numpy(), Y_train.to_numpy().ravel())
Y_pred = log_reg.predict(X_test.reset_index(drop = True))
Y_train_pred = log_reg.predict(X_train.reset_index(drop = True))

# Actual vs. predicted label per reaction.
results = pd.DataFrame({'Actual': Y_test.to_numpy().ravel(), 'Predicted': Y_pred}, index = Y_test.index)
train_results = pd.DataFrame({'Actual': Y_train.to_numpy().ravel(), 'Predicted': Y_train_pred}, index = Y_train.index)

count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 182/191
test accuracy: 0.9528795811518325
# of correct train predictions: 187/190
train accuracy: 0.9842105263157894


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
# Linear-discriminant-analysis baseline: fit on one part of a random split, then
# report accuracy on the held-out part and on the training part.
lda = LinearDiscriminantAnalysis()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Fit on plain arrays (labels flattened to 1-D); predict on frames with the
# 'Reaction' index dropped.
lda.fit(X_train.to_numpy(), Y_train.to_numpy().ravel())
Y_pred = lda.predict(X_test.reset_index(drop = True))
Y_train_pred = lda.predict(X_train.reset_index(drop = True))

# Actual vs. predicted label per reaction.
results = pd.DataFrame({'Actual': Y_test.to_numpy().ravel(), 'Predicted': Y_pred}, index = Y_test.index)
train_results = pd.DataFrame({'Actual': Y_train.to_numpy().ravel(), 'Predicted': Y_train_pred}, index = Y_train.index)

count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 178/191
test accuracy: 0.9319371727748691
# of correct train predictions: 184/190
train accuracy: 0.968421052631579


In [24]:
# Support-vector-classifier baseline: fit on one part of a random split, then
# report accuracy on the held-out part and on the training part.
clf = svm.SVC()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

# Fit on plain arrays (labels flattened to 1-D); predict on frames with the
# 'Reaction' index dropped.
clf.fit(X_train.to_numpy(), Y_train.to_numpy().ravel())
Y_pred = clf.predict(X_test.reset_index(drop = True))
Y_train_pred = clf.predict(X_train.reset_index(drop = True))

# Actual vs. predicted label per reaction.
results = pd.DataFrame({'Actual': Y_test.to_numpy().ravel(), 'Predicted': Y_pred}, index = Y_test.index)
train_results = pd.DataFrame({'Actual': Y_train.to_numpy().ravel(), 'Predicted': Y_train_pred}, index = Y_train.index)

count = int((results['Actual'] == results['Predicted']).sum())
print(f'# of correct test predictions: {count}/{len(Y_test)}')
print(f'test accuracy: {count / len(Y_test)}')

train_count = int((train_results['Actual'] == train_results['Predicted']).sum())
print(f'# of correct train predictions: {train_count}/{len(Y_train)}')
print(f'train accuracy: {train_count / len(Y_train)}')

# of correct test predictions: 110/191
test accuracy: 0.5759162303664922
# of correct train predictions: 112/190
train accuracy: 0.5894736842105263


###############################################################################################################################

Run Model 100 times

###############################################################################################################################

In [20]:
def run_knn(iterations):
    """Repeat the k-nearest-neighbours train/test experiment and tabulate accuracies.

    Every iteration splits the module-level ``X``/``Y`` with the module-level
    ``test_size`` and ``random_state``, fits a new default
    ``KNeighborsClassifier`` on the training part and scores it on both parts.

    Parameters
    ----------
    iterations : int
        Number of train/test repetitions.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc' and 'total acc': one row
        per repetition (numbered from 1) followed by a final row whose
        'iteration' is 'average' and whose accuracies are the column means.
    """
    metric_columns = ['test acc', 'train acc', 'total acc']

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    records = []
    for i in range(iterations):
        #develop model & scores
        knn = KNeighborsClassifier()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        knn.fit(np.array(X_train.reset_index().drop('Reaction', axis = 1)), np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1))
        Y_pred = knn.predict(X_test.reset_index().drop(['Reaction'], axis = 1))
        Y_train_pred = knn.predict(X_train.reset_index().drop(['Reaction'], axis = 1))

        # Predictions come back in row order, so they can be compared
        # positionally with the flattened labels.
        count = int(np.sum(Y_pred == Y_test.to_numpy().ravel()))
        train_count = int(np.sum(Y_train_pred == Y_train.to_numpy().ravel()))

        records.append({'iteration': i+1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    #add average for each row
    averages = pd.DataFrame(records, columns = metric_columns).astype(float).mean(axis = 0)
    records.append({'iteration': 'average',
                    'test acc': averages['test acc'],
                    'train acc': averages['train acc'],
                    'total acc': averages['total acc']})

    # The original called scores.set_index('iteration') without keeping the
    # result, so 'iteration' has always been returned as a regular column;
    # that shape is preserved.
    return pd.DataFrame(records, columns = ['iteration'] + metric_columns)

In [21]:
def run_decision_tree(iterations):
    """Repeat the decision-tree train/test experiment and tabulate accuracies.

    Every iteration splits the module-level ``X``/``Y`` with the module-level
    ``test_size`` and ``random_state``, fits a new default
    ``DecisionTreeClassifier`` on the training part and scores it on both parts.

    Parameters
    ----------
    iterations : int
        Number of train/test repetitions.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc' and 'total acc': one row
        per repetition (numbered from 1) followed by a final row whose
        'iteration' is 'average' and whose accuracies are the column means.
    """
    metric_columns = ['test acc', 'train acc', 'total acc']

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    records = []
    for i in range(iterations):
        #develop model & scores
        decision_tree = DecisionTreeClassifier()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        decision_tree.fit(np.array(X_train.reset_index().drop('Reaction', axis = 1)), np.array(Y_train.reset_index().drop('Reaction', axis = 1)).reshape(-1))
        Y_pred = decision_tree.predict(X_test.reset_index().drop(['Reaction'], axis = 1))
        Y_train_pred = decision_tree.predict(X_train.reset_index().drop(['Reaction'], axis = 1))

        # Predictions come back in row order, so they can be compared
        # positionally with the flattened labels.
        count = int(np.sum(Y_pred == Y_test.to_numpy().ravel()))
        train_count = int(np.sum(Y_train_pred == Y_train.to_numpy().ravel()))

        records.append({'iteration': i+1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    #add average for each row
    averages = pd.DataFrame(records, columns = metric_columns).astype(float).mean(axis = 0)
    records.append({'iteration': 'average',
                    'test acc': averages['test acc'],
                    'train acc': averages['train acc'],
                    'total acc': averages['total acc']})

    # The original called scores.set_index('iteration') without keeping the
    # result, so 'iteration' has always been returned as a regular column;
    # that shape is preserved.
    return pd.DataFrame(records, columns = ['iteration'] + metric_columns)

In [26]:
def run_random_forest(iterations):
    """Repeatedly fit a RandomForestClassifier; summarise accuracy and feature importance.

    Every iteration calls ``train_test_split`` on the notebook-level ``X`` and ``Y``
    (with the notebook-level ``test_size`` and ``random_state``), fits a new forest
    on the training part, and records the fraction of correct predictions on the
    test part, on the training part, and on both combined.

    NOTE(review): ``random_state`` is read from a notebook global; if it is a fixed
    integer every iteration reuses the same split -- confirm it is ``None``.

    Parameters
    ----------
    iterations : int
        Number of fit/evaluate rounds. With 0 the 'average' row is NaN and every
        importance is 0 (the original raised on an unbound ``total_features``).

    Returns
    -------
    scores : pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc', 'total acc'; one row per
        iteration followed by a row whose 'iteration' is the string 'average'.
    total_features : pandas.DataFrame
        Indexed by 'Feature', columns 'type' and 'Importance', sorted by
        descending 'Importance'. 'Importance' is the SUM of the forest's
        ``feature_importances_`` over all iterations; 'type' names the first of
        the solvent / nucleophile / catalyst / iminium column lists containing
        the feature (NaN when none does).
    """
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']
    metric_columns = score_columns[1:]

    def without_reaction(frame):
        # 'Reaction' is the index label of X / Y, not a model input.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    importance_sum = np.zeros(len(X.columns))
    for i in range(iterations):
        random_forest = RandomForestClassifier()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Plain arrays for both fit and predict, so sklearn never sees feature
        # names at predict time that were absent at fit time.
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        random_forest.fit(train_inputs, train_labels)

        count = int(np.sum(random_forest.predict(test_inputs) == test_labels))
        train_count = int(np.sum(random_forest.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

        # feature_importances_ is ordered like the training columns, i.e. X.columns.
        importance_sum += random_forest.feature_importances_

    # Average over the per-iteration rows only, addressed by column name.
    averages = pd.DataFrame(records, columns = metric_columns).astype(float).mean()
    records.append({'iteration': 'average', **averages.to_dict()})
    scores = pd.DataFrame(records, columns = score_columns)

    # Which descriptor family each feature belongs to (first match wins).
    feature_groups = (('solvent', solvent_columns),
                      ('nucleophile', nucleophile_columns),
                      ('catalyst', catalyst_columns),
                      ('iminium', iminium_columns))

    def group_of(feature):
        for label, members in feature_groups:
            if feature in members:
                return label
        return np.nan

    total_features = pd.DataFrame({'type': [group_of(feature) for feature in X.columns],
                                   'Importance': importance_sum},
                                  index = pd.Index(X.columns, name = 'Feature'))
    total_features = total_features.sort_values(by = ['Importance'], ascending = False)

    return scores, total_features

In [23]:
def run_log_reg(iterations, max_iter = 100):
    """Repeatedly fit a LogisticRegression and tabulate its accuracy.

    Every iteration calls ``train_test_split`` on the notebook-level ``X`` and ``Y``
    (with the notebook-level ``test_size`` and ``random_state``), fits a new model
    on the training part, and records the fraction of correct predictions on the
    test part, on the training part, and on both combined.

    NOTE(review): ``random_state`` is read from a notebook global; if it is a fixed
    integer every iteration reuses the same split -- confirm it is ``None``.

    Parameters
    ----------
    iterations : int
        Number of fit/evaluate rounds. With 0 only a NaN 'average' row is returned.
    max_iter : int, optional
        Passed to ``LogisticRegression``. The default of 100 is scikit-learn's own
        default, so existing calls behave as before; raise it (or scale the
        features) to get rid of the solver's "ITERATIONS REACHED LIMIT" warnings.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc', 'total acc'; one row per
        iteration followed by a row whose 'iteration' is the string 'average'.
    """
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']
    metric_columns = score_columns[1:]

    def without_reaction(frame):
        # 'Reaction' is the index label of X / Y, not a model input.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    for i in range(iterations):
        log_reg = linear_model.LogisticRegression(max_iter = max_iter)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Plain arrays for both fit and predict, so sklearn never sees feature
        # names at predict time that were absent at fit time.
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        log_reg.fit(train_inputs, train_labels)

        count = int(np.sum(log_reg.predict(test_inputs) == test_labels))
        train_count = int(np.sum(log_reg.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    # Average over the per-iteration rows only, addressed by column name.
    averages = pd.DataFrame(records, columns = metric_columns).astype(float).mean()
    records.append({'iteration': 'average', **averages.to_dict()})

    return pd.DataFrame(records, columns = score_columns)

In [24]:
def run_lda(iterations):
    """Repeatedly fit a LinearDiscriminantAnalysis classifier and tabulate its accuracy.

    Every iteration calls ``train_test_split`` on the notebook-level ``X`` and ``Y``
    (with the notebook-level ``test_size`` and ``random_state``), fits a new model
    on the training part, and records the fraction of correct predictions on the
    test part, on the training part, and on both combined.

    NOTE(review): ``random_state`` is read from a notebook global; if it is a fixed
    integer every iteration reuses the same split -- confirm it is ``None``.

    Parameters
    ----------
    iterations : int
        Number of fit/evaluate rounds. With 0 only a NaN 'average' row is returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc', 'total acc'; one row per
        iteration followed by a row whose 'iteration' is the string 'average'.
    """
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']
    metric_columns = score_columns[1:]

    def without_reaction(frame):
        # 'Reaction' is the index label of X / Y, not a model input.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    for i in range(iterations):
        lda = LinearDiscriminantAnalysis()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Plain arrays for both fit and predict, so sklearn never sees feature
        # names at predict time that were absent at fit time.
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        lda.fit(train_inputs, train_labels)

        count = int(np.sum(lda.predict(test_inputs) == test_labels))
        train_count = int(np.sum(lda.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    # Average over the per-iteration rows only, addressed by column name.
    averages = pd.DataFrame(records, columns = metric_columns).astype(float).mean()
    records.append({'iteration': 'average', **averages.to_dict()})

    return pd.DataFrame(records, columns = score_columns)

In [25]:
def run_clf(iterations):
    """Repeatedly fit a support-vector classifier (``svm.SVC``) and tabulate its accuracy.

    Every iteration calls ``train_test_split`` on the notebook-level ``X`` and ``Y``
    (with the notebook-level ``test_size`` and ``random_state``), fits a new model
    on the training part, and records the fraction of correct predictions on the
    test part, on the training part, and on both combined.

    NOTE(review): ``random_state`` is read from a notebook global; if it is a fixed
    integer every iteration reuses the same split -- confirm it is ``None``.
    NOTE(review): the features are handed to SVC exactly as stored in ``X``; if
    they are not standardised upstream, consider scaling them first -- confirm.

    Parameters
    ----------
    iterations : int
        Number of fit/evaluate rounds. With 0 only a NaN 'average' row is returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'iteration', 'test acc', 'train acc', 'total acc'; one row per
        iteration followed by a row whose 'iteration' is the string 'average'.
    """
    score_columns = ['iteration', 'test acc', 'train acc', 'total acc']
    metric_columns = score_columns[1:]

    def without_reaction(frame):
        # 'Reaction' is the index label of X / Y, not a model input.
        return frame.reset_index().drop('Reaction', axis = 1).to_numpy()

    records = []
    for i in range(iterations):
        clf = svm.SVC()
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

        # Plain arrays for both fit and predict, so sklearn never sees feature
        # names at predict time that were absent at fit time.
        train_inputs = without_reaction(X_train)
        test_inputs = without_reaction(X_test)
        train_labels = without_reaction(Y_train).reshape(-1)
        test_labels = without_reaction(Y_test).reshape(-1)

        clf.fit(train_inputs, train_labels)

        # Evaluate on the held-out part and on the data the model was fitted on.
        count = int(np.sum(clf.predict(test_inputs) == test_labels))
        train_count = int(np.sum(clf.predict(train_inputs) == train_labels))

        records.append({'iteration': i + 1,
                        'test acc': count / len(Y_test),
                        'train acc': train_count / len(Y_train),
                        'total acc': (count + train_count) / (len(Y_test) + len(Y_train))})

    # Average over the per-iteration rows only, addressed by column name.
    averages = pd.DataFrame(records, columns = metric_columns).astype(float).mean()
    records.append({'iteration': 'average', **averages.to_dict()})

    return pd.DataFrame(records, columns = score_columns)

In [26]:
scores = run_knn(100)
print(scores)
# Spread across the iterations only: the trailing 'average' summary row is left
# out (it sits exactly on the mean and would shrink the std), and the metric
# columns are selected by name so the mixed int/str 'iteration' column never
# reaches std().
print(scores.loc[scores['iteration'] != 'average', ['test acc', 'train acc', 'total acc']].astype(float).std())

    iteration  test acc  train acc  total acc
0           1  0.968586   0.984211   0.976378
1           2  0.963351   0.984211   0.973753
2           3  0.958115   0.989474   0.973753
3           4  0.958115   0.989474   0.973753
4           5  0.963351   0.984211   0.973753
..        ...       ...        ...        ...
96         97  0.968586   0.978947   0.973753
97         98  0.968586   0.978947   0.973753
98         99  0.963351   0.984211   0.973753
99        100  0.958115   0.989474   0.973753
100   average  0.970366   0.977737   0.974042

[101 rows x 4 columns]
test acc     0.008219
train acc    0.006897
total acc    0.003359
dtype: float64


In [27]:
scores = run_decision_tree(100)
print(scores)
# Spread across the iterations only: the trailing 'average' summary row is left
# out (it sits exactly on the mean and would shrink the std), and the metric
# columns are selected by name so the mixed int/str 'iteration' column never
# reaches std().
print(scores.loc[scores['iteration'] != 'average', ['test acc', 'train acc', 'total acc']].astype(float).std())

    iteration  test acc  train acc  total acc
0           1  0.968586   0.989474   0.979003
1           2  0.973822   0.978947   0.976378
2           3  0.963351   0.989474   0.976378
3           4  0.947644   1.000000   0.973753
4           5  0.958115   0.994737   0.976378
..        ...       ...        ...        ...
96         97  0.963351   0.994737   0.979003
97         98  0.963351   1.000000   0.981627
98         99  0.931937   1.000000   0.965879
99        100  0.963351   0.989474   0.976378
100   average  0.960000   0.993053   0.976483

[101 rows x 4 columns]
test acc     0.011315
train acc    0.005464
total acc    0.005416
dtype: float64


In [27]:
score, feature = run_random_forest(100)
print(score)
# Spread across the iterations only: the trailing 'average' summary row is left
# out (it sits exactly on the mean and would shrink the std), and the metric
# columns are selected by name so the mixed int/str 'iteration' column never
# reaches std().
print(score.loc[score['iteration'] != 'average', ['test acc', 'train acc', 'total acc']].astype(float).std())

    iteration  test acc  train acc  total acc
0           1  0.958115   1.000000   0.979003
1           2  0.952880   1.000000   0.976378
2           3  0.958115   1.000000   0.979003
3           4  0.968586   1.000000   0.984252
4           5  0.958115   0.989474   0.973753
..        ...       ...        ...        ...
96         97  0.979058   0.994737   0.986877
97         98  0.963351   0.989474   0.976378
98         99  0.968586   0.989474   0.979003
99        100  0.973822   0.994737   0.984252
100   average  0.970838   0.993000   0.981890

[101 rows x 4 columns]
test acc     0.009774
train acc    0.005720
total acc    0.004075
dtype: float64


In [34]:
# Combined importance of the 30 top-ranked features, followed by the first
# 60 rows of the ranking as the cell's displayed value.
print(sum(feature['Importance'].iloc[:30]))
feature.iloc[:60]

53.204073532904324


Unnamed: 0_level_0,type,Importance
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Polarizability (nucleophile),nucleophile,5.53762
Nu,nucleophile,4.984004
HOMO (nucleophile),nucleophile,4.716262
H-X-Nu,nucleophile,4.428404
H-X-CNu,nucleophile,2.924179
B1,nucleophile,2.448272
L,nucleophile,2.252715
nXH,nucleophile,2.247341
iXH,nucleophile,2.14017
bond distance H-X,nucleophile,1.901202


In [35]:
scores = run_log_reg(100)
print(scores)
# Spread across the iterations only: the trailing 'average' summary row is left
# out (it sits exactly on the mean and would shrink the std), and the metric
# columns are selected by name so the mixed int/str 'iteration' column never
# reaches std().
print(scores.loc[scores['iteration'] != 'average', ['test acc', 'train acc', 'total acc']].astype(float).std())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.942408,0.994737,0.968504
1,2,0.937173,0.957895,0.947507
2,3,0.947644,0.968421,0.958005
3,4,0.958115,0.963158,0.960630
4,5,0.968586,0.984211,0.976378
...,...,...,...,...
96,97,0.947644,0.984211,0.965879
97,98,0.958115,0.978947,0.968504
98,99,0.968586,0.973684,0.971129
99,100,0.958115,0.963158,0.960630


In [29]:
scores = run_lda(100)
print(scores)
# Spread across the iterations only: the trailing 'average' summary row is left
# out (it sits exactly on the mean and would shrink the std), and the metric
# columns are selected by name so the mixed int/str 'iteration' column never
# reaches std().
print(scores.loc[scores['iteration'] != 'average', ['test acc', 'train acc', 'total acc']].astype(float).std())

    iteration  test acc  train acc  total acc
0           1  0.963351   0.984211   0.973753
1           2  0.973822   0.984211   0.979003
2           3  0.958115   0.978947   0.968504
3           4  0.968586   0.994737   0.981627
4           5  0.910995   0.973684   0.942257
..        ...       ...        ...        ...
96         97  0.963351   0.989474   0.976378
97         98  0.931937   0.984211   0.958005
98         99  0.984293   0.989474   0.986877
99        100  0.931937   0.963158   0.947507
100   average  0.954555   0.983947   0.969213

[101 rows x 4 columns]
test acc     0.021071
train acc    0.009036
total acc    0.012603
dtype: float64


In [37]:
# Evaluate the SVC over 100 train/test rounds; the returned score table is the
# cell's displayed value (it is not stored in a variable).
# NOTE(review): in the recorded output 'total acc' is identical in every shown
# row while the test/train accuracies vary, which would be consistent with the
# model predicting one class regardless of input -- confirm.
run_clf(100)

Unnamed: 0,iteration,test acc,train acc,total acc
0,1,0.591623,0.573684,0.582677
1,2,0.586387,0.578947,0.582677
2,3,0.570681,0.594737,0.582677
3,4,0.554974,0.610526,0.582677
4,5,0.554974,0.610526,0.582677
...,...,...,...,...
96,97,0.607330,0.557895,0.582677
97,98,0.628272,0.536842,0.582677
98,99,0.586387,0.578947,0.582677
99,100,0.549738,0.615789,0.582677
