In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score


In [2]:
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

In [3]:
cat_cols = []

for col in train_df.columns:
    cat_cols.append(col)
    
cat_cols = cat_cols[1:20]

label_encoder = LabelEncoder()

for col in cat_cols:
    #test_df[col] = label_encoder.fit_transform(test_df[col])
    train_df[col] = label_encoder.fit_transform(train_df[col])
    
# set random seed to reproduce
np.random.seed(42)

# shuffle the dataset and drop the id column
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df = train_df.drop(columns='id', axis=1)

# split the data
X = train_df.drop('target', axis=1)
y = train_df.target

# split the data into train, test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
models = {
    'Logistic Regression': LogisticRegression(),
    'KNeighborsClassifier' : KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

In [5]:
def fit_and_score(models, X_train, X_test, y_train, y_test):
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train,y_train)
        model_scores[name] = model.score(X_test, y_test)        
    return model_scores

In [6]:
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': 0.8266111111111111,
 'KNeighborsClassifier': 0.8109222222222222,
 'RandomForestClassifier': 0.8462666666666666}

In [23]:
# with these results, 
# we're then going to try using manually tuning their hyperparameters 

rf_grid = {
    'n_estimators' : np.arange(600, 1500, 75),
    'max_depth' : [None, 1,3,5],
    'max_features': ['auto'],
    'min_samples_split' : np.arange(1,15,1),
    'min_samples_leaf' : np.arange(1,15,1),
    'bootstrap' : [True ,False]
}

In [None]:
# Experimentation on RandomForestClassifier
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                          param_distributions=rf_grid,
                          cv=5,
                          n_iter=5,
                          verbose=2,
                          n_jobs=-1
                          )
rs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [None]:
# Experimentation on LogisticRegression
rs_lr = RandomizedSearchCV(LogisticRegression(),
                          param_distributions=rf_grid,
                          cv=5,
                          n_iter=5,
                          verbose=2,
                          n_jobs=3
                          )
rs_rf.fit(X_train, y_train)

In [None]:
# Experimentation on KNN
rs_kn = RandomizedSearchCV(KNeighborsClassifier(),
                          param_distributions=rf_grid,
                          cv=5,
                          n_iter=5,
                          verbose=2,
                          n_jobs=3
                          )
rs_kn.fit(X_train, y_train)

In [None]:
print(f'RF: {rs_rf.best_params_}, LR:{rs_lr.best_params_}, KN:{rs_kn.best_params_} ')

In [None]:
import pickle

pickle.dump(rs_rf, open('gs_random_random_forest_model.pkl', 'wb'))
pickle.dump(rs_lr, open('gs_random_logistic_regression_model.pkl', 'wb'))
pickle.dump(rs_kn, open('gs_random_k_neighors_model.pkl', 'wb'))