<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preparing-for-modeling" data-toc-modified-id="Preparing-for-modeling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preparing for modeling</a></span><ul class="toc-item"><li><span><a href="#Import-data" data-toc-modified-id="Import-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Train/Test-split" data-toc-modified-id="Train/Test-split-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Train/Test split</a></span></li></ul></li><li><span><a href="#Baseline-model:-Logistic-Regression" data-toc-modified-id="Baseline-model:-Logistic-Regression-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Baseline model: Logistic Regression</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression-by-default" data-toc-modified-id="Logistic-Regression-by-default-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Logistic Regression by default</a></span></li><li><span><a href="#Changing-the-inverse-of-regularization-strength" data-toc-modified-id="Changing-the-inverse-of-regularization-strength-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Changing the inverse of regularization strength</a></span></li></ul></li><li><span><a href="#K-Nearest-neighbors" data-toc-modified-id="K-Nearest-neighbors-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>K-Nearest neighbors</a></span><ul class="toc-item"><li><span><a href="#KNN-by-default" data-toc-modified-id="KNN-by-default-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>KNN by default</a></span></li><li><span><a href="#Changing-the-number-of-neighbors" data-toc-modified-id="Changing-the-number-of-neighbors-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Changing the number of neighbors</a></span></li><li><span><a href="#Hyper-parameters-tuning-with-grid-search" data-toc-modified-id="Hyper-parameters-tuning-with-grid-search-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Hyper-parameters tuning with grid search</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Preparing for modeling

### Import data 

In [2]:
loans_df = pd.read_csv('/Users/lasayin/Desktop/Springboard/GitHub/Springboard_DS_Capstone2/ProcessedData/loans_df.cvs')

### Train/Test split 

In [3]:
X = loans_df.drop(columns='funding_speed')
y = loans_df.funding_speed

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Baseline model: Logistic Regression

### Logistic Regression by default

In [7]:
# The default solver 'lbfgs' raise an AttributeError
lr = LogisticRegression(solver='liblinear')
scores_lr = cross_val_score(lr, X_train, y_train, cv=10)
average_lr = scores_lr.mean()

average_lr

0.7330984961527649

### Changing the inverse of regularization strength

In [8]:
C_param_range = [0.001,0.01,0.1,1,10,100]
lg_C_accuracy = pd.DataFrame(columns = ['C_parameter','Accuracy'])
lg_C_accuracy['C_parameter'] = C_param_range

index = 0
for i in C_param_range:
    lg = LogisticRegression(C = i, solver='liblinear', random_state = 42)
    scores = cross_val_score(lr, X_train, y_train, cv=5)
    lg_C_accuracy.iloc[index, 1] = scores.mean()
    index += 1
    
lg_C_accuracy

Unnamed: 0,C_parameter,Accuracy
0,0.001,0.733098
1,0.01,0.733098
2,0.1,0.733098
3,1.0,0.733098
4,10.0,0.733098
5,100.0,0.733098


## K-Nearest neighbors

### KNN by default

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
scores_knn = cross_val_score(knn, X_train, y_train, cv=5)
accuracy_knn = scores.mean()

accuracy_knn

### Changing the number of neighbors

In [10]:
knn_n_accuracy = {}

for k in range(1,30,2):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    knn_n_accuracy[k] = scores.mean()
    
pd.Series(knn_n_accuracy).plot.bar(figsize=(9,6))
print(pd.DataFrame(knn_n_accuracy), columns=['n_neighbors', 'Accuracy'])

KeyboardInterrupt: 

### Hyper-parameters tuning with grid search

In [None]:
hyperparameters = {
    "n_neighbors": range(1,20,2),
    "weights": ["distance", "uniform"],
    "p": [1,2]
}

knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid=hyperparameters, cv=10)
grid.fit(X_train, y_train)
best_params = grid.best_params_
best_score = grid.best_score_
best_knn = grid.best_estimator_
print(best_params)
print(best_score)
print(best_knn)