In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\johnt\anaconda3\lib\site-packages (0.0)


In [2]:
# Install joblib
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("DataOutgoingCalls.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,CallerDepts,Called,Accepted
0,ED,Jesus Lopez,Yes
1,ED,Juan Rendon,No
2,Emergency Medicine,ED Ward Clerk Area 1,Yes
3,Respiratory Therapy,Andre Richardson,Yes
4,ED,Juan Rendon,Yes


In [28]:
df["CallerDepts"] = df["CallerDepts"].astype("category").cat.codes
df["Called"] = df["Called"].astype("category").cat.codes
df["Accepted"] = df["Accepted"].astype("category").cat.codes

df.head()

Unnamed: 0,CallerDepts,Called,Accepted
191928,20,4695,1
110323,13,7221,1
148065,22,6951,1
171727,2,6051,1
78878,29,6651,1


In [6]:
import sklearn
from sklearn import svm
df = sklearn.utils.shuffle(df)
df = df.replace(',','', regex=True)
df.head()

Unnamed: 0,CallerDepts,Called,Accepted
191928,20,4695,1
110323,13,7221,1
148065,22,6951,1
171727,2,6051,1
78878,29,6651,1


# Select your features (columns)

In [7]:
# Set features. This will also be used as your x values.
feature = df.drop("Accepted", axis=1)

target = df["Accepted"]

print(feature.shape, target.shape)

(206051, 2) (206051,)


In [8]:
feature.head()

Unnamed: 0,CallerDepts,Called
191928,20,4695
110323,13,7221
148065,22,6951
171727,2,6051
78878,29,6651


# Create a Train Test Split

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, target, random_state=1000, stratify=target)

In [10]:
X_train.transform(pd.to_numeric, errors='coerce')
X_test.transform(pd.to_numeric, errors='coerce')
y_train.transform(pd.to_numeric, errors='coerce')
y_test.transform(pd.to_numeric, errors='coerce')

64836     1
198773    1
107919    1
170058    1
35521     1
         ..
205452    0
157100    0
197975    1
191490    1
189231    1
Name: Accepted, Length: 51513, dtype: int64

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [11]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Model



In [12]:
from sklearn.svm import SVC
model = SVC(kernel = "linear")
model.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [13]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.7256597082918117
Testing Data Score: 0.7256614835090172


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
grid = {"C":[1,5,10],"gamma":[0.0001,0.001,0.01]}
grid2 = GridSearchCV(model, grid,verbose=3)
grid2

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [15]:
# Train the model with GridSearch
grid2.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.726, total= 1.7min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.726, total= 1.7min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.5min remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.726, total= 1.8min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.726, total= 1.6min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.726, total= 1.6min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.726, total= 1.7min
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.726, total= 1.7min
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.726, total= 1.7min
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.726, total= 2.6min
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 48.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [16]:
print(grid2.best_params_)
print(grid2.best_score_)

{'C': 1, 'gamma': 0.0001}
0.7256597082918117


In [17]:
predictions = grid2.predict(X_test_scaled)
predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
# Classificaiton report 
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     14132
           1       0.73      1.00      0.84     37381

    accuracy                           0.73     51513
   macro avg       0.36      0.50      0.42     51513
weighted avg       0.53      0.73      0.61     51513

Accuracy: 0.7256614835090172


  'precision', 'predicted', average, warn_for)


In [29]:
import matplotlib.pyplot as plt
import numpy as np
