In [46]:
# import the necessary libraries

# numpy for numerical computations and arrays
import numpy as np
import pandas as pd

# for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# import the support vector classifier (SVC)
from sklearn.svm import SVC

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, recall_score, average_precision_score
from sklearn.metrics import roc_curve

# for tuning the model 
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# import libraries for visualizing results
from matplotlib import pyplot as plt
import seaborn as sns

# import utility functions
#from Utils.evaluate import Evaluator


# Import the data

In [40]:
# Read the three CSV inputs, in this order:
#   df          - the scaled training data
#   df_assess   - the assessment survey data set
#   X_assess_df - the scaled assessment features
df, df_assess, X_assess_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ProcessedData/train_scaled.csv',
        'Data/Surveydata_test.csv',
        'ProcessedData/X_test_scaled.csv',
    )
)

# preview the first rows of the training data
df.head()

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Arrival_Time_Convenient,...,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,Overall_Experience
0,-0.986183,0.469409,0.832354,-0.667882,-1.046703,-1.660469,-0.384168,-0.26125,-0.729514,1.347471,...,-1.152971,-0.397119,-1.127257,-1.176984,-0.387739,-1.467601,0.521801,-1.479684,-1.812778,0
1,1.01401,0.469409,0.5677,1.49727,0.95538,0.215099,-0.148115,-0.390872,-1.502953,1.347471,...,-1.949232,0.367413,0.405643,1.221517,-1.171161,-2.332568,-1.065508,0.25687,0.499214,0
2,-0.986183,0.469409,0.236883,-0.667882,-1.046703,-0.892926,1.635395,2.694135,-0.729514,-0.85108,...,0.439551,1.131945,1.172093,1.221517,1.179105,1.127302,0.521801,1.125147,1.269878,1
3,-0.986183,0.469409,0.303047,-0.667882,-1.046703,-1.166284,-0.043202,0.075767,0.043925,-0.85108,...,-1.152971,-0.397119,-0.360807,-0.377484,-0.387739,-0.602633,0.521801,-0.611407,-0.27145,0
4,-0.986183,0.469409,0.700027,-0.667882,-1.046703,0.002054,-0.384168,-0.390872,0.043925,-0.118229,...,0.439551,1.131945,0.405643,0.422017,0.395683,0.262334,0.521801,0.25687,0.499214,1


# Prepare the data

# Splitting data into features and target

In [41]:
# split data into features and target

# class
# predictors: every column except the label
X = df.drop('Overall_Experience', axis=1)

# label: the Overall_Experience column on its own
Y = df.loc[:, 'Overall_Experience']

# Splitting data into test and training sets

In [42]:
# split data into train and test sets
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

# Functions for model evaluation

In [43]:
# Creating metric function 
def metrics_score(model, x_test, y_test, class_names=None):
    """Print a classification report and plot the confusion matrix of a model.

    Parameters
    ----------
    model : object
        Fitted estimator; only its ``predict`` method is used.
    x_test : array-like
        Features passed to ``model.predict``.
    y_test : array-like
        True labels corresponding to ``x_test``.
    class_names : sequence of str, optional
        Tick labels for the heatmap, one per class, in sorted label order.
        When omitted, the class labels found in ``y_test`` and the
        predictions are used, so the plot is never mislabeled.

    Returns
    -------
    None
        The report is printed and the heatmap is shown as side effects.

    Raises
    ------
    ValueError
        If ``class_names`` does not contain exactly one name per class.
    """
    y_pred = model.predict(x_test)

    # Sorted union of observed and predicted labels. Passing it to
    # confusion_matrix pins the row/column order the tick labels rely on.
    class_labels = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
    )

    if class_names is None:
        class_names = [str(label) for label in class_labels]
    elif len(class_names) != len(class_labels):
        raise ValueError(
            "class_names has %d entries but %d classes were found"
            % (len(class_names), len(class_labels))
        )

    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8,5))

    # The cells are integer counts, so annotate them as integers.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# Model Building

## Model 1 (Linear Baseline)

In [25]:
#  initialize a SVM with linear decision boundaries
# Baseline: support vector classifier with a linear kernel.
# Class 1 is weighted 0.55 and class 0 is weighted 0.45.
model1 = SVC(
    kernel='linear',
    class_weight={1: 0.55, 0: 0.45},
    random_state=1,
)

# train on the training split; the fitted estimator is the cell's output
model1.fit(X_train, Y_train)

In [26]:
# predictions of the linear baseline on the held-out test split
y_pred = model1.predict(X=X_test)

## Model 2 (RBF baseline)

In [32]:
# initialize the model 
# Baseline: support vector classifier with an RBF kernel and the same
# class weights as the linear model (class 1: 0.55, class 0: 0.45).
model2 = SVC(
    kernel='rbf',
    class_weight={1: 0.55, 0: 0.45},
    random_state=1,
)

# train on the training split; the fitted estimator is the cell's output
model2.fit(X_train, Y_train)

The RBF kernel improves accuracy over the linear baseline, and the model shows no sign of overfitting.

## Model 3 - Tuned SVM

In [None]:
# initialize the estimator to be tuned
# initialize the estimator to be tuned
# gamma is only meaningful for the RBF kernel, so the kernel is named
# explicitly rather than relying on SVC's default
model3 = SVC(kernel='rbf', class_weight={1: 0.55, 0: 0.45}, random_state=1)

# range of C values to test: 13 values, log-spaced from 1e-2 to 1e10
C_range = np.logspace(-2, 10, 13)

# range of gammas to test: 13 values, log-spaced from 1e-9 to 1e3
gamma_range = np.logspace(-9, 3, 13)

# cross validation: 5 stratified random splits, 20% held out in each,
# with a fixed seed so the splits are reproducible
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# grid of params to choose from
params3 = dict(gamma=gamma_range, C=C_range)

# initialize the grid
# 13 x 13 candidates x 5 splits = 845 independent fits, so they are spread
# over all available cores (n_jobs=-1); the search result is unchanged.
# NOTE(review): this may still be slow on a large training set - consider a
# coarser grid if the cell does not finish.
grid = GridSearchCV(model3, param_grid=params3, cv=cv, n_jobs=-1)

grid.fit(X_train, Y_train)

# report the best combination and its mean cross-validated score
print(
    "The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_)
)