In [2]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, accuracy_score, classification_report, log_loss
from math import sqrt

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Print NumPy floats in fixed-point notation ('{:f}') rather than scientific notation.
np.set_printoptions(formatter={'float_kind':'{:f}'.format})
# Default figure size (12 x 10) for every plot drawn through seaborn/matplotlib.
sns.set(rc={'figure.figsize':(12,10)})

In [4]:
# Loading the data
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences (\W, \D and \C are invalid escapes in a normal string).
df = pd.read_csv(r'C:\Work\Work\Data\CVA_explore.csv')

In [8]:
# Drop columns that are mostly null, completely irrelevant, or obviously
# multicollinear. Every label is listed exactly once.
columns_to_drop = [
    'C3_PSPotential123',
    'C4_PSPotentialABC',
    'C7_Average_Days_To_Pay',
    'C13_Cusomter_Preferred_Contract_Type',
    'C11_Customer_Prefered_Contract_Duration_Hours',
    'C12_Customer_Prefered_Contract_Duration_Months',
    'C14_Have_Signed_Contract_Before',
    'C16_Average_Contract_Value',
    'M20_AnnualUsage',
    'M24_Months_Until_Contract_Ends',
    'M26_Original_Contract_Length_In_Months',
    'M27_Original_Contract_Duration_In_Hours',
    'M28_Original_Contract_Type',
    'M29_Contract_Value',
    'M31_Average_Contract_Value_BaseModel_PM',
    'M32_Average_Contract_Value_BaseModel_MC',
    'M38_Average_Contract_Value_By_Months_PM',
    'M39_Average_Contract_Value_By_Months_MC',
    'M40_Average_Contract_Value_By_Hours_PM',
    'M41_Average_Contract_Value_By_Hours_MC',
    'M42_Estimated_Contract_Value_Customer_Preference',
    'New_CVA_Lead',
    'Renewal_CVA_Lead',
]
data = df.drop(columns=columns_to_drop)

# Report the remaining columns that still contain nulls, then discard every
# row that has at least one missing value.
null_columns = data.isnull().sum()
print(null_columns[null_columns > 0])
data = data.dropna()
data.shape

C1_PartsDCAL                124
C2_ServiceDCAL              124
M18_Age_In_Years            103
M19_LTDSMU                  177
DI4M                          1
WWM                           1
DIM                           1
Account                       1
PS_PotentialABC_High          1
PS_PotentialABC_Medium        1
PS_PotentialABC_Low           1
PS_PotentialABC_Inactive      1
dtype: int64


(10593, 30)

In [9]:
# Separate the target column from the feature matrix.
target_column = 'M17_Contract_Active'
Objective = data[target_column]

# y: target cast to int; X: every remaining column as a NumPy array.
y = data[target_column].astype(int)
X = data.drop(columns=target_column).values

print(X.shape)
print(y.shape)

(10593, 29)
(10593,)


In [10]:
# Fit an entropy-based decision tree (depth capped at 10, fixed seed) on all
# features; its feature importances are read in the next cell.
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    random_state=15,
)
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=15, splitter='best')

In [11]:
# Report the importance the fitted tree assigned to each feature and collect
# names and values in two parallel lists.
feature_columns = data.drop('M17_Contract_Active', axis=1).columns
importances = dt.feature_importances_

fi_col = []
fi = []

for i, column in enumerate(feature_columns):
    message = 'The feature importance for {} is : {}'.format(column, importances[i])
    print(message)

    fi_col.append(column)
    fi.append(importances[i])

The feature importance for EquipmentId is : 0.029102252662305945
The feature importance for C1_PartsDCAL is : 0.00687814343907175
The feature importance for C2_ServiceDCAL is : 0.005548401061859694
The feature importance for C8_Percentage_Parts_Purchased_Direct is : 0.008476125479081931
The feature importance for C9_Percentage_Labour_Purchased_PSTotal is : 0.009818351788334925
The feature importance for C10_Percentage_Parts_Purchased_ServiceTotal is : 0.01188663256442969
The feature importance for C15_Percentage_Equipment_Currently_On_Contract is : 0.52807150508362
The feature importance for M18_Age_In_Years is : 0.014890116235347717
The feature importance for M19_LTDSMU is : 0.024932249902940344
The feature importance for M21_Potentially_Disposed is : 0.0
The feature importance for M22_PLActive is : 0.19868367438239598
The feature importance for M23_OLGAAmount is : 0.014775192892310997
The feature importance for M25_Months_Until_Warranty_Ends is : 0.007684170757778557
The feature impo

In [12]:
# Build the importance table, most important feature first. reset_index()
# keeps the pre-sort position of each feature in an 'index' column.
fi_df = pd.DataFrame(
    {'Feature': fi_col, 'Feature Importance': fi},
    columns=['Feature', 'Feature Importance'],
)
fi_df = fi_df.sort_values('Feature Importance', ascending=False).reset_index()

# Keep (at most) the 40 highest-ranked features; with fewer than 40 rows
# this selects every feature.
columns_to_keep = fi_df['Feature'].head(40)

fi_df

Unnamed: 0,index,Feature,Feature Importance
0,6,C15_Percentage_Equipment_Currently_On_Contract,0.528072
1,10,M22_PLActive,0.198684
2,16,M34_Percentage_Parts_Purchased_ServiceTotal,0.039498
3,19,M37_SOS_Count,0.029151
4,0,EquipmentId,0.029102
5,8,M19_LTDSMU,0.024932
6,13,M30_Number_Of_Contracts_Under_Base_Model,0.022759
7,7,M18_Age_In_Years,0.01489
8,11,M23_OLGAAmount,0.014775
9,15,M33_Percentage_Labour_Purchased_PSTotal,0.014533


In [None]:
# Compare the shape of the cleaned frame with the shape obtained after
# restricting it to the selected feature columns.
selected = data[columns_to_keep]

print(data.shape)
print(selected.shape)

In [None]:
# Preview the first rows of the cleaned frame (columns dropped, null rows removed).
data.head()

In [None]:
# Rebuild X from the selected feature columns only; y is the target cast to int.
y = data['M17_Contract_Active'].astype(int)
X = data[columns_to_keep].values

print(X.shape)
print(y.shape)

In [None]:
# Hold-out validation in two steps, both with a fixed seed:
#   1. 80% train / 20% test
#   2. 10% of that training portion is set aside as a validation set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=15
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, train_size=0.9, random_state=15
)

# One shape per line: features first, then the matching targets.
print(X_train.shape, X_test.shape, X_valid.shape, sep='\n')

print(y_train.shape, y_test.shape, y_valid.shape, sep='\n')

# Official Doc: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Class balance of the validation targets (only y_valid is plotted here).

ax = sns.countplot(palette="Set3", x=y_valid)

## 8. Running Logistic Regression

In [None]:
# Fit a logistic regression (lbfgs solver, fixed seed) on the training split.

log_reg = LogisticRegression(
    solver='lbfgs',
    random_state=10,
)

log_reg.fit(X_train, y_train)

# SKLearn doc: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Methods we can use in Logistic

# predict - Predict class labels for samples in X.
# Computed once on the training split; y_pred is reused by the evaluation cells.
y_pred = log_reg.predict(X_train)

# predict_proba - Probability estimates (training split; reused for log loss)
pred_proba = log_reg.predict_proba(X_train)

# coef_ - Coefficient of the features in the decision function
# (last expression of the cell, so it is shown as the cell output)
log_reg.coef_

# score- Returns the mean accuracy on the given test data and labels - below

## 9. Evaluating the Model

In [None]:
# Mean accuracy of the fitted model on the training and test splits.
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)

print("The Training Accuracy is: ", train_accuracy)
print("The Testing Accuracy is: ", test_accuracy)

# Per-class precision / recall / F1 for the training predictions (y_pred).
train_report = classification_report(y_train, y_pred)
print(train_report)



In [None]:
# Confusion Matrix function

def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap with a 0-1 colour range.

    Parameters
    ----------
    cm : matrix passed straight to ``sns.heatmap``; the colour scale is
        fixed to [0, 1], so a row-normalised matrix is expected.
    classes : optional tick labels used for both axes. When provided, the
        cells are annotated and the "YlGnBu" colour map is used; when
        omitted, a plain heatmap with seaborn defaults is drawn.
    title : text placed above the plot.

    Returns
    -------
    None. The figure is drawn through the global ``sns`` / ``plt`` state.
    """
    if classes is None:
        sns.heatmap(cm, vmin=0., vmax=1.)
    else:
        labelled_options = dict(
            cmap="YlGnBu",
            xticklabels=classes,
            yticklabels=classes,
            vmin=0.,
            vmax=1.,
            annot=True,
            annot_kws={'size': 25},
        )
        sns.heatmap(cm, **labelled_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Confusion matrix of the training predictions, normalised so each row
# (one true class) sums to 1 before plotting.

cm = confusion_matrix(y_train, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

plot_confusion_matrix(cm_norm, title='Confusion matrix', classes=log_reg.classes_)

In [None]:
# Class labels known to the fitted classifier (the same labels used as heatmap ticks).
log_reg.classes_

In [None]:
# Raw (un-normalised) confusion-matrix counts.
cm

In [None]:
# Column totals of the confusion matrix (the "Predicted label" axis of the heatmap).
cm.sum(axis=0)

In [None]:
# Diagonal of the confusion matrix: counts where true and predicted class coincide.
np.diag(cm)

In [None]:
# Calculating False Positives (FP), False Negatives (FN), True Positives (TP) & True Negatives (TN)
# All four are per-class vectors derived from the confusion matrix `cm`
# (rows = true class, columns = predicted class).

TP = np.diag(cm)
FP = cm.sum(axis=0) - TP   # predicted as the class, but actually another class
FN = cm.sum(axis=1) - TP   # actually the class, but predicted as another class
TN = cm.sum() - (FP + FN + TP)


# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)
print("The True Positive Rate is:", TPR)

# Precision or positive predictive value
PPV = TP / (TP + FP)
print("The Precision is:", PPV)

# False positive rate or False alarm rate
FPR = FP / (FP + TN)
print("The False positive rate is:", FPR)


# False negative rate or Miss Rate
FNR = FN / (FN + TP)
print("The False Negative Rate is: ", FNR)



## Total averages (unweighted mean over classes).
# .mean() divides by the actual number of classes instead of a hard-coded 2.
print("")
print("The average TPR is:", TPR.mean())
print("The average Precision is:", PPV.mean())
print("The average False positive rate is:", FPR.mean())
print("The average False Negative Rate is:", FNR.mean())


In [None]:
# Log loss of the fitted model: training split first (probabilities computed
# earlier as pred_proba), then the test split.
train_log_loss = log_loss(y_train, pred_proba)
print("The Log Loss on Training is: ", train_log_loss)

pred_proba_t = log_reg.predict_proba(X_test)
test_log_loss = log_loss(y_test, pred_proba_t)
print("The Log Loss on Testing Dataset is: ", test_log_loss)


In [None]:
# 20 values from 1e-5 to 1e5, evenly spaced on a log scale (geometric progression).
np.geomspace(1e-5, 1e5, num=20)

In [None]:
# Candidate ranges for the C values, plotted against their position in the
# grid to contrast the two spacings.

# evenly spaced in log space
plt.plot(np.geomspace(1e-5, 1e5, num=20))
# evenly spaced in linear space, for comparison
plt.plot(np.linspace(1e-5, 1e5, num=20))
# np.logspace(np.log10(1e-5), np.log10(1e5), num=20) would match the geomspace grid


In [None]:
# Sweep the regularisation parameter C over a log-spaced grid, recording the
# accuracy (CA) and log loss each fitted model achieves on the test split.

C_List = np.geomspace(1e-5, 1e5, num=20)
CA = []
Logarithmic_Loss = []

for c in C_List:
    log_reg2 = LogisticRegression(C=c, solver='lbfgs', random_state=10)
    log_reg2.fit(X_train, y_train)

    # Evaluate first, then record and report.
    score = log_reg2.score(X_test, y_test)
    pred_proba_t = log_reg2.predict_proba(X_test)
    log_loss2 = log_loss(y_test, pred_proba_t)

    CA.append(score)
    Logarithmic_Loss.append(log_loss2)

    print("The CA of C parameter {} is {}:".format(c, score))
    print("The Logg Loss of C parameter {} is {}:".format(c, log_loss2))
    print("")

In [None]:
# putting the outcomes in a Table

# One entry per C value. The arrays take whatever length the sweep produced,
# so this cell keeps working if the size of C_List changes.
CA2 = np.asarray(CA)
Logarithmic_Loss2 = np.asarray(Logarithmic_Loss)

# df: one row per C value with its accuracy and log loss
df_outcomes = pd.DataFrame(
    {"C_List": C_List, "CA2": CA2, "Logarithmic_Loss2": Logarithmic_Loss2},
    columns=["C_List", "CA2", "Logarithmic_Loss2"],
)

# Ordering the data (sort_values): lowest log loss first; shown as the cell output
df_outcomes.sort_values("Logarithmic_Loss2", ascending = True).reset_index()

In [None]:
# Another way of doing the above
# Scikit-learn offers a LogisticRegressionCV module which implements Logistic Regression 
# with builtin cross-validation to find out the optimal C parameter

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold

# 3-fold, shuffled, seeded splitter used for the internal cross-validation.
kf = KFold(n_splits=3, random_state=0, shuffle=True)

# Logistic Reg CV
# cv=kf makes the estimator use the splitter defined above; without it the
# splitter is ignored and the library's default folds are used instead.
Log_reg3 = LogisticRegressionCV(random_state=15, Cs = C_List, solver ='lbfgs', cv=kf)
Log_reg3.fit(X_train, y_train)
print("The CA is:", Log_reg3.score(X_test, y_test))
pred_proba_t = Log_reg3.predict_proba(X_test)
log_loss3 = log_loss(y_test, pred_proba_t)
print("The Logistic Loss is: ", log_loss3)

print("The optimal C parameter is: ", Log_reg3.C_)



# Doc: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html

In [None]:
# Maybe we have a different metric we want to track

# Looping over the parameters: same C grid as the earlier sweep, additionally
# plotting a row-normalised confusion matrix of the training predictions for
# every C. NOTE: this loop reassigns y_pred, cm and cm_norm.

C_List = np.geomspace(1e-5, 1e5, num=20)
CA = []
Logarithmic_Loss = []

for c in C_List:
    log_reg2 = LogisticRegression(random_state=10, solver = 'lbfgs', C=c)
    log_reg2.fit(X_train, y_train)
    score = log_reg2.score(X_test, y_test)
    CA.append(score)
    print("The CA of C parameter {} is {}:".format(c, score))
    pred_proba_t = log_reg2.predict_proba(X_test)
    log_loss2 = log_loss(y_test, pred_proba_t)
    Logarithmic_Loss.append(log_loss2)
    print("The Logg Loss of C parameter {} is {}:".format(c, log_loss2))
    print("")

    y_pred = log_reg2.predict(X_train)
    cm = confusion_matrix(y_train, y_pred)
    cm_norm = cm / cm.sum(axis=1).reshape(-1,1)
    # Tick labels come from the model being plotted (log_reg2), not from the
    # separately trained log_reg.
    plot_confusion_matrix(cm_norm, classes = log_reg2.classes_, title='Confusion matrix')
    plt.show()

In [None]:
# Baseline: a classifier that always predicts the most frequent training
# class, scored on the test split for comparison with the real models.

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

pred_proba_t = dummy_clf.predict_proba(X_test)
score = dummy_clf.score(X_test, y_test)
log_loss2 = log_loss(y_test, pred_proba_t)

print("Testing Acc:", score)
print("Log Loss:", log_loss2)


# Doc: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

## 11. Final Model with Selected Parameters

In [None]:
# Final Model
# C=784.759970 matches one of the grid points of C_List explored in the sweep.

log_reg3 = LogisticRegression(random_state=10, solver = 'lbfgs', C=784.759970)
log_reg3.fit(X_train, y_train)

# The final model is scored on the validation split (X_valid / y_valid),
# so the printed label says "Validation" rather than "Testing".
score = log_reg3.score(X_valid, y_valid)

pred_proba_t = log_reg3.predict_proba(X_valid)
log_loss2 = log_loss(y_valid, pred_proba_t)

print("Validation Acc:", score)
print("Log Loss:", log_loss2)