In [2]:
import pandas as pd
import numpy as np

file_name = "../datasets/final_clothing_store.csv"

data = pd.read_csv(file_name)

data

Unnamed: 0,TOTAL_VISITS,TOTAL_SPENT,AVRG_SPENT_PER_VISIT,HAS_CREDIT_CARD,PSWEATERS,PKNIT_TOPS,PKNIT_DRES,PBLOUSES,PJACKETS,PCAR_PNTS,...,CLUSTYPE_8,CLUSTYPE_15,CLUSTYPE_11,CLUSTYPE_18,CLUSTYPE_5,CLUSTYPE_23,CLUSTYPE_38,CLUSTYPE_3,CLUSTYPE_12,CLUSTYPE_-1
0,0.008772,0.016324,0.095728,0.0,0.18,0.00,0.00,0.30,0.00,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.026316,0.011417,0.033349,1.0,0.26,0.16,0.00,0.00,0.00,0.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.008772,0.003377,0.019803,0.0,1.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.061404,0.037541,0.054840,1.0,0.38,0.00,0.05,0.06,0.20,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.003840,0.045301,0.0,0.20,0.20,0.00,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21735,0.008772,0.001689,0.009904,0.0,0.00,0.00,0.39,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21736,0.043860,0.012421,0.024107,0.0,0.02,0.00,0.00,0.06,0.22,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21737,0.070175,0.056594,0.073549,0.0,0.29,0.04,0.01,0.14,0.23,0.12,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21738,0.017544,0.011770,0.045926,0.0,0.18,0.00,0.03,0.11,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## KNN Classifier

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

X = data.drop(columns = ['RESP'])
Y = data['RESP'].values

knn = KNeighborsClassifier(weights="distance")

grid_dictionary = {'n_neighbors': np.array([s for s in range(1, 36)])}

cv = GridSearchCV(knn, grid_dictionary, cv=10)

cv.fit(X, Y)

print(cv.best_params_)

{'n_neighbors': 35}


In [22]:
# continue running with n_neighbors > 35, since a maximum hasn't been discovered yet
grid_dictionary = {'n_neighbors': np.array([s for s in range(36, 50)])}

cv = GridSearchCV(knn, grid_dictionary, cv=10)

cv.fit(X, Y)

print(cv.best_params_)

{'n_neighbors': 49}


In [23]:
# and ..... continue running with n_neighbors > 49, since a maximum still hasn't been discovered yet
grid_dictionary = {'n_neighbors': np.array([s for s in range(50, 75)])}

cv = GridSearchCV(knn, grid_dictionary, cv=10)

cv.fit(X, Y)

print(cv.best_params_)

{'n_neighbors': 61}


In [51]:
knn_cv = KNeighborsClassifier(n_neighbors=61, weights="distance")

knn_cv_scores = cross_val_score(knn_cv, X, Y, cv=10)

print(knn_cv_scores)

knn_acc = np.mean(knn_cv_scores)
print('cv_scores mean: {}'.format(knn_acc))

[0.82888684 0.83440662 0.83670653 0.83762649 0.84176633 0.83164673
 0.82704692 0.83026679 0.83578657 0.8325667 ]
cv_scores mean: 0.8336706531738731


## Logistic Regression

In [73]:
from sklearn import linear_model

logregr = linear_model.LogisticRegression(solver='liblinear')
logreg_cv_scores = cross_val_score(logregr, X, Y, cv=10)

print(logreg_cv_scores)

logreg_acc = np.mean(logreg_cv_scores)
print('cv_scores mean: {}'.format(logreg_acc))

[0.84636615 0.8449862  0.85372585 0.84590616 0.85142594 0.83394664
 0.84406624 0.83854646 0.84222631 0.84682613]
cv_scores mean: 0.8448022079116836


## Comparing Classifiers

Let's compute the confidence interval of both classifiers with confidence level: 90%.

In [74]:
import math

# knn classifier
sample_mean = knn_acc

num_correctly_classified = knn_acc * len(data)
num_incorrectly_classified = len(data) - num_correctly_classified
sample_variance = ((num_correctly_classified * ((1 - knn_acc) ** 2)) + (num_incorrectly_classified * ((0 - knn_acc) ** 2)))/(len(data) - 1)

sample_st_dev = math.sqrt(sample_variance)

true_st_dev = sample_st_dev / math.sqrt(len(data))

z_value = 1.64 # corresponding to confidence level 90%, from Z-table

confidence_interval_knn = [sample_mean - (z_value * (true_st_dev)), sample_mean + (z_value * (true_st_dev))]
print(confidence_interval_knn)

[0.829528694469032, 0.8378126118787143]


In [75]:
import math

# logistic regression classifier 
sample_mean = logreg_acc

num_correctly_classified = logreg_acc * len(data)
num_incorrectly_classified = len(data) - num_correctly_classified
sample_variance = ((num_correctly_classified * ((1 - logreg_acc) ** 2)) + (num_incorrectly_classified * ((0 - logreg_acc) ** 2)))/(len(data) - 1)

sample_st_dev = math.sqrt(sample_variance)

true_st_dev = sample_st_dev / math.sqrt(len(data))

z_value = 1.64 # corresponding to confidence level 90%, from Z-table

confidence_interval_logreg = [sample_mean - (z_value * (true_st_dev)), sample_mean + (z_value * (true_st_dev))]
print(confidence_interval_logreg)

[0.8407746262372428, 0.8488297895861244]


Now that we know the confidence intervals of both classifiers with a confidence level of 90%, let's determine whether or not the null hypothesis is true, that is: if there truly is a difference between these two classifiers (ie, the difference in accuracy is not due to random chance).

In [76]:
differences = []
for i in range(10):
    differences.append(knn_cv_scores[i] - logreg_cv_scores[i])
print(differences)

mean = 0
for x in differences:
    mean += x

mean = mean / len(differences)
print(mean)

[-0.017479300827966893, -0.010579576816927339, -0.017019319227230878, -0.008279668813247487, -0.00965961361545531, -0.0022999080036798514, -0.01701931922723099, -0.008279668813247376, -0.0064397424103035394, -0.014259429622815012]
-0.011131554737810467


In [77]:
variance = 0
for x in differences:
    variance += ((mean - x) ** 2) / (10 - 1)

st_dev = math.sqrt(variance)

true_st_dev = st_dev / math.sqrt(10)

t_value = 1.86 # corresponding to a confidence level of 90%, and 8 degrees of freedom, from T-table

average_difference_interval = [-t_value * true_st_dev, t_value * true_st_dev]
print(average_difference_interval)

[-0.0030256883422713618, 0.0030256883422713618]


Our mean is approximately equal to -0.011, which is less than the lower bound of the interval. Therefore, the null hypothesis is false, and our second classifier, logistic regression, is significantly better than the first, knn. 

## Methods that I used to improve our classifiers

To improve my knn classifier, I ran the classifier with 10-fold cross validation with k ranging from 1 to 75 to find the optimal value of k. I discovered that the optimal value of k was equal to 61, and went with that as the final classifier model. Furthermore, I also found that weighing closer data points more heavily than further ones gave me a better accuracy, so I included that in my final model as well. 

To improve my logistic regression classifier, I ran the classifier with different solver functions, ie : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, taken from the sklearn documentation. I discovered that using the 'liblinear' solver gave me the highest accuracy, so I went with that as the final classifier model

## Potential methods of improving our classifiers 

One way in which we can improve our classifiers is to convert our two classifiers (of which are discrete, ie: they only output a class label when fed a data entry) into probabilistic classifiers (ie: they output a class label as well as the probability of the data entry belonging to that class). Then, we can test the probability threshold (ie: the cut-off probability for the data entry to be classified as one class as opposed to another) at different values, and use the value that gives us the greatest accuracy for the classifier. 

This should be fairly simple for our Logistic Regression classifier. Logistic Regression aims to find 𝑝 = 𝑠𝑖𝑔𝑚𝑜𝑖𝑑(𝑎𝑥+𝑏), the probability of x belonging to a certain class, by maximizing the objective function: Likelihood. So what we need to do in Logistic Regression to convert it into a probabilistic classifier is to output p as well as the class label for each datapoint. 

This task of converting discrete classifiers into probabilistic classifiers is more difficult for our knn classifier. The distance function that I've employed in knn is euclidean distance (as well as giving more weight to those points that are closer to the one in question). One possible method we can try is this: When we're evaluating whether a data entry should be classified as yes or no, we calculate the weighted sum of the distances of r datapoints that are classified as yes, call it weight_yes and we calculate the weighted sum of the distances of t datapoints that are classified as no, call it weight_no, where r + t = k. The classifier then classifies the datapoint as the class which corresponds to the sum with less distance, ie: more similarity. When doing this, we can also output the proportions of the sums, so if a data point is classified as yes, we output (along with the class label: yes),
weight_yes / (weight_yes + weight_no). This can then be viewed as the probability of it belonging to class: yes. If all k-points nearest to the data point is class: yes, then weight_no = 0, and the probability would equal 1, ie: it is 100% that the class label is correct. 

## Creating the input dataset for the Lift Chart

In [80]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
logregr.fit(X_train, y_train)


LogisticRegression(solver='liblinear')

In [90]:
arr = logregr.predict(X_test)

logregr.score(X_test, y_test)

0.8494773519163763

In [87]:
data = {'Instance': [s for s in range(1, 7176)], 'Actual Class': y_test, 'Predicted Class': arr}
df = pd.DataFrame(data)

df

Unnamed: 0,Instance,Actual Class,Predicted Class
0,1,0.0,0.0
1,2,0.0,0.0
2,3,0.0,0.0
3,4,0.0,0.0
4,5,0.0,0.0
...,...,...,...
7170,7171,1.0,0.0
7171,7172,0.0,0.0
7172,7173,0.0,0.0
7173,7174,0.0,0.0


In [89]:
df.to_csv(index = False, path_or_buf = '../datasets/lift_data.csv')
