In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB , BernoulliNB , MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

## Step 1: Load training and test sets

In [18]:
df_train = pd.read_csv (r'C:\Users\e135634\OneDrive - Blue Cross Blue Shield of Michigan\Desktop\Test Folder\mnist_train.csv')
df_test = pd.read_csv (r'C:\Users\e135634\OneDrive - Blue Cross Blue Shield of Michigan\Desktop\Test Folder\mnist_test.csv')

In [19]:
print("Training Set Shape:" , df_train.shape)
print("Test Set Shape:" , df_test.shape)

Training Set Shape: (59999, 785)
Test Set Shape: (9999, 785)


### Converting all values to integers in both datasets in order to run the analysis below

In [22]:
df_train = df_train.astype('int')
df_test = df_test.astype('int')

## Step 2: Use SVD to reduce the number of dimensions of the training data set so that it explains just above 90% of the total variance

### Scaling the feature data before reduction

In [23]:
X_Training = df_train.iloc[:,1:]
y_train = df_train.iloc[:, 0]
X_Testing = df_test.iloc[:,1:]
y_test = df_test.iloc[:, 0]

In [24]:
# testing different scaling methods to see which one gives me better result

scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range = (0,1))
X_train = scaler.fit_transform(X_Training)
X_test = scaler.fit_transform(X_Testing)

### Here, I will determine what the optimal n_components value will be to reduce the matrix by for Truncated SVD

In [53]:
X_sparse = csr_matrix(X_train)

In [54]:
tsvd = TruncatedSVD(n_components=X_sparse.shape[1]-1)
X_tsvd = tsvd.fit(X_train)

tsvd_var_ratios = tsvd.explained_variance_ratio_

def select_n_components(var_ratio, goal_var: float) -> int:

    total_variance = 0.0
    
    n_components = 0
    
    for explained_variance in var_ratio:
        
        total_variance += explained_variance
        
        n_components += 1
        
        if total_variance >= goal_var:
            break
            
    return n_components

In [55]:
opt_comp = select_n_components(tsvd_var_ratios, 0.90)
print(opt_comp)

236


### based on determination of optimal component parameter of 236, I will now reduce the training & test matrix using Truncated SVD method 

In [48]:
tsvd = TruncatedSVD(n_components = opt_comp)
tsvd.fit(X_train)

X_trainSVD = tsvd.transform(X_train)
X_trainSVD = X_trainSVD.astype('int')

print("X-Train SVD Shape:" , X_trainSVD.shape)
print("Y-Train Shape:" , y_train.shape)

X-Train SVD Shape: (59999, 236)
Y-Train Shape: (59999,)


In [49]:
tsvd.fit(X_test)

X_testSVD = tsvd.transform(X_test)
X_testSVD = X_testSVD.astype('int')

print("X-Test SVD Shape:" , X_testSVD.shape)
print("Y-Test Shape:" , y_test.shape)

X-Test SVD Shape: (9999, 236)
Y-Test Shape: (9999,)


## Step 3: Train generative classifiers (Naive Bayes and KNN) and discriminative classifier (multinomial logistic regression) on both the dimension reduced data set using SVD and the original data set (without dimension reduction) (60 Points)

### Logistic Regression

To fine-tune my hyper-parameters for logistic regression analysis, I will be using the pre-built package GridSearchCV. Based on the documentation in the Sklearn website, I will be looping through a combination of all of the hyper-parameters listed below to find which set of parameters gives me the best accuracy score. For penalty, I will focus only on l2 and none; for the solver, I will be excluding liblinear

***Disclaimer:*** *I ran this on another laptop of mine and imported the best parameters combination from there. I strongly recommend not running the next two chunks, since it will take a long time to complete.*

In [None]:
# first I want to try and fine tune some of my hyperparameters in my lr function to get optimal accuracy score

parameters_lr = {
    'penalty' : ['none' , 'l2'],
    'C'       : [100 , 10 , 1.0 , 0.1 , 0.01],
    'solver'  : ['newton-cg' , 'lbfgs' , 'sag' , 'saga']
}

cv = StratifiedKFold(n_splits = 10 , n_repeats = 3 , random_state = 1)

In [12]:
lr = LogisticRegression()
clf = GridSearchCV(lr,                   
                   param_grid = parameters_lr,  
                   scoring = 'accuracy',        
                   cv = cv)  

clf.fit(X_trainSVD , y_train)

Traceback (most recent call last):
  File "C:\Users\e135634\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\e135634\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\e135634\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\e135634\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\e135634\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='accuracy')

In [13]:
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :",clf.best_score_)

Tuned Hyperparameters : {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy : 0.9158155275879313


Based on the hyper-parameters found above, the best hyper-parameters I can use in my model are:

- penalty = l2
- C = 100
- solver = saga
- max_iter = 1000

I will now plug those hyper-parameters into the model below for my analysis

In [39]:
lr = LogisticRegression(C = 100, penalty = 'l2' , solver = 'saga' , multi_class = 'auto' , n_jobs = -1, max_iter = 1000, verbose = 2)
lr.fit(X_trainSVD , y_train)
predict_lr = lr.predict(X_testSVD)

print("accuracy:" , accuracy_score(y_test, predict_lr))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


max_iter reached after 651 seconds
accuracy: 0.46224622462246223


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 10.9min finished


Now, I will do the same thing on my non-truncated datasets to see what accuracy I get by passing the same parameters I used above. In the interest of time, I will use the same "best parameters" above and apply them below

In [40]:
lr = LogisticRegression(C = 100, penalty = 'l2' , solver = 'saga' , multi_class = 'auto' , n_jobs = -1, max_iter = 1000, verbose = 2)
lr.fit(X_train , y_train)
predict_lr = lr.predict(X_test)

print("accuracy:" , accuracy_score(y_test, predict_lr))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


max_iter reached after 2049 seconds
accuracy: 0.9251925192519251


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 34.1min finished


### Naive Bayes

For Naive Bayes, since there are fewer hyper-parameters that I can fine tune, Im going to try **two** different formulas:

- Gaussian Naive Bayes
- Bernoulli Naive Bayes


In [44]:
# first, I will run the models on the truncated data to see which model performs the best

gnb = GaussianNB()
gnb.fit(X_trainSVD , y_train)
predict_gnb = gnb.predict(X_testSVD)

print("accuracy for Gaussian:" , accuracy_score(y_test, predict_gnb))

bnb = BernoulliNB()
bnb.fit(X_trainSVD , y_train)
predict_bnb = bnb.predict(X_testSVD)

print("accuracy for Bernoulli:" , accuracy_score(y_test, predict_bnb))

accuracy for Gaussian: 0.33423342334233425
accuracy for Bernoulli: 0.27522752275227524


Now, I will do the same thing on my non-truncated datasets to see what accuracy I get by passing the same models I used above. 

In [45]:
gnb = GaussianNB()
gnb.fit(X_train , y_train)
predict_gnb = gnb.predict(X_test)

print("accuracy for Gaussian:" , accuracy_score(y_test, predict_gnb))

bnb = BernoulliNB()
bnb.fit(X_train , y_train)
predict_bnb = bnb.predict(X_test)

print("accuracy for Bernoulli:" , accuracy_score(y_test, predict_bnb))

accuracy for Gaussian: 0.1076107610761076
accuracy for Bernoulli: 0.8372837283728373


### K-Nearest Neighbors

Finally, I will look at using a lazy classifier, k-nearest neighbors to compare the accuracy of the truncated data vs. the regular data. As above, I will be using the GridSearchCV package to try and fine-tune my hyper-parameters in order to obtain the most optimal accuracy score that I can for both truncated and non-truncated data. 

In [34]:
parameters_knn = {
    'n_neighbors' : [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29],
    'weights'       : ['uniform' , 'distance'],
    'metric'  : ['euclidean' , 'manhattan' , 'minkowski']
}

cv = 10

In [35]:
knn = KNeighborsClassifier()
knneighbors = GridSearchCV(knn,                   
                   param_grid = parameters_knn,  
                   scoring = 'accuracy',        
                   cv = cv)  

knneighbors.fit(X_trainSVD , y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski'],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
                                         23, 25, 27, 29],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [36]:
print("Tuned Hyperparameters :", knneighbors.best_params_)
print("Accuracy :",knneighbors.best_score_)

Tuned Hyperparameters : {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Accuracy : 0.953466022114797


Based on the hyper-parameters found above, the best hyper-parameters I can use in my model are:

- n_neighbors = 5
- weights = distance
- metric = euclidean

I will now plug those hyper-parameters into the model below for my analysis

In [37]:
knn = KNeighborsClassifier(n_neighbors = 5 , metric = 'euclidean' , weights = 'distance')
knn.fit(X_trainSVD , y_train)
predict_knn = knn.predict(X_testSVD)

print("accuracy:" , accuracy_score(y_test, predict_knn))

accuracy: 0.49314931493149317


Now, I will do the same thing on my non-truncated datasets to see what accuracy I get by passing the same parameters I used above. In the interest of time, I will use the same "best parameters" above and apply them below

In [38]:
knn = KNeighborsClassifier(n_neighbors = 5 , metric = 'euclidean' , weights = 'distance')
knn.fit(X_train , y_train)
predict_knn = knn.predict(X_test)

print("accuracy:" , accuracy_score(y_test, predict_knn))

accuracy: 0.9438943894389439


## Conclusion

For some reason, when I ran the truncated data through the various models, it was producing a lower accuracy score than the non-truncated data. Based on everything I researched, I believe I did implement the TruncatedSVD according to SKlearn documentaion. Even when I ran the hyper-parameter tuning through GridSearchCV, I still had similar results. Obviously, there must be something that's missing in my truncation that isn't getting translated to my results. 

So, just looking at the regular/ untruncated data, I was surprised that I was able to get a higher accuracy score on knn model vs. Logistic Regression. This could be due to LR model having more hyper-parameters that could be fine-tuned a little better to get a slightly higher accuracy score. I was not surprised that Naive Bayes method was not the best; this may be due to the fact that all the probabilities of the classifier are independent from one another

**Table of Accuracy Scores** 

|      Model        | Trunc Data |Regular Data|
|:------------------|:----------:|:----------:|
|Logistic Regression|    46.2%   |    92.5%   |
|Naive Bayes        |    33.4%   |   *83.7%*  |
|KNN                |    49.3%   |    94.4%   |

*the accuracy score obtained from using a Bernoulli NB model*