In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer

### 1. Download the dataset(s) for your project. If a train set and a test set are not already available, randomly split the dataset into a train and a test set using stratified sampling so that 80% of the samples go to train set and 20% to test set.

In [2]:
data = pd.read_csv("wine.csv")
y = data['type']
X = data.loc[:, data.columns != 'type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### 2. Randomly split your train set into a validation and a new train set (called train set 2) such that the validation set contains 1/5 of the samples in original train set and the train set 2 contains the remaining. Use stratified sampling to assign features to train set 2 and validation set. This should ensure that your validation set contains samples from both classes (i.e. ciliary and non-ciliary with equal proportions)

In [3]:
X_train_two, X_validation, y_train_two, y_validation = train_test_split(X_train, y_train, 
                                                                        test_size=0.20, random_state=42)

### 3. Normalize features in your train set 2 and validation set using min-max scaling to interval [0,1]. For this purpose you can first normalize features in your train set 2 and use the same scaling coefficients to normalize validation set. Save the normalized versions as separate files. Repeat normalizing your original train set and use the same normalization coefficients to normalize the two test sets.

In [4]:
scaler = MinMaxScaler()

scaler.fit(X_train_two)
normalized_x_train_two = scaler.transform(X_train_two)
normalized_x_validation = scaler.transform(X_validation)

np.savetxt("normalized_x_train_two.csv", normalized_x_train_two, delimiter=",")
np.savetxt("normalized_x_validation.csv", normalized_x_validation, delimiter=",")

scaler.fit(normalized_x_train_two)
normalized_x_train = scaler.transform(X_train)
normalized_x_test = scaler.transform(X_test)
normalized_x_validation_with_orig = scaler.transform(X_validation)

  return self.partial_fit(X, y)


### 4. Perform a 10-fold cross-validation experiment for the random forest classifier on normalized and unnormalized versions of train set 2. You can set the number of trees to 100. Do you get better accuracy when you perform data normalization?

In [5]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_two, y_train_two)

unnormalized_accuracy = clf.score(X_validation, y_validation)

clf.fit(normalized_x_train_two, y_train_two)
normalized_accuracy = clf.score(normalized_x_validation, y_validation)

if unnormalized_accuracy > normalized_accuracy or unnormalized_accuracy == normalized_accuracy:
    print("No, I did not. Unnormalized Accuracy: {}, Normalized Accuracy: {}"
          .format(unnormalized_accuracy,normalized_accuracy))
else:
    print("Yes, I did. Unnormalized Accuracy: {}, Normalized Accuracy: {}"
          .format(unnormalized_accuracy,normalized_accuracy))

No, I did not. Unnormalized Accuracy: 0.931034482759, Normalized Accuracy: 0.931034482759


### 5. Perform a 10-fold cross-validation experiment on train set 2 that corresponds to the best performing normalization strategy (i.e. normalized or unnormalized) for the following classifiers: 

Logistic regression

k-nearest neighbor (with k=1)

Naïve Bayes

Decision tree

Random forest (number of trees=100)

SVM (RBF kernel C=1.0 gamma=0.125)

RBF network (number of clusters = 3)

Adaboost (number of iterations=10)

You can use default values for other hyper-parameters of the classifiers
Report the following accuracy measures for each of these classifiers: overall
accuracy, F-measure, sensitivity, specificity, precision, area under the ROC curve,
area under the precision recall curve, MCC scores. These will be cross-validation
accuracies.

In [35]:
models = []

models.append(("Logistic Regression",LogisticRegression()))
models.append(("K-Nearest Neighbour",KNeighborsClassifier(n_neighbors=1)))
models.append(("Naive Bayes",GaussianNB()))
models.append(("Decision Tree",DecisionTreeClassifier()))
models.append(("Random Forest",RandomForestClassifier(n_estimators=100)))
models.append(("Support Vector Machine-rbf",SVC(kernel="rbf", C=1,gamma=0.125)))
#models.append(("RBF Network:", RBFNetwork()))
models.append(("AdaBoostClassifier",AdaBoostClassifier()))

metrics = []

metrics.append(("Accuracy Score", accuracy_score, ""))
metrics.append(("F-Measure", f1_score, "weighted"))
metrics.append(("Sensitivity", recall_score, "weighted"))
#metrics.append(("Specificity", recall_score, "weighted"))
metrics.append(("Precision", precision_score, "weighted"))
#metrics.append(("Area Under ROC Curve", roc_auc_score, "weighted"))
#metrics.append(("Area Under Precision Recall Curve", precision_recall_curve, "weighted"))
metrics.append(("MCC", matthews_corrcoef, ""))

def dump_results(X, y):
    
    for metric_name, metric, avg in metrics:
        names = []
        results = []
        
        for name,model in models:
            kfold = KFold(n_splits=10, random_state=42)
            if metric_name == "Accuracy Score" or metric_name == "MCC":
                scorer = make_scorer(metric)
            else:
                scorer = make_scorer(metric, average=avg)
            
            cv_result = cross_val_score(model,X,y.values.ravel(), cv = kfold,scoring = scorer)
            names.append(name)
            results.append(cv_result)

        for i in range(len(names)):
            print("{} {}: {}".format(names[i], metric_name, results[i].mean()))
        
        print("\n")

print("Unnormalized Results:\n")
dump_results(X_train_two, y_train_two)

print("Normalized Results:\n")
dump_results(normalized_x_train_two, y_train_two)

Unnormalized Results:

Logistic Regression Accuracy Score: 0.955303030303
K-Nearest Neighbour Accuracy Score: 0.671212121212
Naive Bayes Accuracy Score: 0.94696969697
Decision Tree Accuracy Score: 0.910606060606
Random Forest Accuracy Score: 0.991666666667
Support Vector Machine-rbf Accuracy Score: 0.418181818182
AdaBoostClassifier Accuracy Score: 0.884090909091


Logistic Regression F-Measure: 0.955193325193
K-Nearest Neighbour F-Measure: 0.676984454939
Naive Bayes F-Measure: 0.946775329048
Decision Tree F-Measure: 0.897723017723
Random Forest F-Measure: 0.991798941799
Support Vector Machine-rbf F-Measure: 0.281352830176
AdaBoostClassifier F-Measure: 0.87499775477


Logistic Regression Sensitivity: 0.955303030303
K-Nearest Neighbour Sensitivity: 0.671212121212
Naive Bayes Sensitivity: 0.94696969697
Decision Tree Sensitivity: 0.901515151515
Random Forest Sensitivity: 0.982575757576
Support Vector Machine-rbf Sensitivity: 0.418181818182
AdaBoostClassifier Sensitivity: 0.884090909091


L

### Use three feature selection methods to select feature subsets on train set 2 and compute accuracy measures in step 5 for all the classifiers. Repeat for normalized version of train set 2. Do you get improvement in accuracy when you perform feature selection or is it better to use all of the features? Which feature selection strategy gives the best accuracy?