In [1]:

#export
import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer


### Data Import and Cleansing Setup

In [4]:
#export
class Data():
    """Helpers that load a labelled CSV file and split it into train/test sets."""

    def dataAllocation(self,path):
        """Read the CSV at `path` and separate the features from the label.

        args: string path for .csv file
        return: pandas dataframe (every column except the last),
                pandas series (the last column)
        """
        frame = pd.read_csv(path)
        label_position = -1  # the label is taken to be the final column
        return frame.iloc[:, :label_position], frame.iloc[:, label_position]

    def trainSets(self,x_data,y_data):
        """Split the data 70% / 30% into training and test sets.

        The split is shuffled and seeded with random_state 614.

        args: pandas dataframe, pandas series
        return: x_train, x_test, y_train, y_test (in that order, exactly as
                produced by sklearn's train_test_split)
        """
        split_options = {"test_size": 0.3, "random_state": 614, "shuffle": True}
        x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, **split_options)
        return x_train, x_test, y_train, y_test


In [5]:
# Relative path of the CSV file that is passed to Data.dataAllocation.
data = 'pima-indians-diabetes.csv'

In [6]:
# Helper object that exposes the CSV loading and train/test splitting methods.
dataset = Data()

In [7]:
# Load the CSV: x_data holds every column except the last, y_data holds the last column.
x_data,y_data = dataset.dataAllocation(data)

In [8]:
# Shuffled 70% / 30% train/test split, seeded with random_state 614 inside Data.trainSets.
x_train, x_test, y_train, y_test = dataset.trainSets(x_data,y_data)

In [9]:
class LinearRegressionModel():
    """Linear regression used as a binary classifier by thresholding its output at 0.5."""

    def linearClassifier(self,x_train, x_test, y_train):
        """Fit a LinearRegression model and predict on both splits.

        args: pandas dataframe, pandas dataframe, pandas series
        return: numpy array, numpy array (the raw, un-thresholded regression
                outputs for the training set and the test set, in that order)
        """
        regModel = LinearRegression()
        # Train the model using the training set
        regModel.fit(x_train, y_train)
        # Make predictions using the training set
        y_predict_train = regModel.predict(x_train)
        # Make predictions using the test set
        y_predict_test = regModel.predict(x_test)
        return y_predict_train, y_predict_test

    @staticmethod
    def _thresholdedAccuracy(y_true, y_score):
        """Accuracy of `y_score` against `y_true` after rounding at 0.5.

        Scores greater than or equal to 0.5 become 1 and all others become 0.
        The labels are written to a NEW array: the caller's scores are left
        untouched, so the raw regression outputs remain available afterwards
        and the call can be repeated safely.
        """
        y_label = np.where(np.asarray(y_score) >= 0.5, 1.0, 0.0)
        return accuracy_score(y_true, y_label, normalize = True)

    def lgTrainAccuracy(self,y_train,y_predict_train):
        """Return the accuracy on the training set using accuracy_score.

        Predictions >= 0.5 count as class 1 and predictions < 0.5 as class 0.
        `y_predict_train` is not modified.

        args: pandas series, numpy array (any array-like of scores is accepted)
        return: float
        """
        train_accuracy = self._thresholdedAccuracy(y_train, y_predict_train)
        return train_accuracy

    def lgTestAccuracy(self,y_test,y_predict_test):
        """Return the accuracy on the test set using accuracy_score.

        Predictions >= 0.5 count as class 1 and predictions < 0.5 as class 0.
        `y_predict_test` is not modified.

        args: pandas series, numpy array (any array-like of scores is accepted)
        return: float
        """
        test_accuracy = self._thresholdedAccuracy(y_test, y_predict_test)
        return test_accuracy

In [13]:
# Fit the linear-regression baseline and report its accuracy on the training and test splits.
linear = LinearRegressionModel()
y_predict_train, y_predict_test = linear.linearClassifier(x_train,x_test, y_train)
print("Linear Regression Train Accuracy: ", linear.lgTrainAccuracy(y_train,y_predict_train))
print("Linear Regression Test Accuracy: ", linear.lgTestAccuracy(y_test,y_predict_test))

Linear Regression Train Accuracy:  0.7839851024208566
Linear Regression Test Accuracy:  0.7316017316017316


### Random Forest Classifier

In [15]:
class RFClassifier():
    """Random-forest workflow: fitting, accuracy, feature importance and grid search."""

    def randomForestClassifier(self,x_train,x_test, y_train):
        """Create a RandomForestClassifier (random_state 614), train it and predict.

        args: pandas dataframe, pandas dataframe, pandas series
        return: RandomForestClassifier object, numpy array, numpy array
                (the fitted model, training-set predictions, test-set predictions)
        """
        rf_clf = RandomForestClassifier(random_state=614)
        rf_clf.fit(x_train, y_train)
        y_predict_train = rf_clf.predict(x_train)
        y_predict_test = rf_clf.predict(x_test)
        return rf_clf,y_predict_train, y_predict_test

    def rfTrainAccuracy(self,y_train,y_predict_train):
        """Return the accuracy on the training set using accuracy_score.

        args: pandas series, numpy array
        return: float
        """
        train_accuracy = accuracy_score(y_train,y_predict_train,normalize = True)
        return train_accuracy

    def rfTestAccuracy(self,y_test,y_predict_test):
        """Return the accuracy on the test set using accuracy_score.

        args: pandas series, numpy array
        return: float
        """
        test_accuracy = accuracy_score(y_test,y_predict_test,normalize = True)
        return test_accuracy

    # Feature Importance

    def rfFeatureImportance(self,rf_clf):
        """Return the feature importances evaluated by the fitted forest.

        args: RandomForestClassifier object
        return: float array (the model's feature_importances_ attribute)
        """
        feature_importance = rf_clf.feature_importances_
        return feature_importance

    def sortedRFFeatureImportanceIndicies(self,rf_clf,ascending=False):
        """Return the feature numbers [0 to ...] ordered by importance.

        By default the MOST important feature comes first (descending
        importance). This is the order this method has always returned.
        Pass ascending=True to get the least important feature first.

        args: RandomForestClassifier object, bool (optional)
        return: int array
        """
        feature_importance = rf_clf.feature_importances_
        # argsort yields ascending order; the descending result is its reverse.
        sorted_indices = np.argsort(feature_importance)
        if not ascending:
            sorted_indices = sorted_indices[::-1]
        return sorted_indices

    # Hyper-parameter Tuning

    def hyperParameterTuning(self,rf_clf,x_train,y_train):
        """Tune the hyper-parameters 'n_estimators' and 'max_depth' by grid search.

        The search covers n_estimators in [4, 16, 256] and max_depth in
        [2, 8, 16], with GridSearchCV's default settings for everything else.

        args: RandomForestClassifier object, pandas dataframe, pandas series
        return: GridSearchCV object (already fitted on the given data)
        """
        grid = {
            'n_estimators': [4, 16, 256],
            'max_depth': [2, 8, 16]
        }
        gscv_rfc = GridSearchCV(rf_clf, grid)
        gscv_rfc.fit(x_train, y_train)
        return gscv_rfc

    def bestParams(self,gscv_rfc):
        """Return the best parameters found by the search (.best_params_).

        args: GridSearchCV object
        return: parameter dict
        """
        best_params = gscv_rfc.best_params_
        return best_params

    def bestScore(self,gscv_rfc):
        """Return the best score found by the search (.best_score_).

        args: GridSearchCV object
        return: float
        """
        best_score = gscv_rfc.best_score_
        return best_score

In [16]:
# Fit the random forest and keep the fitted model plus its predictions on both splits.
rf = RFClassifier()
rf_clf,y_predict_train, y_predict_test = rf.randomForestClassifier(x_train,x_test, y_train)
print("randomForestClassifier Function Executed")

randomForestClassifier Function Executed


In [17]:
# Report the random forest's accuracy on the training and test splits.
print("Random Forest Train Accuracy: ",rf.rfTrainAccuracy(y_train,y_predict_train))
print("Random Forest Test Accuracy: ",rf.rfTestAccuracy(y_test,y_predict_test))

Random Forest Train Accuracy:  1.0
Random Forest Test Accuracy:  0.7316017316017316


In [18]:
# Feature indices ordered from most to least important (reversed argsort of the importances).
print("Random Forest Sorted Feature Importance: ",rf.sortedRFFeatureImportanceIndicies(rf_clf))

Random Forest Sorted Feature Importance:  [1 5 7 6 2 4 0 3]


In [19]:
# Grid-search n_estimators in [4, 16, 256] and max_depth in [2, 8, 16] on the training split.
gscv_rfc = rf.hyperParameterTuning(rf_clf,x_train,y_train)
print("HyperParameterTuning Function Executed")

HyperParameterTuning Function Executed


In [20]:
# Report the best parameter combination found by the grid search and its score.
print("Random Forest Best Parameters: ",rf.bestParams(gscv_rfc))
print("Random Forest Best Score: ",rf.bestScore(gscv_rfc))

Random Forest Best Parameters:  {'max_depth': 8, 'n_estimators': 256}
Random Forest Best Score:  0.7858255451713395


### Support Vector Machine

In [23]:
class SupportVectorMachine():
    """Support-vector classification workflow: scaling, fitting, grid search and scoring."""

    # Pre-process

    def dataPreProcess(self,x_train,x_test):
        """Standardize both splits using statistics fitted on the training split only.

        Standardizing first keeps the later grid search from taking much longer.

        args: pandas dataframe, pandas dataframe
        return: scaled training data, scaled test data (in the form returned by
                StandardScaler.transform)
        """
        standardizer = StandardScaler()
        standardizer.fit(x_train)
        return standardizer.transform(x_train), standardizer.transform(x_test)

    # Classification

    def SVCClassifier(self,scaled_x_train,scaled_x_test, y_train):
        """Train an SVC with gamma = 'auto' and predict on both splits.

        args: scaled training data, scaled test data, pandas series
        return: numpy array, numpy array (training predictions, test predictions)
        """
        classifier = SVC(gamma = 'auto')
        classifier.fit(scaled_x_train, y_train)
        train_labels = classifier.predict(scaled_x_train)
        test_labels = classifier.predict(scaled_x_test)
        return train_labels, test_labels

    def SVCTrainAccuracy(self,y_train,y_predict_train):
        """Return the accuracy on the training set using accuracy_score.

        args: pandas series, numpy array
        return: float
        """
        return accuracy_score(y_train, y_predict_train)

    def SVCTestAccuracy(self,y_test,y_predict_test):
        """Return the accuracy on the test set using accuracy_score.

        args: pandas series, numpy array
        return: float
        """
        return accuracy_score(y_test, y_predict_test)

    # Hyper-parameter Tuning

    def SVMBestScore(self, scaled_x_train, y_train):
        """Grid-search 'C' and 'kernel' (linear and rbf) for an SVC with gamma = 'auto'.

        The search runs with n_jobs = -1 and return_train_score = True.

        args: scaled training data, pandas series
        return: GridSearchCV object (fitted), float (its best_score_)
        """
        candidate_grid = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1.0]}
        searcher = GridSearchCV(
            SVC(gamma = 'auto'),
            candidate_grid,
            n_jobs = -1,
            return_train_score = True,
        )
        fitted_search = searcher.fit(scaled_x_train, y_train.values)
        return fitted_search, searcher.best_score_

    def SVCClassifierParam(self,svm_cv,scaled_x_train,scaled_x_test,y_train):
        """Fit the grid search on the given training data and predict on both splits.

        args: GridSearchCV object, scaled training data, scaled test data, pandas series
        return: training predictions, test predictions
        """
        # The search object is fitted again here on the data passed in.
        tuned_model = svm_cv.fit(scaled_x_train, y_train)
        train_labels = tuned_model.predict(scaled_x_train)
        test_labels = tuned_model.predict(scaled_x_test)
        return train_labels, test_labels

    def svcTrainAccuracy(self,y_train,y_predict_train):
        """Return the accuracy (on the training set) using accuracy_score.

        args: pandas series, numpy array
        return: float
        """
        return accuracy_score(y_train, y_predict_train)

    def svcTestAccuracy(self,y_test,y_predict_test):
        """Return the accuracy (on the test set) using accuracy_score.

        args: pandas series, numpy array
        return: float
        """
        return accuracy_score(y_test, y_predict_test)

    # Cross Validation Results

    def SVMRankTestScore(self,svm_cv):
        """Return the 'rank_test_score' entry of the search's cv_results_ dictionary.

        args: GridSearchCV object
        return: int array (one rank per hyper-parameter combination)
        """
        cv_table = svm_cv.cv_results_
        return cv_table['rank_test_score']

    def SVMMeanTestScore(self,svm_cv):
        """Return the 'mean_test_score' entry of the search's cv_results_ dictionary.

        args: GridSearchCV object
        return: float array (one mean score per hyper-parameter combination)
        """
        cv_table = svm_cv.cv_results_
        return cv_table['mean_test_score']
