In [1]:
import xlrd
import matplotlib.pyplot as plt
import numpy as np
import csv
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from datetime import datetime
from math import sqrt
from sklearn.model_selection import cross_validate
from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing


In [2]:
#Turing problem into binary classification problem
CLASSES = {
    'Very Large Number': 1,
    'Large Number': 1,
    'Medium Number': 0,
    'Small Number': 0,
    'Very Small Number': 0}
NUM_FEATURES = 13
NUM_DATAPOINTS = 100000

In [3]:
X = np.zeros((NUM_DATAPOINTS, NUM_FEATURES))
y = np.zeros((NUM_DATAPOINTS, 1))
y_classifier = np.zeros((NUM_DATAPOINTS, 1))
#Reading data in from csv file
with open('sum_data.csv') as csvfile:
    reader = csv.reader(csvfile)
    for i, row in enumerate(reader):         
        if(i != 0):
            row = row[0].split(';')
            X[i-1][0] = row[1]
            X[i-1][1] = row[2]
            X[i-1][2] = row[3]
            X[i-1][3] = row[4]
            X[i-1][4] = row[5]
            X[i-1][5] = row[6]
            X[i-1][6] = row[7]
            X[i-1][7] = row[8]
            X[i-1][8] = row[9]
            X[i-1][9] = row[10]
            y[i-1][0] = row[11]
            y_classifier[i-1][0] = CLASSES[row[12]]
            
        if(i == NUM_DATAPOINTS):
            break
#Normalising data            
X = preprocessing.scale(X)
y = preprocessing.scale(y)

In [4]:
class linear_regression:
    """A class that takes in a numpy matrix X and column vector y 
        Allows you to apply 7030 split and 10 fold cross fold validation
        with metrics returned as a tuple as so:
        (RMSE, MAE)
    """
    def __init__(self, X,y):
        self.X = X
        self.y = y

    def split7030(self):
        """Applies 70 30 split"""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3)
        regr = linear_model.LinearRegression(fit_intercept = False)
        #regr.fit(Pickup_Time_Train, Trip_Duration_Train)
        regr.fit(X_train, y_train)

        #Trip_Duration_Predict = regr.predict(Pickup_Time_Test)
        y_predict = regr.predict(X_test)
        metric1 = sqrt(mean_squared_error(y_test, y_predict))

        sum = 0
        for i, y in enumerate(y_test):
            sum =  sum + abs(y-y_predict[i])

        mean = sum/y_test.shape[0]
        metric2 = mean
        return metric1, metric2[0]
    
    def crossval10(self):
        """Applies 10 fold cross validation"""
        regr = linear_model.LinearRegression(fit_intercept = False)

        cv = cross_validate(regr, self.X, self.y, scoring=make_scorer(self.score_func_RMSE), cv=10)
        regr = linear_model.LinearRegression(fit_intercept = False)
        cv2 = cross_validate(regr, self.X, self.y, scoring=make_scorer(self.score_func_AE), cv=10)
        return np.mean(cv['test_score']), np.mean(cv2['test_score'])
    
    def score_func_RMSE(self,y, y_pred):
        """RMSE scorer function for cross validation"""
        RMSEsum = 0
        for i, yi in enumerate(y):
            RMSEsum = RMSEsum + (yi-y_pred[i])**2
        RMSE = sqrt(RMSEsum/y.shape[0])

        return RMSE
    
    def score_func_AE(self,y, y_pred):
        """MAE scorer function for cross validation"""
        AEsum = 0
        for i, yi in enumerate(y):
            AEsum = AEsum + abs(yi-y_pred[i])
        AE = AEsum/y.shape[0]
        
        return AE

In [5]:
lr = linear_regression(X, y)

In [6]:
result = lr.crossval10()
print('10 fold cross validation')
print("Root mean square error: ", result[0])
print("Mean absolute error: ", result[1])

10 fold cross validation
Root mean square error:  0.11997623993
Mean absolute error:  0.0898248169899


In [7]:
result = lr.split7030()
print('70 30 split')
print("Root mean square error: ", result[0])
print("Mean absolute error: ", result[1])

70 30 split
Root mean square error:  0.12038215366530755
Mean absolute error:  0.0902028925195


In [8]:
class KNN:
    
    """A class that takes in a numpy matrix X and column vector y 
        and implements the k nearest neighbour algorithm with k equal to 7
        Allows you to apply 7030 split and 10 fold cross fold validation
        with metrics returned as a tuple as so:
        (f1 score, accuracy)"""
    def __init__(self, X, y_classifier):
        self.X = X
        self.y = y_classifier
        self.num_neigh = 7

    def split7030(self):
        """Applies 70 30 split testing"""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3)
        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(X_train, y_train)
        y_predict = neigh.predict(X_test)
        metric1 = f1_score(y_test, y_predict)
        metric2 = accuracy_score(y_test, y_predict)

        return metric1, metric2

    def crossval10(self):
        """Applies 10 fold cross validation testing"""
        neigh = KNeighborsClassifier(n_neighbors = self.num_neigh)
        cv = cross_validate(neigh, self.X, self.y, scoring = make_scorer(f1_score), cv=10)
        neigh = KNeighborsClassifier(n_neighbors=self.num_neigh)
        cv2 = cross_validate(neigh, self.X, self.y, scoring = make_scorer(accuracy_score), cv=10)


        return np.mean(cv['test_score']), np.mean(cv2['test_score'])


In [9]:
knn = KNN(X, y_classifier.ravel())



In [10]:
result = knn.split7030()
print("70 30 split")
print("f1 score: ", result[0], "Accuracy score: ", result[1])

70 30 split
f1 score:  0.998413700172 Accuracy score:  0.9969


In [11]:
result = knn.crossval10()
print("10 fold cross validation")
print("f1 score: ", result[0], "Accuracy score: ", result[1])

10 fold cross validation
f1 score:  0.998462511717 Accuracy score:  0.996989979981


In [12]:
class RandomForest:
    """A class that takes in a numpy matrix X and column vector y 
        and implements the random forest algorithm with a max depth of 2
        Allows you to apply 7030 split and 10 fold cross fold validation
        with metrics returned as a tuple as so:
        (f1 score, accuracy)"""
    def __init__(self, X, y_classifier):
        self.X = X
        self.y = y_classifier
        
    def split7030(self):
        """Applies 70 30 split testing"""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3)
        rf = RandomForestClassifier(max_depth=2, random_state=0)
        rf.fit(X_train, y_train)
        y_predict = rf.predict(X_test)
        metric1 = f1_score(y_test, y_predict)

        metric2 = accuracy_score(y_test, y_predict)
        
        return metric1, metric2
        
    def crossval10(self):
        """Applies 10 fold cross validation testing"""
        rf = RandomForestClassifier(max_depth=2, random_state=0)
        cv = cross_validate(rf, self.X, self.y, scoring = make_scorer(f1_score), cv=10)
        rf = RandomForestClassifier(max_depth=2, random_state=0)
        cv2 = cross_validate(rf, self.X, self.y, scoring = make_scorer(accuracy_score), cv=10)
        
        
        return np.mean(cv['test_score']), np.mean(cv2['test_score'])

In [13]:
rf = RandomForest(X, y_classifier.ravel())

In [14]:
result = rf.split7030()
print('70 30 split')
print('f1 score: ', result[0])
print('accuracy: ', result[1])

70 30 split
f1 score:  0.998519728447
accuracy:  0.9971


In [15]:
result = rf.crossval10()
print('10 fold cross validation')
print('f1 score: ', result[0])
print('accuracy: ', result[1])

10 fold cross validation
f1 score:  0.998539499497
accuracy:  0.997139988981
