In [1]:
%matplotlib inline

import sklearn

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

import csv
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, make_scorer, mean_squared_error

  from numpy.core.umath_tests import inner1d


In [4]:
# When True, the 'Make' and 'Model' columns are one-hot encoded into the
# feature matrix along with the other categorical columns.
includeMakeAndModel = True

# Number of trees in forest
nEstimators = 500

# Positional CSV columns used as features (column 12 is not used) and the
# positional column holding the target.
_FEATURE_COLUMN_POSITIONS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14]
_TARGET_COLUMN_POSITION = 15

# Categorical columns that are always one-hot encoded.
_CATEGORICAL_COLUMNS = [
    'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels',
    'Market Category', 'Vehicle Size', 'Vehicle Style',
]


def GetDataMatrix(path="data.csv", random_state=None, include_make_and_model=None):
    """Load the CSV, shuffle its rows and build the model inputs.

    The file is read once; features, target and the Make/Model reference
    frame are all sliced from the same shuffled table, so their row order
    and index labels always agree.

    Parameters
    ----------
    path : str, default "data.csv"
        CSV file to read (first row is the header).
    random_state : int or None, default None
        Seed for the row shuffle. None keeps the shuffle non-deterministic;
        pass an integer for a reproducible row order.
    include_make_and_model : bool or None, default None
        Whether 'Make' and 'Model' are one-hot encoded as features. None
        falls back to the module-level ``includeMakeAndModel`` flag.

    Returns
    -------
    (X, Y, Xmodelmake) : tuple of DataFrames
        X          -- one-hot encoded features with nulls replaced by 0,
                      preceded by the raw 'MakeRef' and 'ModelRef' columns.
        Y          -- single-column frame holding the target column.
        Xmodelmake -- the un-encoded feature columns including Make/Model.
    """
    if include_make_and_model is None:
        include_make_and_model = includeMakeAndModel

    # Read once, then shuffle the whole table; the original index labels are
    # preserved so rows can still be related back to the file.
    raw = pd.read_csv(path, header=0)
    raw = raw.sample(frac=1, random_state=random_state)

    # Data frame with make and model
    Xmodelmake = raw.iloc[:, _FEATURE_COLUMN_POSITIONS]
    Y = raw.iloc[:, [_TARGET_COLUMN_POSITION]]

    if include_make_and_model:
        features = Xmodelmake
        categorical = ['Make', 'Model'] + _CATEGORICAL_COLUMNS
    else:
        # Excluding make and model (the first two selected columns)
        features = Xmodelmake.iloc[:, 2:]
        categorical = list(_CATEGORICAL_COLUMNS)

    # Turns categorical data into binary values across many columns
    X = pd.get_dummies(features, dummy_na=False, columns=categorical)

    # Fill the null values with zeros. Done before the reference columns are
    # attached so the Make/Model labels are never overwritten with a number.
    X = X.fillna(0)

    # Raw labels kept at the front, used only to relate rows to makes/models.
    X.insert(0, 'ModelRef', Xmodelmake['Model'])
    X.insert(0, 'MakeRef', Xmodelmake['Make'])

    return (X, Y, Xmodelmake)

In [5]:
# Build the feature matrix, the target frame and the Make/Model reference frame
X, Y, Xmodelmake = GetDataMatrix()

# Flatten the single-column target frame into a proper 1-D array
Y_unraveled = np.ravel(Y)

In [10]:
# Preview the first five rows of the encoded feature matrix
X.head(n=5)

Unnamed: 0,MakeRef,ModelRef,Year,Engine HP,Engine Cylinders,Number of Doors,city mpg,Popularity,Make_Acura,Make_Alfa Romeo,...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
9110,Kia,Sephia,2001,125.0,4.0,4.0,22,1720,0,0,...,0,0,0,0,0,0,0,0,1,0
9538,Chevrolet,Silverado 1500,2015,285.0,6.0,2.0,18,1385,0,0,...,0,0,0,0,0,0,0,1,0,0
4162,Cadillac,Escalade,2015,420.0,8.0,4.0,14,1624,0,0,...,0,0,0,0,0,0,0,0,0,0
8236,Dodge,Ramcharger,1993,230.0,8.0,2.0,11,1851,0,0,...,0,0,0,0,0,0,0,0,0,0
9458,Chevrolet,Silverado 1500 Classic,2007,310.0,8.0,4.0,13,1385,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
print('Splitting into training and testing...')
# Hold out 10% of the rows for testing; the split itself is seeded.
X_train, X_test, Y_train, y_test = train_test_split(
    X, Y_unraveled, test_size=0.10, random_state=32
)

# NOTE(review): make_scorer defaults to greater_is_better=True, so this scorer
# returns the raw (positive) MSE. If it is ever passed to a model-selection
# tool such as GridSearchCV, a larger error would be treated as better --
# confirm how it is used before relying on it for selection.
MSE_Scorer = make_scorer(mean_squared_error)

# Model/Make columns are only used later on to relate indices to Model/Makes,
# so they are removed from the matrices the model is fitted and evaluated on.
X_train2 = X_train.drop(['MakeRef', 'ModelRef'], axis=1)
X_test2 = X_test.drop(['MakeRef', 'ModelRef'], axis=1)

# Train using Random Forest. The forest is seeded (same seed as the split) so
# that re-running the cell on the same training rows gives the same model.
print('Training classifier...')
clf = RandomForestRegressor(
    n_estimators=nEstimators, max_features="sqrt", random_state=32
)
# A GradientBoostingClassifier (n_estimators=5) was also tried here, but it
# did not finish running.
clf = clf.fit(X_train2, Y_train)
print("Done training best classifier.")

Splitting into training and testing...
Training classifier...
Done training best classifier.


In [12]:
# Preview the first five rows of the test features (reference columns removed)
X_test2.head(n=5)

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,city mpg,Popularity,Make_Acura,Make_Alfa Romeo,Make_Aston Martin,Make_Audi,...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
11667,2004,185.0,6.0,4.0,15,481,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6265,2011,180.0,4.0,4.0,23,481,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11897,2015,240.0,4.0,2.0,22,3916,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4396,2015,365.0,6.0,4.0,16,5657,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2876,2016,521.0,8.0,2.0,15,520,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
