In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
%matplotlib inline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
import time

  from collections import Sequence


In [2]:
cars = pd.read_csv(r'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data')
cars.columns = ['Buying_price', 'Maintenance_price','Doors', 'Persons', 'Luggage_Boot_Size', 'Safety', 'Car_acceptability']

In [3]:
cars.head()

Unnamed: 0,Buying_price,Maintenance_price,Doors,Persons,Luggage_Boot_Size,Safety,Car_acceptability
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [4]:
cars.shape

(1727, 7)

In [5]:
np.unique(cars['Car_acceptability'], return_counts = True)

(array(['acc', 'good', 'unacc', 'vgood'], dtype=object),
 array([ 384,   69, 1209,   65]))

In [6]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
Buying_price         1727 non-null object
Maintenance_price    1727 non-null object
Doors                1727 non-null object
Persons              1727 non-null object
Luggage_Boot_Size    1727 non-null object
Safety               1727 non-null object
Car_acceptability    1727 non-null object
dtypes: object(7)
memory usage: 94.5+ KB


In [7]:
price =  pd.get_dummies(cars['Buying_price'], prefix = 'Buying_price')
maintenance = pd.get_dummies(cars['Maintenance_price'], prefix = 'Maintenance_price')
luggage = pd.get_dummies(cars['Luggage_Boot_Size'], prefix = 'luggage')
safety = pd.get_dummies(cars['Safety'], prefix = 'safety')

In [8]:
X = pd.concat([price, maintenance, luggage, safety], axis=1)

In [9]:
X.head()

Unnamed: 0,Buying_price_high,Buying_price_low,Buying_price_med,Buying_price_vhigh,Maintenance_price_high,Maintenance_price_low,Maintenance_price_med,Maintenance_price_vhigh,luggage_big,luggage_med,luggage_small,safety_high,safety_low,safety_med
0,0,0,0,1,0,0,0,1,0,0,1,0,0,1
1,0,0,0,1,0,0,0,1,0,0,1,1,0,0
2,0,0,0,1,0,0,0,1,0,1,0,0,1,0
3,0,0,0,1,0,0,0,1,0,1,0,0,0,1
4,0,0,0,1,0,0,0,1,0,1,0,1,0,0


In [10]:
y = cars['Car_acceptability']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Handling Class Imbalance

In [12]:
import imblearn
from imblearn.over_sampling import RandomOverSampler

In [13]:
X_train_ros, y_train_ros = RandomOverSampler().fit_sample(X_train, y_train)

# Multi-Layer Perceptron Classifier

In [14]:
start_time = time.clock()

mlp = MLPClassifier(hidden_layer_sizes=(1000,), activation ='logistic', alpha = 0.001)

print(cross_val_score(mlp, X_train_ros, y_train_ros, cv=5))

print('{} seconds'.format(time.clock() - start_time))

  """Entry point for launching an IPython kernel.


[0.85051546 0.85362694 0.87564767 0.87823834 0.86398964]
65.80560399999999 seconds


  import sys


In [15]:
start_time = time.clock()

mlp_one = MLPClassifier(hidden_layer_sizes=(200,200, 200), activation ='logistic', alpha = 0.001)

print(cross_val_score(mlp_one, X_train_ros, y_train_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

  """Entry point for launching an IPython kernel.


[0.8621134  0.88082902 0.89507772 0.89896373 0.8873057 ]
81.29633400000002 seconds


  


In [16]:
start_time = time.clock()

mlp_two = MLPClassifier(hidden_layer_sizes=(500,250, 250), activation ='logistic', alpha = 0.001)
print(cross_val_score(mlp_two, X_train_ros, y_train_ros, cv=5))

print('{} seconds'.format(time.clock() - start_time))

  """Entry point for launching an IPython kernel.


[0.86340206 0.87953368 0.8873057  0.89766839 0.88341969]
134.297624 seconds


  


In [17]:
start_time = time.clock()

mlp_three = MLPClassifier(hidden_layer_sizes=(200,200, 200, 200), activation ='logistic', alpha = 0.001)
print(cross_val_score(mlp_three, X_train_ros, y_train_ros, cv=5))

print('{} seconds'.format(time.clock() - start_time))

  """Entry point for launching an IPython kernel.


[0.84536082 0.87176166 0.88341969 0.89119171 0.88341969]
62.869438 seconds


  


# Test Set Validation

In [18]:
start_time = time.clock()
mlp_three.fit(X_train_ros, y_train_ros)
print(mlp_three.score(X_test, y_test))
print('{} seconds'.format(time.clock() - start_time))

  """Entry point for launching an IPython kernel.


0.791907514450867
26.375969999999995 seconds


  after removing the cwd from sys.path.


# Random Forest

In [19]:
scores = []
parameters = []
est_number = [100, 500,700] 

for value in est_number:
    rfc = RandomForestClassifier(n_jobs = -1, n_estimators = value, class_weight = 'balanced')
    score = np.mean(cross_val_score(rfc, X_train, y_train, n_jobs=-1))
    scores.append(score)
    parameters.append(value)

In [20]:
df = pd.DataFrame(scores)
df['params'] = parameters
df.columns=['scores', 'params']

df.sort_values(by = 'scores', ascending=False).reset_index(drop=True)

Unnamed: 0,scores,params
0,0.790735,500
1,0.790008,100
2,0.789282,700


In [21]:
scores = []
parameters = []

depth = [8, 20, 50] 

for value in depth:
    rfc = RandomForestClassifier(
          n_jobs = -1, 
          class_weight = 'balanced',
          n_estimators = 100, 
          max_depth = value)
    
    score = np.mean(cross_val_score(
                        rfc, 
                        X_train, 
                        y_train,  
                        n_jobs=-1))
    
    scores.append(score)
    parameters.append(value)

In [22]:
df = pd.DataFrame(scores)
df['params'] = parameters
df.columns=['scores', 'params']

df.sort_values(by = 'scores', ascending=False).reset_index(drop=True)

Unnamed: 0,scores,params
0,0.793627,8
1,0.790008,20
2,0.790008,50


# Test Set Validation

In [23]:
start_time = time.clock()
rfc = RandomForestClassifier(
          n_jobs = -1, 
          class_weight = 'balanced',
          n_estimators = 100, 
          max_depth = 8)
rfc.fit(X_train, y_train)
print(rfc.score(X_test, y_test))
print('{} seconds'.format(time.clock() - start_time))

  """Entry point for launching an IPython kernel.


0.8034682080924855
0.2343710000000101 seconds


  if __name__ == '__main__':
