# Random Forests on 1985 Auto Imports

In [10]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
# upload data file, view to first 5 records
path = (r"C:\Users\Jwpel\Downloads")
filename = "\1985_auto_imports.csv"
df = pd.read_csv(r"C:\Users\Jwpel\Downloads\1985_auto_imports.csv")
df.head()
                            


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,115,0,1,0,1,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111,5000,21,27,13495
1,3,115,0,1,0,1,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111,5000,21,27,16500
2,1,115,0,1,0,1,2,2,0,94.5,...,152,5,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,1,1,0,0,3,1,0,99.8,...,109,5,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,1,1,0,0,3,0,0,99.4,...,136,5,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
# data shape
df.shape

(205, 26)

In [5]:
# info on columns, combination of int and float, no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized_losses    205 non-null int64
make                 205 non-null int64
fuel_type            205 non-null int64
aspiration           205 non-null int64
num_doors            205 non-null int64
body_style           205 non-null int64
drive_wheels         205 non-null int64
engine_location      205 non-null int64
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
engine_type          205 non-null int64
num_cylinders        205 non-null int64
engine_size          205 non-null int64
fuel_system          205 non-null int64
bore                 205 non-null float64
stroke               205 non-null float64
compression_ratio    205 non-null float64
horsepower           205 non-null int64
p

In [7]:
# assign independent variables 
X = df.drop(["symboling"], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
normalized_losses    205 non-null int64
make                 205 non-null int64
fuel_type            205 non-null int64
aspiration           205 non-null int64
num_doors            205 non-null int64
body_style           205 non-null int64
drive_wheels         205 non-null int64
engine_location      205 non-null int64
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
engine_type          205 non-null int64
num_cylinders        205 non-null int64
engine_size          205 non-null int64
fuel_system          205 non-null int64
bore                 205 non-null float64
stroke               205 non-null float64
compression_ratio    205 non-null float64
horsepower           205 non-null int64
peak_rpm             205 non-null int64
c

In [8]:
# assign target variable
y = df["symboling"]

In [20]:
# split the dataset between training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=9)

In [36]:
# create Random Forest Regressor model
regr_rf = RandomForestClassifier(max_depth=None, random_state=2, n_estimators=500)
regr_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [37]:
# assign prediction variable
rf_pred = regr_rf.predict(X_test)
rf_pred

array([ 2,  1,  3, -1,  0, -2,  0,  1, -1,  1,  0,  0,  0,  2,  3,  1,  3,
        2,  1,  0,  2,  1, -1,  1,  0,  0,  1,  1,  0,  1,  1,  3,  0,  1,
        1,  1,  1, -1,  1,  0,  2], dtype=int64)

In [39]:
# import accuracy_score, view accuracy results
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix
accuracy = accuracy_score(y_test, rf_pred)
accuracy

0.8292682926829268

In [31]:
# predictions on test set
y_test.value_counts()

 0    16
 1    10
 2     7
 3     4
-1     3
-2     1
Name: symboling, dtype: int64

In [25]:
# confusion maxtrix created
matrix = confusion_matrix(rf_pred, y_test)
conf_mat = pd.DataFrame(data=matrix, columns=[["Pred:-2", "Pred:-1", "Pred:0", "Pred:1", "Pred:2", "Pred:3"]],
                       index = [["Act:-2", "Act:-1", "Act:0", "Act:1", "Act:2", "Act:3"]])
conf_mat

Unnamed: 0,Pred:-2,Pred:-1,Pred:0,Pred:1,Pred:2,Pred:3
Act:-2,1,0,0,0,0,0
Act:-1,0,3,1,0,0,0
Act:0,0,0,11,0,0,0
Act:1,0,0,3,10,2,0
Act:2,0,0,0,0,5,0
Act:3,0,0,1,0,0,4
