In [2]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np 

# Iris Dataset

In [3]:
iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [4]:
# view the first 5 rows of the dataframe
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# adds new column with species names
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
# create training and test data
df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75

df.head() 

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,False
4,5.0,3.6,1.4,0.2,setosa,True


In [7]:
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [8]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 110
Number of observations in the test data: 40


In [9]:
# create list of feature column names
features = df.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [10]:
# sinse 'spicies' contains the actual spicies names. We must convert those anmes into digits
y = pd.factorize(train['species'])[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
# create an instance of the random forest classifier
clf = RandomForestClassifier(n_jobs=2, random_state=0)

clf.fit(train[features], y) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
# apply the classifier we trained to test data
clf.predict(test[features]) 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])

In [13]:
# view predicted probabilities of first 10 observations
clf.predict_proba(test[features])[0:10] 

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

## Evaluate Classifier

In [16]:
# Create actual names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test[features])]

In [17]:
# view predicted species for first 5 observations
preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [18]:
# view actual species for first 5 observations
test['species'].head() 

3     setosa
7     setosa
9     setosa
13    setosa
22    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [19]:
# Confusion Matrix
# Create confusion matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,15,0,0
versicolor,0,18,1
virginica,0,0,6


In [20]:
# Feature Importance
list(zip(train[features], clf.feature_importances_)) 

[('sepal length (cm)', 0.09506967951184177),
 ('sepal width (cm)', 0.03195600006367711),
 ('petal length (cm)', 0.43004863397601645),
 ('petal width (cm)', 0.4429256864484646)]

# MTCARS Dataset 

In [22]:
MTCARS_LOCATION = "datasets/mtcars.csv"

In [23]:
mtcars = pd.read_csv(MTCARS_LOCATION) 

In [24]:
# read first 5 rows
mtcars.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Datsun,710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
1,Merc,240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
2,Merc,230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
3,Fiat,128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
4,Honda,Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2


In [25]:
mtcars.tail()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
27,AMC,Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2
28,Camaro,Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4
29,Pontiac,Firebird,19.2,8,400.0,175,3.08,3.845,17.05,0,0,3,2
30,Ford,Pantera,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4
31,Maserati,Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
