In [79]:
#Loading the library with the iris dataset
from sklearn.datasets import load_iris

#Loading scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

#Loading Pandas
import pandas as pd

#Loading numpy
import numpy as np

#Setting random seed
np.random.seed(0)

from sklearn.metrics import accuracy_score

In [80]:
#Creating an object called iris with the iris data
iris = load_iris()

#Creating a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns = iris.feature_names)

#Viewing the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [81]:
#Adding a new column for the species name
df['species']=pd.Categorical.from_codes(iris.target, iris.target_names)

#Viewing the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [82]:
df['is_train'] = np.random.uniform(0, 1, len(df))
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,0.548814
1,4.9,3.0,1.4,0.2,setosa,0.715189
2,4.7,3.2,1.3,0.2,setosa,0.602763
3,4.6,3.1,1.5,0.2,setosa,0.544883
4,5.0,3.6,1.4,0.2,setosa,0.423655


In [83]:
#Creating Test and Train Data
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

#View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [84]:
#Creating dataframes with test rows and training rows
train, test = df[df['is_train'] == True],df[df['is_train'] == False]

#Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the testing data:', len(test))

Number of observations in the training data: 112
Number of observations in the testing data: 38


In [2]:
#Creating a list of the feature column's names
features = df.columns[:4]

#View features
features #與print(features)一樣

NameError: name 'df' is not defined

In [101]:
#Converting each specires name into digits
y = pd.factorize(train['species'])[0]

#Viewing target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [87]:
#Creating a random forest Classifier
clf = RandomForestClassifier(n_jobs=2, random_state=0)

#Training the classifier
clf.fit(train[features],y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [88]:
test[features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
1,4.9,3.0,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
13,4.3,3.0,1.1,0.1
14,5.8,4.0,1.2,0.2


In [89]:
#Applying the trained Classifier to the test
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [90]:
#Viewing the predicted probabilities of the first 1- observations
clf.predict_proba(test[features])[0:10]

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.9, 0.1, 0. ],
       [0.9, 0.1, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])

In [104]:
#mapping names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test[features])]

#View the PREDICTED species for the first five observations
preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [91]:
clf.predict_proba(test[features])[10:20]

array([[0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 1. , 0. ],
       [0. , 0.2, 0.8]])

In [105]:
preds[10:20]

array(['versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'virginica'], dtype='<U10')

In [108]:
clf.predict_proba(test[features])[20:30]
#碰到0.5 0.5就assign第一個0.5

array([[0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0.2, 0.8],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0.4, 0.6],
       [0. , 0.5, 0.5],
       [0. , 0. , 1. ],
       [0. , 0.4, 0.6]])

In [109]:
preds[20:30]

array(['virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'versicolor', 'virginica', 'virginica'],
      dtype='<U10')

In [106]:
#Viewing the ACTUAL species for the first five observations
test['species'].head()

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [107]:
#Creating confusion matrix
pd.crosstab(test['species'],preds, rownames =['Actual Species'], colnames=['Prediction Species'])

Prediction Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,1,18


In [115]:
preds = iris.target_names[clf.predict( [[5.0, 3.6 , 1.4, 2.0], [5.8, 4.0, 1.2, 0.2]] )]
preds

array(['virginica', 'setosa'], dtype='<U10')