In [1]:
# Core numerical / plotting / data-frame stack.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; the same helpers live in `sklearn.model_selection`. Prefer the
# current module and fall back to the legacy one only on very old installs.
try:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import metrics

# Short alias for the accuracy metric used when scoring predictions.
accuracy = metrics.accuracy_score



In [2]:
# Location of the raw Iris file; it ships without a header row.
iris_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris_columns = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width", "Class"]

# Read the file, then label the five columns by hand.
data = pd.read_csv(iris_url, header=None)
data.columns = iris_columns

# Preview the first rows.
data.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Replace the "Class" column with the integer codes produced by a
# LabelEncoder fitted on that same column (fit and transform in one call).
le = preprocessing.LabelEncoder()
data["Class"] = le.fit_transform(data["Class"])

# Preview the encoded frame.
data.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# Feature matrix: the first four columns, selected by position.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# pandas 1.0; for this positional slice the two select the same columns.
X = data.iloc[:, 0:4]

# Response vector: the "Class" column.
target = data["Class"]

In [5]:
# Hold out 35% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, target_train, target_test = train_test_split(
    X,
    target,
    test_size=0.35,
    random_state=1,
)

In [6]:
# Report the shape of each piece of the train/test split, one line per piece.
split_report = [
    ("n Observations, Features For Training Set: ", X_train),
    ("n Response Vector For Training Set: ", target_train),
    ("n Observations, Features For Testing Set: ", X_test),
    ("n Response Vector For Testing Set: ", target_test),
]
for label, part in split_report:
    print(label, part.shape)

n Observations, Features For Training Set:  (97, 4)
n Response Vector For Training Set:  (97,)
n Observations, Features For Testing Set:  (53, 4)
n Response Vector For Testing Set:  (53,)


### KNN 

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with library-default settings on the
# training split.
knn = KNeighborsClassifier()
knn.fit(X_train, target_train)

# Score the held-out split and report accuracy as a percentage.
target_pred = knn.predict(X_test)
knn_accuracy_pct = round(accuracy(target_test, target_pred) * 100, 2)
print(knn_accuracy_pct, "% Accuracy")

98.11 % Accuracy


In [8]:
# 5-fold cross-validation of a 10-neighbour KNN on the full data set;
# print the mean of the per-fold scores.
knn = KNeighborsClassifier(n_neighbors=10)
results = cross_val_score(
    knn,
    X,
    target,
    cv=5,
)
print(results.mean())

0.98


### Bagging

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

# Base model for the bagged ensemble: 10-nearest-neighbours.
est = KNeighborsClassifier(n_neighbors = 10)

# The base model is passed positionally on purpose: its keyword was
# `base_estimator` up to scikit-learn 1.1, was renamed to `estimator` in 1.2,
# and the old name was removed in 1.4. The first positional slot is the base
# model on every release, so this form works on old and new versions alike.
bag = BaggingClassifier(est,
                        n_estimators = 100,
                        max_samples = 0.35,
                        max_features = 4,
                        random_state = 1,
                        oob_score = True)
bag.fit(X_train, target_train)

# Accuracy on the held-out split, out-of-bag estimate, accuracy on the
# training split (in that order).
print(bag.score(X_test, target_test))
print(bag.oob_score_)
print(bag.score(X_train, target_train))

0.981132075472
0.927835051546
0.938144329897


In [10]:
# Base model for the bagged ensemble: 10-nearest-neighbours.
est = KNeighborsClassifier(n_neighbors = 10)

# The base model is passed positionally because its keyword changed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form is accepted by every release.
bag = BaggingClassifier(est,
                        n_estimators = 100,
                        max_samples = 0.35,
                        max_features = 4,
                        random_state = 1,
                        oob_score = True)

# 5-fold cross-validation on the full data set; print the mean fold score.
results = cross_val_score(bag, X, target, cv=5)
print(results.mean())

0.96


### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest. `random_state` is fixed (matching the seed used for
# the train/test split) so the three scores below are reproducible; without
# it every run draws different bootstrap samples and feature subsets.
# The previously assigned, never-used `est = RandomForestClassifier()` was
# dropped.
rf = RandomForestClassifier(n_estimators = 100, oob_score = True, random_state = 1)
rf.fit(X_train, target_train)

# Accuracy on the held-out split, out-of-bag estimate, accuracy on the
# training split (in that order).
print(rf.score(X_test, target_test))
print(rf.oob_score_)
print(rf.score(X_train, target_train))

0.962264150943
0.927835051546
1.0


In [12]:
# 10-tree random forest, seeded so the cross-validated score is reproducible.
# The previously assigned, never-used `est = RandomForestClassifier()` was
# dropped.
rf = RandomForestClassifier(n_estimators = 10, random_state = 1)

# 5-fold cross-validation on the full data set; print the mean fold score.
results = cross_val_score(rf, X, target, cv=5)
print(results.mean())

0.953333333333


### AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learner boosted by AdaBoost: a random forest, seeded for
# reproducibility. The previously assigned, never-used
# `est = AdaBoostClassifier()` was dropped.
model = RandomForestClassifier(random_state = 1)

# The base learner is passed positionally because its keyword changed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form is accepted by every release.
ada = AdaBoostClassifier(model, n_estimators = 100, random_state = 1)
ada.fit(X_train, target_train)

# Accuracy on the held-out split, then on the training split.
print(ada.score(X_test, target_test))
print(ada.score(X_train, target_train))

0.962264150943
1.0


In [14]:
# Base learner boosted by AdaBoost: a random forest, seeded so the
# cross-validated score is reproducible.
est = RandomForestClassifier(random_state = 1)

# The base learner is passed positionally because its keyword changed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form is accepted by every release.
ada = AdaBoostClassifier(est, n_estimators = 10, random_state = 1)

# 5-fold cross-validation on the full data set; print the mean fold score.
results = cross_val_score(ada, X, target, cv=5)
print(results.mean())

0.96


### Gradient Tree Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# 100-stage gradient-boosted trees, seeded so the scores are reproducible.
# The previously assigned, never-used `est = GradientBoostingClassifier()`
# was dropped.
gbc = GradientBoostingClassifier(n_estimators = 100, random_state = 1)
gbc.fit(X_train, target_train)

# Accuracy on the held-out split, then on the training split.
print(gbc.score(X_test, target_test))
print(gbc.score(X_train, target_train))

0.962264150943
1.0


In [16]:
# 100-stage gradient-boosted trees, seeded so the cross-validated score is
# reproducible from run to run.
gbc = GradientBoostingClassifier(n_estimators = 100, random_state = 1)

# 5-fold cross-validation on the full data set; print the mean fold score.
results = cross_val_score(gbc, X, target, cv=5)
print(results.mean())

0.96


### Voting Classifier

In [17]:
# The pasted doctest prompts (`>>> `) were removed: they only run because
# IPython strips them, and they make the cell invalid Python anywhere else
# (for example when the notebook is exported to a script).
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Four individual classifiers to compare and to combine.
clf1 = LogisticRegression(random_state = 1)
clf2 = RandomForestClassifier(random_state = 1)
clf3 = GaussianNB()
clf4 = KNeighborsClassifier(n_neighbors = 8)

# Hard-voting ensemble over the four classifiers above.
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('knn', clf4)], voting='hard')

# 5-fold cross-validated accuracy (mean and standard deviation) for each
# individual model and for the ensemble.
for clf, label in zip([clf1, clf2, clf3, clf4, eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'KNN', 'Ensemble']):
    scores = cross_val_score(clf, X, target, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f, (+/- %0.2f), [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.96, (+/- 0.04), [Logistic Regression]
Accuracy: 0.97, (+/- 0.02), [Random Forest]
Accuracy: 0.95, (+/- 0.03), [Naive Bayes]
Accuracy: 0.97, (+/- 0.03), [KNN]
Accuracy: 0.97, (+/- 0.02), [Ensemble]


In [18]:
# Use the bagging ensemble as the final model: refit it on the training
# split and predict the held-out split.
estimator = bag
estimator.fit(X_train, target_train)
target_pred = estimator.predict(X_test)

# Report held-out accuracy as a percentage.
final_accuracy_pct = round(accuracy(target_test, target_pred) * 100, 2)
print(final_accuracy_pct, "% Accuracy")

98.11 % Accuracy


In [19]:
# True labels of the held-out split as a one-column frame, re-indexed from 0
# so the rows line up positionally with the predictions.
actual = pd.DataFrame(target_test).reset_index(drop=True)
actual.head()

Unnamed: 0,Class
0,0
1,1
2,1
3,0
4,2


In [20]:
# Predicted labels as a one-column frame named "Predictions".
predictions = pd.DataFrame(target_pred, columns=["Predictions"])
predictions.head()

Unnamed: 0,Predictions
0,0
1,1
2,1
3,0
4,2


In [21]:
# Side-by-side view of true and predicted labels, joined on the row index.
actual.join(predictions)

Unnamed: 0,Class,Predictions
0,0,0
1,1,1
2,1,1
3,0,0
4,2,2
5,1,1
6,2,2
7,0,0
8,0,0
9,2,2
