In [16]:
#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.datasets import load_iris

In [8]:
#load data
data = load_iris()
class_labels =list(data.target_names)
column_names = data.feature_names.append("label")

df = pd.DataFrame(columns=data.feature_names)
for i in range(len(data["data"])):
    features = list(data["data"][i]) #the main features
    label = data.target[[i]][0] #the class label, with 0, 1, or 2
    features.append(label)
    
    #inserting a new row into the df
    newSeries = pd.Series(features,index=df.columns)
    df = df.append(newSeries,ignore_index=True)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [12]:
#use feature importance to see which features are good
all_Y = df["label"]
all_X = df.drop("label",axis=1)

#split training and testing to see feature importance
X_train,X_test,Y_train,Y_test = train_test_split(all_X,all_Y,test_size=0.2,random_state=42)

feature_importance_clf = RandomForestClassifier()
feature_importance_clf.fit(X_train,Y_train) #train it on the examples
feature_importance_clf.score(X_test,Y_test) #add scores to it based off of the 'true' test examples

feature_importances = pd.DataFrame(feature_importance_clf.feature_importances_,
                                   index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)

#after seeing this, its clear that petal width and petal length are the most important indicators
columns = ["petal length (cm)","petal width (cm)"]
X = df[columns]
Y = df["label"]
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [18]:
#set up the models and train them

#random forest classifier
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train,Y_train)
rf_predictions = rf_model.predict(X_test)

#SVM
svm_model = svm.SVC(random_state=0)
svm_model.fit(X_train,Y_train)
svm_predictions = svm_model.predict(X_test)

#logistic regression classifier
log_model = LogisticRegression(random_state=0)
log_model.fit(X_train,Y_train)
log_predictions = log_model.predict(X_test)

In [22]:
#confusion matrices
svm_confmat = confusion_matrix(Y_test,svm_predictions)
svm_confmat #matrix is diagonal which means that all things have been predicted correctly


array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]], dtype=int64)

In [19]:
#evaluating the scores now
rf_acc = accuracy_score(Y_test,rf_predictions)
print("rf: ",rf_acc)
svm_acc = accuracy_score(Y_test,svm_predictions)
print("svm: ",svm_acc)
log_acc = accuracy_score(Y_test,log_predictions)
print("log: ",log_acc)

#all models seem to have 100% accuracy... check for cross validation lol

rf:  1.0
svm:  1.0
log:  1.0


In [21]:
#check for cross validation cuz that seems too good to be true
new_rf = RandomForestClassifier(random_state=0)
rf_score = cross_val_score(new_rf,X,Y,cv=10)

new_svm = svm.SVC(random_state=0)
svm_score = cross_val_score(new_svm,X,Y,cv=10)

new_log = LogisticRegression(random_state=0)
log_score = cross_val_score(new_rf,X,Y,cv=10)

print("cross validation")
print("rf: ",np.mean(rf_score))
print("svm: ",np.mean(svm_score))
print("log: ",np.mean(log_score))

#all methods seem to have 97% for cross validation, seems to be good lol

cross validation
rf:  0.9666666666666666
svm:  0.96
log:  0.9666666666666666
