# Loading Data and Importing Modules

In [1]:
import pandas as pd
from sklearn import svm
import numpy as np
from matplotlib import pyplot as plt
import time

penguins= pd.read_csv("penguins.csv")
iris = pd.read_csv("iris.csv")
seeds = pd.read_csv("seeds_dataset.csv")

In [2]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [3]:
iris

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
seeds

Unnamed: 0,Area,Perim,Compact,K.Length,K.Width,Assym,G.Length,Class
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


# Clean the data and Splitting the Data

We wrote a function for each data set to clean the data. Then, we split the data sets into a training set and a testing set.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score

seed_train, seed_test = train_test_split(seeds,test_size=.2)
penguin_train, penguin_test = train_test_split(penguins,test_size=.2)
iris_train, iris_test = train_test_split(iris,test_size=.2)

In [6]:
def prep_penguin_data(data_df):
    #creates copy of dataframe and drops NaN values
    df = data_df.copy()
    df = df.dropna()
    le = preprocessing.LabelEncoder()

    df['species'] = le.fit_transform(df['species'])
    df['island'] = le.fit_transform(df['island'])
    df['sex'] = le.fit_transform(df['sex'])
    #splits target data Species(y) from predictor data(X)
    X=df.drop(['species'],axis=1)
    y=df['species']
    return X,y

X_train_penguin, y_train_penguin=prep_penguin_data(penguin_train)
X_test_penguin, y_test_penguin=prep_penguin_data(penguin_test)

In [7]:
def prep_iris_data(data_df):
    #creates copy of dataframe and drops NaN values
    df = data_df.copy()
    df = df.dropna()
    le = preprocessing.LabelEncoder()

    #encodes Species as 0's,1's, and 2's
    df['Species'] = le.fit_transform(df['Species'])
    #splits target data Species(y) from predictor data(X)
    X=df.drop(['Species'],axis=1)
    y=df['Species']
    return X,y

X_train_iris, y_train_iris=prep_iris_data(iris_train)
X_test_iris, y_test_iris=prep_iris_data(iris_test)

In [8]:
def prep_seed_data(data_df):
    #creates copy of dataframe and drops NaN values
    df = data_df.copy()
    df = df.dropna()

    #splits target data Species(y) from predictor data(X)
    X=df.drop(['Class'],axis=1)
    y=df['Class']
    return X,y

X_train_seed, y_train_seed =prep_seed_data(seed_train)
X_test_seed, y_test_seed =prep_seed_data(seed_test)

# Modeling the data using SVM

In [9]:
from sklearn import metrics
start = time.time()
clf = svm.SVC(kernel='rbf')
clf.fit(X_train_penguin, y_train_penguin)
y_pred_penguin = clf.predict(X_test_penguin)
penguin_accuracy = metrics.accuracy_score(y_test_penguin, y_pred_penguin)
end = time.time()
elapsed_penguin = end - start
print(penguin_accuracy, elapsed_penguin)

0.7205882352941176 0.01570606231689453


In [10]:
y_test_penguin, y_pred_penguin

(327    1
 121    0
 212    2
 313    1
 111    0
       ..
 80     0
 340    1
 173    2
 204    2
 333    1
 Name: species, Length: 68, dtype: int64,
 array([0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0,
        2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0,
        2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2,
        0, 0]))

In [11]:
start = time.time()
clf = svm.SVC(kernel='linear')
clf.fit(X_train_iris, y_train_iris)
y_pred_iris = clf.predict(X_test_iris)
iris_accuracy = metrics.accuracy_score(y_test_iris, y_pred_iris)
end = time.time()
elapsed_iris = end - start
print(iris_accuracy,elapsed_iris)

1.0 0.005362033843994141


In [12]:
y_test_iris, y_pred_iris

(67     1
 140    2
 9      0
 66     1
 57     1
 78     1
 137    2
 147    2
 127    2
 139    2
 95     1
 38     0
 53     1
 84     1
 55     1
 94     1
 90     1
 112    2
 16     0
 99     1
 43     0
 97     1
 74     1
 134    2
 141    2
 86     1
 148    2
 41     0
 117    2
 108    2
 Name: Species, dtype: int64,
 array([1, 2, 0, 1, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 2, 0, 1, 0, 1,
        1, 2, 2, 1, 2, 0, 2, 2]))

In [13]:
end = time.time()
clf = svm.SVC(kernel='linear')
clf.fit(X_train_seed, y_train_seed)
y_pred_seed = clf.predict(X_test_seed)
seed_accuracy = metrics.accuracy_score(y_test_seed, y_pred_seed)
end = time.time()
elapsed_seed = end - start
print(seed_accuracy,elapsed_seed)

0.9285714285714286 2.223583936691284


In [14]:
y_test_seed, y_pred_seed

(189    3
 6      1
 186    3
 71     2
 47     1
 37     1
 178    3
 206    3
 55     1
 119    2
 198    3
 150    3
 172    3
 1      1
 173    3
 133    2
 209    3
 171    3
 5      1
 78     2
 136    2
 132    2
 105    2
 164    3
 187    3
 84     2
 111    2
 196    3
 64     1
 176    3
 94     2
 22     1
 60     1
 174    3
 35     1
 141    3
 62     1
 117    2
 82     2
 48     1
 2      1
 54     1
 Name: Class, dtype: int64,
 array([3, 1, 3, 2, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 3, 2, 3, 3, 1, 2, 2, 2,
        2, 3, 3, 2, 2, 3, 1, 3, 2, 1, 3, 3, 1, 3, 3, 2, 2, 1, 1, 1]))

In [15]:
print(penguin_accuracy, iris_accuracy, seed_accuracy)

0.7205882352941176 1.0 0.9285714285714286


In [16]:
print(elapsed_penguin, elapsed_iris, elapsed_seed)

0.01570606231689453 0.005362033843994141 2.223583936691284
