In [233]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
plt.rcParams['font.family']='SimHei' #顯示中文
from sklearn.model_selection import train_test_split
%matplotlib inline

# 讀取與觀察資料

In [234]:
# Load the training data, forcing the four measurement columns
# (sepal length, sepal width, petal length, petal width) to 64-bit floats.
# `np.float` was only an alias for the builtin `float` and was removed in
# NumPy 1.24; `np.float64` yields the same column dtype on every NumPy version.
measurement_dtypes = {
    '花萼長度': np.float64,
    '花萼寬度': np.float64,
    '花瓣長度': np.float64,
    '花瓣寬度': np.float64,
}
train = pd.read_csv('iris/train.csv', encoding="utf-8", dtype=measurement_dtypes)
test = pd.read_csv('iris/test.csv', encoding="utf-8")
submission = pd.read_csv('iris/submission.csv', encoding='utf-8')
# Preview the first ten rows of the training set
train.head(10)


Unnamed: 0,id,花萼長度,花萼寬度,花瓣長度,花瓣寬度,屬種,type
0,1,5.4,3.7,1.5,0.2,Iris-setosa,1
1,2,4.8,3.4,1.6,0.2,Iris-setosa,1
2,3,4.8,3.0,1.4,0.1,Iris-setosa,1
3,4,4.3,3.0,1.1,0.1,Iris-setosa,1
4,5,5.8,4.0,1.2,0.2,Iris-setosa,1
5,6,5.7,4.4,1.5,0.4,Iris-setosa,1
6,7,5.4,3.9,1.3,0.4,Iris-setosa,1
7,8,5.1,3.5,1.4,0.3,Iris-setosa,1
8,9,5.7,3.8,1.7,0.3,Iris-setosa,1
9,10,5.1,3.8,1.5,0.3,Iris-setosa,1


In [235]:
# Preview the first ten rows of the test set
test.head(n=10)

Unnamed: 0,id,花萼長度,花萼寬度,花瓣長度,花瓣寬度
0,1,5.1,3.5,1.4,0.2
1,2,4.9,3.0,1.4,0.2
2,3,4.7,3.2,1.3,0.2
3,4,4.6,3.1,1.5,0.2
4,5,5.0,3.6,1.4,0.2
5,6,5.4,3.9,1.7,0.4
6,7,4.6,3.4,1.4,0.3
7,8,5.0,3.4,1.5,0.2
8,9,4.4,2.9,1.4,0.2
9,10,4.9,3.1,1.5,0.1


In [236]:
# Number of rows and columns. A notebook cell only auto-displays its last
# expression, so the shape is printed explicitly instead of being discarded.
print(train.shape)

# Column data types (displayed as the cell's result)
train.dtypes

id        int64
花萼長度    float64
花萼寬度    float64
花瓣長度    float64
花瓣寬度    float64
屬種       object
type      int64
dtype: object

In [237]:
# Count the missing values in each column. Printed explicitly: a bare
# expression that is not the last statement of a cell is never displayed.
print(train.isnull().sum())

# Show the rows whose sepal width is missing, then those whose sepal length is missing
print(train[train['花萼寬度'].isnull()])
print(train[train['花萼長度'].isnull()])

      id  花萼長度  花萼寬度  花瓣長度  花瓣寬度             屬種  type
121  122   5.2   NaN   5.1   1.8  Iris-new_type     4
      id  花萼長度  花萼寬度  花瓣長度  花瓣寬度             屬種  type
120  121   NaN   3.0   4.9   1.2  Iris-new_type     4


In [238]:
# Remove the columns that are not used by the model: the row id and the
# species name (the later cells use the numeric 'type' column as the target).
# `errors='ignore'` makes the cell safe to re-run; the previous `del train[...]`
# raised KeyError on a second execution because the columns were already gone.
train = train.drop(columns=['id', '屬種'], errors='ignore')

# 相關性分析

In [239]:
# Correlation matrix of the four measurement columns
# (sepal length, sepal width, petal length, petal width)
measurement_cols = ['花萼長度', '花萼寬度', '花瓣長度', '花瓣寬度']
corr = train.loc[:, measurement_cols].corr()
print(corr)

          花萼長度      花萼寬度      花瓣長度      花瓣寬度
花萼長度  1.000000 -0.132300  0.861842  0.813901
花萼寬度 -0.132300  1.000000 -0.437289 -0.368907
花瓣長度  0.861842 -0.437289  1.000000  0.958595
花瓣寬度  0.813901 -0.368907  0.958595  1.000000


In [240]:
# Drop every row that contains a NaN (only a few rows, so the result is not affected)
train.dropna(axis='index', inplace=True)

# Features: the four measurement columns; target: the numeric 'type' label
feature_columns = ['花萼長度', '花萼寬度', '花瓣長度', '花瓣寬度']
X = train[feature_columns]
y = train['type']

# Hold out 30% of the rows for evaluation, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)


# SVM 建模

In [241]:
# SVC was never imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; import it here, next to its only use.
from sklearn.svm import SVC

# Fit a support vector classifier (default hyperparameters) on the training split
model = SVC()
model.fit(X_train, y_train)

# Predict the label of every row of the test set from its four measurement columns
pred = model.predict(test[['花萼長度', '花萼寬度', '花瓣長度', '花瓣寬度']])

# Report the classifier's score on the held-out split
print("SCV Score : ",model.score(X_test,y_test))


SCV Score :  0.972972972972973


In [242]:
from sklearn import metrics

# Per-class precision / recall / F1 of the fitted model on the held-out split
holdout_pred = model.predict(X_test)
report_text = metrics.classification_report(y_test, holdout_pred)
print(report_text)

             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       0.91      1.00      0.95        10
          3       1.00      0.92      0.96        13

avg / total       0.98      0.97      0.97        37



# 將預測資料儲存

In [243]:
# Pair each id from the submission template with its predicted type
# and write the result to a CSV file without the index column
submission_columns = {'id': submission['id'], 'type': pred}
Submission = pd.DataFrame(submission_columns)
Submission.to_csv('iris/MySubmission.csv', index=False)

In [244]:
# Read the saved submission back (parsing 'type' as int32) and display it
saved_dtypes = {'type': np.int32}
Submission = pd.read_csv(
    'iris/MySubmission.csv',
    encoding='utf-8',
    dtype=saved_dtypes,
)
Submission

Unnamed: 0,id,type
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,1
8,9,1
9,10,1


# Kaggle ranking

![title](iris/MySubmission.png)