## 引入套件

In [1]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn import metrics

## 載入資料

In [2]:
# Load in the train datasets
train = pd.read_csv('data/train.csv', encoding = "utf-8", dtype = {'type': np.int32})
test = pd.read_csv('data/test.csv', encoding = "utf-8")
submission = pd.read_csv('data/submissionOld.csv', encoding = "utf-8", dtype = {'type': np.int32})

In [3]:
test.head()

Unnamed: 0,id,花萼長度,花萼寬度,花瓣長度,花瓣寬度
0,1,5.1,3.5,1.4,0.2
1,2,4.9,3.0,1.4,0.2
2,3,4.7,3.2,1.3,0.2
3,4,4.6,3.1,1.5,0.2
4,5,5.0,3.6,1.4,0.2


In [4]:
train.shape

(123, 7)

In [5]:
submission.head()

Unnamed: 0,id,type
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


### 訓練資料有空值，先處理

In [6]:
#直接把 NaN drop (如果筆數很少,不影響建模的時候)
train_d_na = train.dropna().reset_index(drop=True)
train_d_na.shape # 印出檢查

(121, 7)

## 切分資料來訓練

In [7]:
X = train_d_na.iloc[:,1:5]
y = train_d_na.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=40, shuffle=True)
# 將已標好標籤的資料切割成train和test，以方便測驗自己的準確度，也可幫助避免 overfitting 的問題

### 標準化

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_train_std

array([[-0.05638233, -0.91440255,  0.07088322, -0.02968594],
       [ 0.71163645, -0.91440255,  0.88019065,  0.88901584],
       [-0.56839485,  0.85424449, -1.31650095, -1.07963083],
       [ 0.83963958,  0.34891676,  0.7645753 ,  1.02025895],
       [ 1.09564584,  0.0962529 ,  0.35992159,  0.23280028],
       [-0.95240424,  1.86489993, -1.31650095, -1.21087394],
       [-0.44039172,  1.10690835, -1.43211629, -1.34211705],
       [ 1.22364897,  0.34891676,  1.22703669,  1.41398828],
       [-1.2084105 ,  0.0962529 , -1.31650095, -1.47336017],
       [-0.82440111,  0.85424449, -1.37430862, -1.34211705],
       [-1.33641363,  0.0962529 , -1.25869327, -1.34211705],
       [ 2.37567714, -0.15641096,  1.34265204,  1.41398828],
       [ 0.71163645,  0.34891676,  0.88019065,  1.41398828],
       [ 0.32762706,  0.34891676,  0.7645753 ,  0.75777273],
       [ 2.37567714, -1.16706641,  1.80511342,  1.41398828],
       [ 1.73566149,  0.34891676,  1.28484436,  0.75777273],
       [-0.95240424,  1.

## 建立模型

In [9]:
model = LinearSVC(random_state=0) # 選用 LinearSVC 做 model
model.fit(X_train_std, y_train) # training data 套用到 model 學習

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [10]:
print(metrics.classification_report(y_test, model.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, model.predict(X_test_std)))

             precision    recall  f1-score   support

          1       1.00      1.00      1.00        12
          2       0.93      1.00      0.97        14
          3       1.00      0.91      0.95        11

avg / total       0.97      0.97      0.97        37

[[12  0  0]
 [ 0 14  0]
 [ 0  1 10]]


In [11]:
# 使用先前 training set 的 scale 將 testing dataset 做縮放
test_std = sc.transform(test.iloc[:,1:5])
# 預測結果
model_predict = model.predict(test_std)
model_predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3], dtype=int32)

## 產生 Submission file

In [12]:
# Generate Submission File 
SVCSubmission = pd.DataFrame({ 'id': submission.id, 'type': model_predict })
SVCSubmission.to_csv("data/submission.csv", index=False)

In [13]:
# 再讀入確認結果
submission = pd.read_csv('data/submission.csv', encoding = "utf-8", dtype = {'type': np.int32})
submission

Unnamed: 0,id,type
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,1
8,9,1
9,10,1


## 我的結果

![My result](week1Result.png)