# Iris 꽃받침 / 꽃잎 너비 예측 경진대회 (DACON)
붓꽃의 종류, 꽃받침, 꽃잎의 길이를 이용해 꽃받침의 너비와 꽃잎의 너비를 예측해보세요.


## 1. [Base line] 정규방정식을 이용한 너비예측


### (1) feature 준비

In [6]:
import pandas as pd
import numpy as np

In [7]:
train = pd.read_csv('iris_train.csv')
train

Unnamed: 0,id,species,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
0,0,setosa,4.4,1.4,2.9,0.2
1,1,versicolor,6.4,4.5,3.2,1.5
2,2,virginica,6.2,4.8,2.8,1.8
3,3,virginica,7.2,6.1,3.6,2.5
4,4,setosa,4.9,1.4,3.0,0.2
...,...,...,...,...,...,...
70,70,versicolor,6.5,4.6,2.8,1.5
71,71,versicolor,5.6,3.6,2.9,1.3
72,72,versicolor,6.2,4.5,2.2,1.5
73,73,versicolor,4.9,3.3,2.4,1.0


In [8]:
X = train[['species','sepal length (cm)','petal length (cm)']]
X.head()


Unnamed: 0,species,sepal length (cm),petal length (cm)
0,setosa,4.4,1.4
1,versicolor,6.4,4.5
2,virginica,6.2,4.8
3,virginica,7.2,6.1
4,setosa,4.9,1.4


In [9]:
y_petal = train['petal width (cm)']
y_sepal = train['sepal width (cm)']

In [10]:
def encode_species2int(species):
    if species == 'setosa':
        return 1
    if species == 'virginica':
        return 2
    if species == 'versicolor':
        return 3

In [11]:
species = train['species'] # 데이터 프레임 내부의 species를 선택(시리즈)
print('species의 데이터 타입: ', type(species)) # 데이터 프레임의 컬럼 한열은 시리즈로 선언되어 있는 것을 확인하실 수 있습니다.
print('\n')
encoded_species = species.apply(encode_species2int) # apply method 사용
print('=============species 샘플=============')
print(encoded_species[:5])

species의 데이터 타입:  <class 'pandas.core.series.Series'>


0    1
1    3
2    2
3    2
4    1
Name: species, dtype: int64


In [12]:
X['species'] = encoded_species
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['species'] = encoded_species


Unnamed: 0,species,sepal length (cm),petal length (cm)
0,1,4.4,1.4
1,3,6.4,4.5
2,2,6.2,4.8
3,2,7.2,6.1
4,1,4.9,1.4
...,...,...,...
70,3,6.5,4.6
71,3,5.6,3.6
72,3,6.2,4.5
73,3,4.9,3.3


In [13]:
X.iloc[:,0]

0     1
1     3
2     2
3     2
4     1
     ..
70    3
71    3
72    3
73    3
74    3
Name: species, Length: 75, dtype: int64

In [14]:
ones = np.ones_like(X.iloc[:,0])
ones = ones.reshape(75,-1) # 첫번째 차원은 75 두번째 차원은 알아서 지정되도록 -1로 설정
X = np.concatenate((X,ones),axis=1) 

X[:10] 

array([[1. , 4.4, 1.4, 1. ],
       [3. , 6.4, 4.5, 1. ],
       [2. , 6.2, 4.8, 1. ],
       [2. , 7.2, 6.1, 1. ],
       [1. , 4.9, 1.4, 1. ],
       [2. , 6.5, 5.8, 1. ],
       [1. , 4.3, 1.1, 1. ],
       [3. , 6.7, 5. , 1. ],
       [3. , 6.8, 4.8, 1. ],
       [3. , 6.6, 4.4, 1. ]])

###  (2) 직접 행렬 계산을 통해 회귀모수 구하기

$$ W = (X^T X)^{-1} X^Ty $$

In [15]:
transpose_doted_X = X.T.dot(X) # X의 전치행렬 X.T 와 X 행렬곱
inversed = np.linalg.inv(transpose_doted_X) # X.T dot X 의 역행렬 계산
doted_inv_t = inversed.dot(X.T) # 역행렬과 전치행렬 행렬곱
weight_petal = doted_inv_t.dot(y_petal) # 맞춰야하는 꽃잎 너비와 행렬곱 -> weight계산
weight_petal

array([-0.06746834, -0.21313084,  0.52330681,  0.59200385])

In [16]:
weight_sepal = doted_inv_t.dot(y_sepal)
weight_sepal

array([-0.2158848 ,  0.44342083, -0.21595546,  1.67326006])

In [17]:
# 꽃잎 너비 MAE 
prediction_petal = X.dot(weight_petal)
error_petal = sum(abs(prediction_petal - y_petal)) / len(prediction_petal)
error_petal

0.14795850337252675

In [18]:
# 꽃받침 너비 MAE
prediction_sepal = X.dot(weight_sepal)
error_sepal = sum(abs(prediction_sepal - y_sepal)) / len(prediction_sepal)
error_sepal

0.22038467760709277

### (3) 직접 계산하지 않고 사이킷런 선형회귀 모델을 통해 예측하기

In [19]:
from sklearn.linear_model import LinearRegression

model_petal = LinearRegression() # 꽃잎 너비를 예측하는 모델을 선언
model_petal.fit(X[:, :3], y_petal) # X에 추가했던 ones 제거 (사이킷런의 선형회귀모델에는 절편항이 포함 되어 있기 때문)

model_sepal = LinearRegression() # 꽃받침의 너비를 예측하는모델을 선언\
model_sepal.fit(X[:, :3], y_sepal) # X에 추가했던 ones 제거

LinearRegression()

In [20]:
pre_sepal = model_sepal.predict(X[:, :3])
sk_sepal_error = sum(abs(pre_sepal-y_sepal))/len(pre_sepal)
sk_sepal_error

0.2203846776070929

In [21]:
pre_petal = model_petal.predict(X[:, :3])
scikit_petal_error = sum(abs(pre_petal - y_petal)) / len(pre_petal)
scikit_petal_error

0.14795850337252867

### (4) 제출법

In [22]:
submission = pd.read_csv('sample_submission.csv') # submission 파일 불러오기
test = pd.read_csv('iris_test.csv') #test file 불러오기

In [24]:
test['species'] = test['species'].apply(encode_species2int)

X = test[['species', 'sepal length (cm)', 'petal length (cm)']]

In [25]:
pred_petal = model_petal.predict(X) # 꽃잎길이 예측
pred_sepal = model_sepal.predict(X) # 꽃받침 길이 예측

In [26]:
submission['sepal width (cm)'] = pred_sepal
submission['petal width (cm)'] = pred_petal

submission.to_csv('second_submission.csv', index=False)