In [114]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [115]:
# Load the training and test sets from the working directory.
data_, test_df = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

# Display the training frame: it mixes numeric and string-valued columns.
data_

Unnamed: 0,id,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
0,0,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,50.0,15.3,220,MALE,8.30515,-25.19017,5550
1,1,Chinstrap penguin (Pygoscelis antarctica),Dream,No,49.5,19.0,200,MALE,9.63074,-24.34684,3800
2,2,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,45.1,14.4,210,FEMALE,8.51951,-27.01854,4400
3,3,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,44.5,14.7,214,FEMALE,8.20106,-26.16524,4850
4,4,Gentoo penguin (Pygoscelis papua),Biscoe,No,49.6,16.0,225,MALE,8.38324,-26.84272,5700
...,...,...,...,...,...,...,...,...,...,...,...
109,109,Adelie Penguin (Pygoscelis adeliae),Torgersen,Yes,36.6,17.8,185,FEMALE,,,3700
110,110,Adelie Penguin (Pygoscelis adeliae),Dream,Yes,39.2,18.6,190,MALE,9.11006,-25.79549,4250
111,111,Adelie Penguin (Pygoscelis adeliae),Dream,Yes,43.2,18.5,192,MALE,8.97025,-26.03679,4100
112,112,Chinstrap penguin (Pygoscelis antarctica),Dream,No,46.9,16.6,192,FEMALE,9.80589,-24.73735,2700


In [116]:
# Count the missing values per column in the training and test sets,
# printed one after the other with a separator line in between.
train_missing = data_.isna().sum()
test_missing = test_df.isna().sum()
print(train_missing, "\n", "___________", "\n", test_missing)


id                     0
Species                0
Island                 0
Clutch Completion      0
Culmen Length (mm)     0
Culmen Depth (mm)      0
Flipper Length (mm)    0
Sex                    3
Delta 15 N (o/oo)      3
Delta 13 C (o/oo)      3
Body Mass (g)          0
dtype: int64 
 ___________ 
 id                     0
Species                0
Island                 0
Clutch Completion      0
Culmen Length (mm)     0
Culmen Depth (mm)      0
Flipper Length (mm)    0
Sex                    6
Delta 15 N (o/oo)      9
Delta 13 C (o/oo)      8
dtype: int64


In [117]:
### Data preprocessing
## One-hot encode the string-valued columns
categorical_cols = ['Species', 'Island', 'Sex', 'Clutch Completion']
sub = pd.get_dummies(data_[categorical_cols])

# Encoded columns first, followed by the positional column slices 4:7 and 8:
# of the raw frame.
numeric_parts = [data_.iloc[:, 4:7], data_.iloc[:, 8:]]
data_first = pd.concat([sub, *numeric_parts], axis="columns")

In [118]:
# Pairwise correlations of the encoded frame. The id column was not carried
# over into data_first because it will not be used as a feature.
data_first.corr(method="pearson")

Unnamed: 0,Species_Adelie Penguin (Pygoscelis adeliae),Species_Chinstrap penguin (Pygoscelis antarctica),Species_Gentoo penguin (Pygoscelis papua),Island_Biscoe,Island_Dream,Island_Torgersen,Sex_FEMALE,Sex_MALE,Clutch Completion_No,Clutch Completion_Yes,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
Species_Adelie Penguin (Pygoscelis adeliae),1.0,-0.397197,-0.639115,-0.420411,0.119228,0.478719,-0.065138,0.031431,-0.07837,0.07837,-0.820508,0.522864,-0.688047,0.172781,-0.08758,-0.507076
Species_Chinstrap penguin (Pygoscelis antarctica),-0.397197,1.0,-0.451985,-0.529999,0.668494,-0.190145,0.082247,-0.05431,0.301768,-0.301768,0.377195,0.378706,-0.253306,0.648444,0.760515,-0.389398
Species_Gentoo penguin (Pygoscelis papua),-0.639115,-0.451985,1.0,0.852803,-0.676123,-0.305956,-0.005614,0.014964,-0.176724,0.176724,0.481414,-0.825594,0.881057,-0.71225,-0.557381,0.819209
Island_Biscoe,-0.420411,-0.529999,0.852803,1.0,-0.792825,-0.358766,0.017555,0.0,-0.228665,0.228665,0.296513,-0.72478,0.739562,-0.644446,-0.58851,0.70805
Island_Dream,0.119228,0.668494,-0.676123,-0.792825,1.0,-0.284438,0.027836,0.013911,0.315199,-0.315199,-0.047199,0.559757,-0.566061,0.703095,0.673803,-0.573564
Island_Torgersen,0.478719,-0.190145,-0.305956,-0.358766,-0.284438,1.0,-0.070247,-0.021307,-0.123056,0.123056,-0.394139,0.282796,-0.296392,-0.068307,-0.114119,-0.235331
Sex_FEMALE,-0.065138,0.082247,-0.005614,0.017555,0.027836,-0.070247,1.0,-0.948714,-0.10236,0.10236,-0.29462,-0.364386,-0.238123,-0.132672,0.102378,-0.405566
Sex_MALE,0.031431,-0.05431,0.014964,0.0,0.013911,-0.021307,-0.948714,1.0,0.120368,-0.120368,0.332699,0.349191,0.252635,0.135789,-0.092598,0.419098
Clutch Completion_No,-0.07837,0.301768,-0.176724,-0.228665,0.315199,-0.123056,-0.10236,0.120368,1.0,-1.0,0.119988,0.210304,-0.105076,0.27789,0.29119,-0.093065
Clutch Completion_Yes,0.07837,-0.301768,0.176724,0.228665,-0.315199,0.123056,0.10236,-0.120368,-1.0,1.0,-0.119988,-0.210304,0.105076,-0.27789,-0.29119,0.093065


In [119]:
### Clutch Completion is dropped because its correlation is close to zero.

### Rebuild the training frame.
## One-hot encode the string-valued columns
sub = pd.get_dummies(data_[['Species', 'Island', 'Sex']])
data = pd.concat([sub, data_.iloc[:, 4:7], data_.iloc[:, 8:]], axis=1)

## Impute the Delta columns
# Missing values are replaced by the column mean. The filled columns are
# assigned back to the frame instead of calling fillna(inplace=True) on a
# selected column: that chained in-place form is deprecated in recent pandas
# and leaves `data` unchanged once Copy-on-Write is enabled.
delta_cols = ['Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']
data[delta_cols] = data[delta_cols].fillna(data[delta_cols].mean())

data

Unnamed: 0,Species_Adelie Penguin (Pygoscelis adeliae),Species_Chinstrap penguin (Pygoscelis antarctica),Species_Gentoo penguin (Pygoscelis papua),Island_Biscoe,Island_Dream,Island_Torgersen,Sex_FEMALE,Sex_MALE,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
0,0,0,1,1,0,0,0,1,50.0,15.3,220,8.305150,-25.190170,5550
1,0,1,0,0,1,0,0,1,49.5,19.0,200,9.630740,-24.346840,3800
2,0,0,1,1,0,0,1,0,45.1,14.4,210,8.519510,-27.018540,4400
3,0,0,1,1,0,0,1,0,44.5,14.7,214,8.201060,-26.165240,4850
4,0,0,1,1,0,0,0,1,49.6,16.0,225,8.383240,-26.842720,5700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,1,0,0,0,0,1,1,0,36.6,17.8,185,8.737634,-25.723051,3700
110,1,0,0,0,1,0,0,1,39.2,18.6,190,9.110060,-25.795490,4250
111,1,0,0,0,1,0,0,1,43.2,18.5,192,8.970250,-26.036790,4100
112,0,1,0,0,1,0,1,0,46.9,16.6,192,9.805890,-24.737350,2700


In [120]:
# Separate the features (every column but the last) from the target (the last column).
n_features = data.shape[1] - 1
X = data.iloc[:, :n_features]
y = data.iloc[:, n_features]

# Hold out 20% of the rows for validation, using a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=44,
)

# 다양한 머신러닝 모델을 활용하여 Score 도출

In [121]:
# Fit every candidate regressor on the training split and print its score()
# on the training and validation splits.
# The tree-based scikit-learn estimators get a fixed random_state so their
# scores are reproducible from run to run; 44 is the seed also used for the
# train/validation split.
candidates = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DT': DecisionTreeRegressor(random_state=44),
    'RF': RandomForestRegressor(random_state=44),
    'GBR': GradientBoostingRegressor(random_state=44),
    'KNR': KNeighborsRegressor(n_neighbors=5),
    'Linear': LinearRegression(),
    'XGB': XGBRegressor(),
    'LGBM': LGBMRegressor(),
}

for name, estimator in candidates.items():
    # fit() returns the fitted estimator; `model` ends up bound to the last one.
    model = estimator.fit(x_train, y_train)
    print(f'{name}: Train 데이터 점수: {model.score(x_train, y_train)}, Val 데이터 점수: {model.score(x_test, y_test)}')

Ridge: Train 데이터 점수: 0.8641017638326064, Val 데이터 점수: 0.8059839531882603
Lasso: Train 데이터 점수: 0.8658686724666305, Val 데이터 점수: 0.8095835720321577
DT: Train 데이터 점수: 1.0, Val 데이터 점수: 0.5346657224934208
RF: Train 데이터 점수: 0.973159273173338, Val 데이터 점수: 0.7473022969023082
GBR: Train 데이터 점수: 0.9924930393666737, Val 데이터 점수: 0.6680128371057553
KNR: Train 데이터 점수: 0.8519574669303234, Val 데이터 점수: 0.7141598866992105
Linear: Train 데이터 점수: 0.8660014872647775, Val 데이터 점수: 0.8166067602935926
XGB: Train 데이터 점수: 0.9999999987317032, Val 데이터 점수: 0.6865729233842979
LGBM: Train 데이터 점수: 0.9007730007239413, Val 데이터 점수: 0.7642398373471202


#### Linear 모델의 Val 데이터 점수(R², 약 0.817)가 다른 모델들에 비해 가장 높게 나타남

# 선정한 Linear 모델의 RMSE 결과 확인

In [122]:
# Refit the selected linear model and compute its RMSE on the validation split.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass the arguments in
# that order. The square root of the MSE gives the RMSE.
print(mean_squared_error(y_test, y_pred) ** 0.5)

293.7459661364062


# Test_data로 제출파일 생성

In [123]:
# Load the sample submission file up front.
submit_df = pd.read_csv("./sample_submission.csv")

# Preprocess the test data the same way as the training data before predicting.
sub = pd.get_dummies(test_df[['Species', 'Island', 'Sex']])
data_t = pd.concat([sub, test_df.iloc[:, 4:7], test_df.iloc[:, 8:]], axis=1)

# Fill missing Delta values with the means of the raw training frame, so the
# test set receives the same imputation values the model was trained with.
# The filled columns are assigned back instead of using fillna(inplace=True)
# on a selected column, which is deprecated in recent pandas and leaves the
# frame unchanged once Copy-on-Write is enabled.
delta_cols = ['Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']
data_t[delta_cols] = data_t[delta_cols].fillna(data_[delta_cols].mean())

# Align the columns (names and order) with the training features. A dummy
# column absent from the test set is added as all zeros; a column the model
# never saw is dropped.
data_t = data_t.reindex(columns=X.columns, fill_value=0)

In [124]:
# Display the preprocessed test features.
data_t

Unnamed: 0,Species_Adelie Penguin (Pygoscelis adeliae),Species_Chinstrap penguin (Pygoscelis antarctica),Species_Gentoo penguin (Pygoscelis papua),Island_Biscoe,Island_Dream,Island_Torgersen,Sex_FEMALE,Sex_MALE,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Delta 15 N (o/oo),Delta 13 C (o/oo)
0,0,1,0,0,1,0,0,1,52.0,20.7,210.0,9.43146,-24.68440
1,0,0,1,1,0,0,0,1,55.9,17.0,228.0,8.31180,-26.35425
2,1,0,0,0,1,0,1,0,38.9,18.8,190.0,8.36936,-26.11199
3,0,1,0,0,1,0,1,0,45.2,16.6,191.0,9.62357,-24.78984
4,1,0,0,1,0,0,1,0,37.9,18.6,172.0,8.38404,-25.19837
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,0,1,0,0,1,0,0,1,49.3,19.9,203.0,9.88809,-24.59513
224,0,0,1,1,0,0,1,0,46.5,14.8,217.0,8.58487,-26.59290
225,0,0,1,1,0,0,1,0,46.5,13.5,210.0,7.99530,-25.32829
226,0,1,0,0,1,0,0,1,50.5,19.6,201.0,9.80590,-24.72940


In [125]:
# Final check of the test columns before predicting: missing values per column.
remaining_missing = data_t.isna().sum()
print(remaining_missing)

# Predict on the preprocessed test features with the fitted model.
test_pred = model.predict(data_t)

Species_Adelie Penguin (Pygoscelis adeliae)          0
Species_Chinstrap penguin (Pygoscelis antarctica)    0
Species_Gentoo penguin (Pygoscelis papua)            0
Island_Biscoe                                        0
Island_Dream                                         0
Island_Torgersen                                     0
Sex_FEMALE                                           0
Sex_MALE                                             0
Culmen Length (mm)                                   0
Culmen Depth (mm)                                    0
Flipper Length (mm)                                  0
Delta 15 N (o/oo)                                    0
Delta 13 C (o/oo)                                    0
dtype: int64


In [126]:
# Submission: store the predictions in the submission frame and save it as a
# CSV file without the index column.
output_path = "./sample_submission_저장.csv"
submit_df['Body Mass (g)'] = test_pred
submit_df.to_csv(output_path, index=False)