In [1]:
import numpy as np
import pandas as pd

import sklearn
import torch

In [2]:
print('numpy: ', np.__version__)
print('pandas: ', pd.__version__)
print('scikit-learn: ', sklearn.__version__)
print('pytorch: ', torch.__version__)

numpy:  1.20.3
pandas:  1.3.2
scikit-learn:  0.24.2
pytorch:  1.9.0


In [3]:
train_breast = pd.read_csv('./train_breast.csv')
train_intestine = pd.read_csv('./train_intestine.csv')
train_lung = pd.read_csv('./train_lung.csv')

test_breast = pd.read_csv('./val_breast.csv')
test_intestine = pd.read_csv('./val_intestine.csv')
test_lung = pd.read_csv('./val_lung.csv')

In [4]:
train_breast.head()

Unnamed: 0,No,AGE,inductal carcinoma,infiltrating duct carcinoma,adenocarcinoma,lobular carcinoma,Metaplastic carcinoma,TX,T0,T1,...,PR,AR,BCS,Mestectomy,pan hysterosalpingo oophorectomy,Chemotherapy,Hormone therapy,Radiation Therapy,Death,Survival period
0,0,33,0,1,0,0,1,0,0,1,...,2,9,1,99,0,1,0,1,0,472
1,1,72,0,1,1,1,0,0,0,0,...,1,99,1,0,0,0,1,1,0,835
2,2,41,0,0,0,0,0,0,0,0,...,99,9,99,1,0,1,1,0,1,1136
3,3,61,1,0,0,1,0,0,0,1,...,1,99,99,0,0,0,1,1,1,145
4,4,44,0,1,0,0,0,0,0,0,...,2,99,0,99,0,0,1,1,1,906


# 바로 회귀하기

In [56]:
# 1st model: linear regression
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()

# 2nd model: Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
model2 = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression())])

# 3rd model: Ridge regression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
model3= Pipeline([('poly', PolynomialFeatures(degree=2)),('Ridge', RidgeCV(alphas=(0.1, 1.0, 10.0), scoring='neg_mean_absolute_error'))]) # Ridge 중 좋은 alpha 자동 사용

# 4th model: RF regressor
from sklearn.ensemble import RandomForestRegressor
model4 = RandomForestRegressor()

# 5th model: GB regressor
from sklearn.ensemble import GradientBoostingRegressor
model5 = GradientBoostingRegressor()

# 6th model: SV regressor
from sklearn.svm import SVR
model6 = SVR()

In [24]:
from sklearn import metrics

In [25]:
X_train = train_breast.drop('Survival period', axis=1)
y_train = train_breast['Survival period']

X_test = test_breast.drop('Survival period', axis=1)
y_test = test_breast['Survival period']

# 1. 선형회귀

In [26]:
model1.fit(X_train, y_train)

LinearRegression()

In [27]:
y1_pred = model1.predict(X_test)

pd.DataFrame({'y_test': y_test, 'y1_pred': np.rint(y1_pred)})

Unnamed: 0,y_test,y1_pred
0,893,8.140000e+02
1,1090,8.290000e+02
2,1020,8.410000e+02
3,1239,8.460000e+02
4,935,8.940000e+02
...,...,...
4995,145,8.210000e+02
4996,1360,8.230000e+02
4997,67,8.850000e+02
4998,133,-1.176519e+11


In [28]:
MAE_1=metrics.mean_absolute_error(y_test , y1_pred)
MSE_1=metrics.mean_squared_error(y_test , y1_pred)
RMSE_1=np.sqrt(MSE_1)
pd.DataFrame([MAE_1,MSE_1,RMSE_1], index=['MAE_linear','MSE_linear','RMSE_linear'],columns=['Quantity'])

Unnamed: 0,Quantity
MAE_linear,28989420000.0
MSE_linear,3.41066e+21
RMSE_linear,58400860000.0


# 2. 단순다항회귀(2차)

3차는 overfitting되었고 4차는 컴퓨팅 파워 부족으로 계산이 안 됨. 

In [33]:
model2.fit(X_train, y_train)

Pipeline(steps=[('poly', PolynomialFeatures()), ('linear', LinearRegression())])

In [34]:
y2_pred = model2.predict(X_test)

pd.DataFrame({'y_test': y_test, 'y2_pred': np.rint(y2_pred)})

Unnamed: 0,y_test,y2_pred
0,893,780.0
1,1090,652.0
2,1020,952.0
3,1239,1089.0
4,935,1083.0
...,...,...
4995,145,728.0
4996,1360,681.0
4997,67,954.0
4998,133,972.0


In [44]:
MAE_2=metrics.mean_absolute_error(y_test , y2_pred)
MSE_2=metrics.mean_squared_error(y_test , y2_pred)
RMSE_2=np.sqrt(MSE_2)
pd.DataFrame([MAE_2,MSE_2,RMSE_2], index=['MAE_poly','MSE_poly','RMSE_poly'],columns=['Quantity'])

Unnamed: 0,Quantity
MAE_poly,416.024849
MSE_poly,252088.392979
RMSE_poly,502.08405


# 3. Ridge 2차다항회귀

In [38]:
model3.fit(X_train, y_train)

Pipeline(steps=[('poly', PolynomialFeatures()),
                ('Ridge',
                 RidgeCV(alphas=array([ 0.1,  1. , 10. ]),
                         scoring='neg_mean_absolute_error'))])

In [42]:
y3_pred = model3.predict(X_test)

pd.DataFrame({'y_test': y_test, 'y3_pred': np.rint(y3_pred)})

Unnamed: 0,y_test,y3_pred
0,893,1314.0
1,1090,1210.0
2,1020,1512.0
3,1239,1642.0
4,935,1634.0
...,...,...
4995,145,876.0
4996,1360,828.0
4997,67,1084.0
4998,133,1108.0


In [45]:
MAE_3=metrics.mean_absolute_error(y_test , y3_pred)
MSE_3=metrics.mean_squared_error(y_test , y3_pred)
RMSE_3=np.sqrt(MSE_3)
pd.DataFrame([MAE_3,MSE_3,RMSE_3], index=['MAE_ridge','MSE_ridge','RMSE_ridge'],columns=['Quantity'])

Unnamed: 0,Quantity
MAE_ridge,550.829707
MSE_ridge,450284.793633
RMSE_ridge,671.032632


# 4. Random Forest Regressor

In [47]:
model4.fit(X_train, y_train)

RandomForestRegressor()

In [48]:
y4_pred = model4.predict(X_test)

pd.DataFrame({'y_test': y_test, 'y4_pred': np.rint(y4_pred)})

Unnamed: 0,y_test,y4_pred
0,893,800.0
1,1090,825.0
2,1020,838.0
3,1239,805.0
4,935,831.0
...,...,...
4995,145,848.0
4996,1360,892.0
4997,67,892.0
4998,133,816.0


In [49]:
MAE_4=metrics.mean_absolute_error(y_test , y4_pred)
MSE_4=metrics.mean_squared_error(y_test , y4_pred)
RMSE_4=np.sqrt(MSE_4)
pd.DataFrame([MAE_4,MSE_4,RMSE_4], index=['MAE_RFR','MSE_RFR','RMSE_RFR'],columns=['Quantity'])

Unnamed: 0,Quantity
MAE_RFR,385.96746
MSE_RFR,213190.975874
RMSE_RFR,461.726083


# 5. Gradient Boosting Regressor

In [50]:
model5.fit(X_train, y_train)

GradientBoostingRegressor()

In [51]:
y5_pred = model5.predict(X_test)

pd.DataFrame({'y_test': y_test, 'y5_pred': np.rint(y5_pred)})

Unnamed: 0,y_test,y5_pred
0,893,600.0
1,1090,424.0
2,1020,901.0
3,1239,759.0
4,935,527.0
...,...,...
4995,145,857.0
4996,1360,855.0
4997,67,914.0
4998,133,833.0


In [53]:
MAE_5=metrics.mean_absolute_error(y_test , y5_pred)
MSE_5=metrics.mean_squared_error(y_test , y5_pred)
RMSE_5=np.sqrt(MSE_5)
pd.DataFrame([MAE_5,MSE_5,RMSE_5], index=['MAE_GBR','MSE_GBR','RMSE_GBR'],columns=['Quantity'])

Unnamed: 0,Quantity
MAE_GBR,384.993899
MSE_GBR,211759.224109
RMSE_GBR,460.173037


# 6. Support Vector Regressor

In [57]:
model6.fit(X_train, y_train)

SVR()

In [58]:
y6_pred = model6.predict(X_test)

pd.DataFrame({'y_test': y_test, 'y6_pred': np.rint(y6_pred)})

Unnamed: 0,y_test,y6_pred
0,893,873.0
1,1090,873.0
2,1020,873.0
3,1239,873.0
4,935,873.0
...,...,...
4995,145,873.0
4996,1360,873.0
4997,67,873.0
4998,133,873.0


In [59]:
MAE_6=metrics.mean_absolute_error(y_test , y6_pred)
MSE_6=metrics.mean_squared_error(y_test , y6_pred)
RMSE_6=np.sqrt(MSE_6)
pd.DataFrame([MAE_6,MSE_6,RMSE_6], index=['MAE_SVR','MSE_SVR','RMSE_SVR'],columns=['Quantity'])

Unnamed: 0,Quantity
MAE_SVR,383.674825
MSE_SVR,211422.080481
RMSE_SVR,459.806569
