# **1. 단순 선형 회귀 분석**
- 전복의 나이를 예측하는 선형회귀모델을 생성하세요.
- 전복의 ‘성별’, ‘키’, ‘지름’, ‘높이’, ‘전체무게’, ‘몸통무게’, ‘내장무게’, ‘껍질무게’를 이용해 ‘껍질의 고리 수’를 예측한 뒤, **예측된 ‘껍질의 고리 수’에 1.5를 더하면 전복의 나이**가 됩니다.

In [1]:
# 기본 모듈 불러오기
import numpy as np
import pandas as pd

**1) 데이터 load 및 변형** 

In [31]:
# 데이터 로드
data = pd.read_csv("/content/sample_data/abalone.csv")
data.head()
print(data.shape)

# 성별 M은 Male, F는 Female, I는 Infant 이므로 따로 열 만들기
for label in "MFI":
    data[label] = data["Sex"] == label
data.drop('Sex', axis=1, inplace=True)

(4177, 9)


In [32]:
data

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,M,F,I
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,True,False,False
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,True,False,False
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,False,True,False
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,True,False,False
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,False,True,False
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,True,False,False
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,True,False,False
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,False,True,False


**2) X, y 선택**
: y는 Rings열, X는 Rings열을 제외한 나머지를 선택하되 전부 실수가 되도록 한다.

In [33]:
data = pd.get_dummies(data=data, columns=['M','F','I'], prefix=['M','F','I'])

In [34]:
# X,y 데이터 선택
y = data['Rings']

X = data.drop('Rings', axis=1)
# X = x.astype(float).values -> 실수가 되게 하기

 **3) train/test set 분리**

In [36]:
# 필요한 모듈 불러오기
from sklearn.model_selection import train_test_split

In [37]:
# train과 test set 분리 (train:test = 7:3 비율로)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

**4) 선형회귀모델 생성, 모델 예측치 구하기**

In [38]:
#필요한 모듈 불러오기
from sklearn.linear_model import LinearRegression

In [39]:
#선형회귀모델 생성 및 훈련
linear = LinearRegression()
linear.fit(X_train,y_train)

In [41]:
# 모델 예측치 구하기
linear_pred = linear.predict(X_test)
# 모델 예측치를 활용해 최종적으로 전복의 나이를 예측
age_pred = linear_pred+1.5
age_pred

array([16.26953125, 13.04296875,  8.00390625, ..., 10.59765625,
       11.96484375, 14.44140625])

**5) 모델 평가: MSE, RMSE, R2 score, corr 구하기**

In [43]:
#필요한 모듈 불러오기
from sklearn.metrics import mean_squared_error  # mse
from sklearn.metrics import r2_score  # r2 score

- MSE, RMSE 

In [72]:
#mse, rmse
mean_squared_error(age_pred, y_test)  #mse

7.091669913113972

In [45]:
np.sqrt(mean_squared_error(age_pred, y_test)) #rmse

2.6630189471939496

- R2 score

In [47]:
#R2 score 측정
from sklearn.metrics import r2_score
r2_score(age_pred, y_test)

-0.42530119586366966

- 회귀 절편값

In [48]:
#회귀 절편 값
linear.intercept_

-34541929849280.395

- 회귀 계수 값

In [50]:
#회귀 계수 값
linear.coef_

array([ 1.00953197e-01,  1.10547289e+01,  8.29404604e+00,  8.78571648e+00,
       -1.91345393e+01, -1.08799529e+01,  8.65571919e+00,  4.22982678e+13,
       -4.05857956e+13,  3.79800181e+13, -4.49040453e+13,  3.71477074e+13,
       -4.57363560e+13])

- 상관계수

Hint: corr 함수 이용.

In [53]:
# 상관계수 구하기
data.corr()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,M_False,M_True,F_False,F_True,I_False,I_True
Length,1.0,0.986812,0.827554,0.925261,0.897914,0.903018,0.897706,0.55672,-0.236543,0.236543,-0.309666,0.309666,0.551465,-0.551465
Diameter,0.986812,1.0,0.833684,0.925452,0.893162,0.899724,0.90533,0.57466,-0.240376,0.240376,-0.318626,0.318626,0.564315,-0.564315
Height,0.827554,0.833684,1.0,0.819221,0.774972,0.798319,0.817338,0.557467,-0.215459,0.215459,-0.298421,0.298421,0.518552,-0.518552
Whole weight,0.925261,0.925452,0.819221,1.0,0.969405,0.966375,0.955355,0.54039,-0.252038,0.252038,-0.299741,0.299741,0.557592,-0.557592
Shucked weight,0.897914,0.893162,0.774972,0.969405,1.0,0.931961,0.882617,0.420884,-0.251793,0.251793,-0.263991,0.263991,0.521842,-0.521842
Viscera weight,0.903018,0.899724,0.798319,0.966375,0.931961,1.0,0.907656,0.503819,-0.242194,0.242194,-0.308444,0.308444,0.556081,-0.556081
Shell weight,0.897706,0.90533,0.817338,0.955355,0.882617,0.907656,1.0,0.627574,-0.235391,0.235391,-0.306319,0.306319,0.546953,-0.546953
Rings,0.55672,0.57466,0.557467,0.54039,0.420884,0.503819,0.627574,1.0,-0.181831,0.181831,-0.250279,0.250279,0.436063,-0.436063
M_False,-0.236543,-0.240376,-0.215459,-0.252038,-0.251793,-0.242194,-0.235391,-0.181831,1.0,-1.0,-0.512528,0.512528,-0.522541,0.522541
M_True,0.236543,0.240376,0.215459,0.252038,0.251793,0.242194,0.235391,0.181831,-1.0,1.0,0.512528,-0.512528,0.522541,-0.522541


# **2. Polynomial features**

In [57]:
# PolynomialFeatures 라이브러리 호출
from sklearn.preprocessing import PolynomialFeatures

In [58]:
# 임의 데이터 생성

X = np.arange(6).reshape(3, 2)

df =  pd.DataFrame(X)
df.columns = ['x_1','x_2']
df

Unnamed: 0,x_1,x_2
0,0,1
1,2,3
2,4,5


In [67]:
# 차원은 2로 설정
# fit_transform 메소드를 통해 데이터 변환
# PolynomialFeatures로 변환 된 데이터를 데이터 프레임 형태로 변환

poly_features = PolynomialFeatures(degree=2)
df_poly = poly_features.fit_transform(X)
df_poly = pd.DataFrame(df_poly)

from sklearn.linear_model import LinearRegression

polylin = LinearRegression()
polylin.fit(df_poly, X)

In [69]:
# df_poly의 컬럼을 1,x1,x2,x1^2,x1*x2,x2^2 로 변경
df_poly.columns = ['1','x1','x2','x1^2','x1*x2','x2^2']

In [70]:
df_poly

Unnamed: 0,1,x1,x2,x1^2,x1*x2,x2^2
0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,2.0,3.0,4.0,6.0,9.0
2,1.0,4.0,5.0,16.0,20.0,25.0
