## overfitting

In [34]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt


In [5]:
df = pd.read_csv('data/admission_data.csv')
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [22]:
X = df.drop(['Chance of Admit '],axis=1)
poly_transfer = PolynomialFeatures(6)
poly_features = poly_transfer.fit_transform(X.values)
features = poly_transfer.get_feature_names(X.columns)

X = pd.DataFrame(poly_features,columns=features)
X.head()

Unnamed: 0,1,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial No.^2,...,LOR CGPA^2 Research^3,LOR CGPA Research^4,LOR Research^5,CGPA^6,CGPA^5 Research,CGPA^4 Research^2,CGPA^3 Research^3,CGPA^2 Research^4,CGPA Research^5,Research^6
0,1.0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,1.0,...,419.05125,43.425,4.5,807539.696082,83682.87006,8671.800006,898.632125,93.1225,9.65,1.0
1,1.0,2.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,4.0,...,354.04605,39.915,4.5,487014.306256,54905.784245,6190.054594,697.864103,78.6769,8.87,1.0
2,1.0,3.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,9.0,...,224.0,28.0,3.5,262144.0,32768.0,4096.0,512.0,64.0,8.0,1.0
3,1.0,4.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,16.0,...,187.92225,21.675,2.5,424731.61094,48988.651781,5650.363527,651.714363,75.1689,8.67,1.0
4,1.0,5.0,314.0,103.0,2.0,2.0,3.0,8.21,0.0,25.0,...,0.0,0.0,0.0,306237.903347,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
y = df[['Chance of Admit ']]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [24]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 5)

In [25]:
model = LinearRegression()
model.fit(x_train,y_train)

LinearRegression()

In [26]:
y_train_predict = model.predict(x_train)
y_test_predict = model.predict(x_test)

In [28]:
mse_train = mean_squared_error(y_train,y_train_predict)
print(mse_train**0.5)
mse_test = mean_squared_error(y_test,y_test_predict)
print(mse_test**0.5) # test에서는 성능이 떨어짐(과적합)

9.201674312824392e-10
0.9952571532717315


## L1 정규화

In [32]:
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [35]:
model = Lasso(alpha=0.001, max_iter=1000,normalize=True)
# L2 정규화 하고 싶으면 Lasso --> Ridge

model.fit(x_train,y_train)

Lasso(alpha=0.001, normalize=True)

In [36]:
y_train_predict = model.predict(x_train)
y_test_predict = model.predict(x_test)

In [39]:
mse_train = mean_squared_error(y_train,y_train_predict)
print(mse_train**0.5)
mse_test = mean_squared_error(y_test,y_test_predict)
print(mse_test**0.5) # test에서도 성능이 비슷함 --> 모델이 과적합 되는 것을 막을 수 있음

0.06329575019490115
0.06000348571173352
