In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
from sklearn.datasets import make_regression

# Synthetic regression problem: 1000 samples, one feature, noise=10, fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=1,
    noise=10,
    random_state=42,
)

In [None]:
# Show the first five feature rows and the first five targets.
X[:5], y[:5]

In [None]:
# Scatter of the single feature against the target, with labelled axes.
plt.xlabel('Feature - X')
plt.ylabel('Target - Y')
plt.scatter(X,y,s=5)

In [None]:
# Ordinary least-squares fit on the synthetic data; display the learned
# coefficient array and intercept.
lr = LinearRegression().fit(X, y)
lr.coef_, lr.intercept_

In [None]:
# In-sample predictions from the fitted linear model.
y_pred = lr.predict(X)

- 파랑이 훈련 데이터
- 주황이 예측 데이터

In [None]:
# Training targets and model predictions drawn on the same axes.
plt.xlabel('Feature - X')
plt.ylabel('Target - Y')
plt.scatter(x=X, y=y, s=5, label='training')
plt.scatter(x=X, y=y_pred, s=5, label='prediction')
plt.legend()
plt.show()

# Ridge Regression

- Ridge는 w가 클수록 페널티 부과
- 아웃라이어에 영향을 덜 받는다

In [None]:
# Turn the targets from index 950 onward into outliers by shifting them down by 600,
# then display them.
outliers = y[950:] - 600; outliers

In [None]:
# Targets with the shifted tail: the first 950 values followed by the outliers.
y_out = np.concatenate([y[:950], outliers])

In [None]:
# Scatter of the feature against the outlier-contaminated targets.
plt.scatter(X,y_out,s=5)

In [None]:
# Refit plain linear regression, this time on the outlier-contaminated targets.
lr = LinearRegression().fit(X, y_out)
y_out_pred = lr.predict(X)

In [None]:
# Compare the contaminated targets with the predictions of the model fitted on
# them and the predictions of the earlier model fitted on the clean targets.
plt.scatter(X,y_out,s=5,label='actual')
plt.scatter(X,y_out_pred,s=5,label='prediction with outlier')
plt.scatter(X,y_pred,s=5,c='k',label='prediction without outlier')
plt.legend()
plt.title('Linear Regression')

In [None]:
# Coefficient and intercept of the model fitted on the contaminated targets.
lr.coef_, lr.intercept_

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with a strong penalty (alpha=1000) on the contaminated targets.
ridge = Ridge(alpha=1000).fit(X, y_out)
y_ridge_pred = ridge.predict(X)

In [None]:
# Contaminated targets with the linear (red) and ridge (black) predictions.
# NOTE(review): the title still reads 'Linear Regression' although this figure
# compares linear and ridge fits — confirm whether that is intended.
plt.scatter(X,y_out, s=5, label='actual')
plt.scatter(X,y_out_pred,s=5,c='r', label='LinearRegression with outlier')
plt.scatter(X,y_ridge_pred,s=5,c='k', label='RidgeRegression with outlier')
plt.legend()
plt.title('Linear Regression')

In [None]:
ridge.coef_, ridge.intercept_ # the weight w comes out much smaller

In [None]:
# Lasso regression (alpha=10) on the contaminated targets.
lasso = Lasso(alpha=10).fit(X, y_out)
y_lasso_pred = lasso.predict(X)

In [None]:
# Contaminated targets with linear (red), ridge (black) and lasso (yellow) predictions.
# NOTE(review): the title still reads 'Linear Regression' although three model
# families are compared — confirm whether that is intended.
plt.scatter(X,y_out,s=5,label='actual')
plt.scatter(X,y_out_pred,s=5,c='r',label='LinearRegression')
plt.scatter(X,y_ridge_pred,s=5,c='k',label='RidgeRegression')
plt.scatter(X,y_lasso_pred,s=5,c='y',label='LassoRegression')
plt.legend()
plt.title('Linear Regression')

In [None]:
# Coefficient and intercept of the lasso model.
lasso.coef_,lasso.intercept_

# Ridge에서 w에 미치는 알파의 영향

In [None]:
# Synthetic 10-feature problem; coef=True also returns the coefficients used to
# generate the data, kept in w for comparison with the fitted ones.
X, y, w = make_regression(
    n_samples=1000,
    n_features=10,
    bias=3.5,
    coef=True,
    random_state=42,
)

In [None]:
# Coefficients returned by make_regression.
w

- 알파를 변화시키며 ridge 학습
    - 계산된 w 리스트 추가
    - [10^-3 ~ 10^5] 범위에서 알파 200개 생성

In [None]:
# 200 regularisation strengths, log-spaced from 10**-3 to 10**5;
# display the 20 smallest and the 20 largest.
alphas = np.logspace(start=-3, stop=5, num=200)
alphas[:20], alphas[-20:]

In [None]:
# Fit one ridge model per alpha and collect its coefficient vector.
coefs = []
for a in alphas:
    ridge = Ridge(alpha=a).fit(X, y)
    coefs.append(ridge.coef_)

In [None]:
# Generating coefficients again, for comparison with the fitted ones below.
w

In [None]:
# Coefficient vectors for the five smallest alphas.
coefs[:5]

In [None]:
coefs[-5:] # the weights become very small

- 알파 & w 그래프화

In [None]:
# One line per coefficient, plotted against alpha on a logarithmic x axis.
ax = plt.gca()
ax.set_xscale('log')
ax.plot(alphas, coefs)
ax.set_xlabel('alpha')
ax.set_ylabel('weights')
ax.set_title('Ridge coefficients as a function of the regularization')
plt.show()

# Lasso

In [None]:
# Same experiment as the ridge path, with Lasso: regenerate the 10-feature data,
# fit one model per alpha, and plot each coefficient against alpha.
X, y, w = make_regression(
    n_samples=1000,
    n_features=10,
    bias=3.5,
    coef=True,
    random_state=42,
)
alphas = np.logspace(start=-3, stop=5, num=200)

coefs = []
for a in alphas:
    lasso = Lasso(alpha=a, max_iter=10000).fit(X, y)
    coefs.append(lasso.coef_)

ax = plt.gca()
ax.set_xscale('log')
ax.plot(alphas, coefs)
ax.set_xlabel('alpha')
ax.set_ylabel('weights')
ax.set_title('Lasso coefficients as a function of the regularization')
plt.show()

# 예제 - 인디언 슈퍼마켓

In [None]:
# Download the training and test CSV files through shortened URLs (curl -L follows
# the redirects), save them locally, and load both into DataFrames.
!curl -L https://bit.ly/2HsDP2p -o train.csv
!curl -L https://bit.ly/2WgqtdO -o test.csv
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
# Frequency of each raw Item_Fat_Content label in the training set.
train_df['Item_Fat_Content'].value_counts()

In [None]:
# Frequency of each raw Item_Fat_Content label in the test set.
test_df['Item_Fat_Content'].value_counts()

In [None]:
# Normalise Item_Fat_Content in both frames: lower-case every label, then expand
# the abbreviations "lf" and "reg" to their full names.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection, which is chained assignment
# and is not guaranteed to modify the parent DataFrame.
for frame in (train_df, test_df):
    frame['Item_Fat_Content'] = (
        frame['Item_Fat_Content']
        .str.lower()
        .replace({"lf": "low fat", "reg": "regular"})
    )

In [None]:
# Label frequencies in the training set after normalisation.
train_df['Item_Fat_Content'].value_counts()

In [None]:
# Label frequencies in the test set after normalisation.
test_df['Item_Fat_Content'].value_counts()

In [None]:
# First five training rows, transposed so each column name becomes a row.
train_df.head().T

In [None]:
# First five test rows, transposed so each column name becomes a row.
test_df.head().T

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Baseline design matrix: two numeric columns; the target is the outlet sales.
X = train_df.loc[:, ['Outlet_Establishment_Year', 'Item_MRP']]
y = train_df['Item_Outlet_Sales']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Baseline model: 70/30 split, plain linear regression, hold-out metrics.
np.random.seed(7)  # presumably so the split below, which has no random_state, is repeatable
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

lreg = LinearRegression()
lreg.fit(x_train, y_train)
pred = lreg.predict(x_test)

# Mean squared error on the hold-out rows.
mse = np.mean((pred - y_test) ** 2)

print(mse, mse ** (0.5))  # MSE, RMSE
print(lreg.score(x_test, y_test))  # R2 score

In [None]:
# Fitted coefficients next to the column names they belong to.
lreg.coef_, x_train.columns

In [None]:
# 선형모델 계수 출력
coeff = pd.DataFrame(x_train.columns)
coeff['Coefficient Estimate'] = pd.Series(lreg.coef_)
coeff

In [None]:
# Side-by-side scatter plots of sales against each of the two baseline features:
# Item_MRP on the left, Outlet_Establishment_Year on the right.
plt.subplot(1,2,1)
plt.scatter(train_df.Item_MRP, train_df.Item_Outlet_Sales, s=0.1)
plt.subplot(1,2,2)
plt.scatter(train_df.Outlet_Establishment_Year, train_df.Item_Outlet_Sales, s=0.1)

In [None]:
# Number of missing values per column.
train_df.isnull().sum()

In [None]:
# Fill missing item weights with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the column selection, which is chained assignment
# and is not guaranteed to modify train_df.
train_df['Item_Weight'] = train_df['Item_Weight'].fillna(train_df['Item_Weight'].mean())

In [None]:
# Distribution of Item_Visibility before the zero values are replaced.
train_df.Item_Visibility.hist(bins=100)

In [None]:
# Replace exact-zero visibility values with the column mean. The mean is taken
# over the column as it stands, so it still includes those zeros.
train_df['Item_Visibility'] = train_df['Item_Visibility'].replace(0,np.mean(train_df['Item_Visibility']))

In [None]:
# Distribution of Item_Visibility after the zero values are replaced.
train_df.Item_Visibility.hist(bins=100)

In [None]:
# How many rows fall in each establishment year.
train_df.Outlet_Establishment_Year.value_counts()

In [None]:
# Convert the establishment year into the number of years elapsed as of 2013.
# NOTE(review): the column is overwritten in place, so running this cell a second
# time maps the values back to the original years.
train_df['Outlet_Establishment_Year'] = 2013 - train_df['Outlet_Establishment_Year']

In [None]:
# Distribution of the converted Outlet_Establishment_Year values.
train_df.Outlet_Establishment_Year.hist(bins=50)

In [None]:
# Number of rows with a missing Outlet_Size.
train_df.Outlet_Size.isnull().sum()

In [None]:
# Frequency of each Outlet_Size category before imputation.
train_df.Outlet_Size.value_counts()

In [None]:
# Treat outlets with a missing size as 'Small'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the column selection, which is chained assignment
# and is not guaranteed to modify train_df.
train_df['Outlet_Size'] = train_df['Outlet_Size'].fillna('Small')

In [None]:
# Frequency of each Outlet_Size category after imputation.
train_df.Outlet_Size.value_counts()

In [None]:
# Re-check the number of missing values per column after the fills above.
train_df.isnull().sum()

In [None]:
# Product identifiers are excluded from the analysis.
# errors="ignore" keeps the cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
train_df.drop(columns="Item_Identifier", inplace=True, errors="ignore")

In [None]:
# Data type of each remaining column.
train_df.dtypes

In [None]:
# Names of the columns stored with object dtype.
object_list = train_df.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Display the object-typed column names.
object_list

In [None]:
# One-hot encode the object-typed columns, prefixing each dummy column with the
# name of the column it came from.
dummies = pd.get_dummies(train_df[object_list], prefix=object_list)

In [None]:
# Names of the generated dummy columns.
dummies.columns

In [None]:
# Everything except the object-typed columns.
X1 = train_df.drop(columns=object_list)

In [None]:
# Separate the target (Item_Outlet_Sales) from the remaining feature columns.
# The column is named via columns= because DataFrame.drop no longer accepts the
# axis as a second positional argument in pandas 2.x.
X1, y = X1.drop(columns='Item_Outlet_Sales'), X1.Item_Outlet_Sales

In [None]:
# Display the feature frame.
X1

In [None]:
# Keep the column names; scaling below replaces X1 with the scaler's output,
# and the names are reused when the frame is rebuilt.
col = X1.columns; col

In [None]:
# Standardise the feature columns.
# NOTE(review): X1 is rebound to the scaler's output, so re-running this cell
# scales already-scaled data.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X1 = sc.fit(X1).transform(X1)

In [None]:
# Rebuild a DataFrame from the scaled values (column names restored from col)
# and place the dummy columns beside it.
# NOTE(review): the rebuilt frame gets a fresh default index and concat aligns on
# index, so this assumes dummies also carries a default 0..n-1 index — confirm.
X = pd.concat([pd.DataFrame(X1,columns=col),dummies],axis=1)

In [None]:
# Shapes of the dummy block, the scaled block, and the combined design matrix.
dummies.shape, X1.shape, X.shape

In [None]:
# Refit the linear model on the full design matrix and report hold-out metrics.
# The split uses X (scaled numeric columns plus the one-hot dummies), not X1
# alone, so the encoded categorical features actually reach the model.
np.random.seed(7)  # presumably so the split below, which has no random_state, is repeatable
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
lreg.fit(x_train, y_train)
pred_test = lreg.predict(x_test)
mse = np.mean((pred_test - y_test) ** 2)
print(mse ** 0.5)  # RMSE
print(lreg.score(x_test, y_test))  # R2 score

In [None]:
# Residual plot: prediction on the x axis, (prediction - actual) on the y axis,
# with a horizontal reference line at zero.
x_plot = plt.scatter(pred_test, (pred_test - y_test), c='b',s=1)
plt.hlines(y=0,xmin=-1000,xmax=5000)
plt.title('Residual plot')

# Ridge regularizer

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

# Standardise inside the pipeline, then fit a ridge model with alpha=0.05.
ridgeReg = make_pipeline(StandardScaler(), Ridge(alpha=0.05))
ridgeReg.fit(x_train, y_train)

pred = ridgeReg.predict(x_test)
# Score the ridge predictions themselves; pred_test holds the plain linear
# model's predictions and would only repeat that model's error.
mse = np.mean((pred - y_test) ** 2)
mse ** 0.5, ridgeReg.score(x_test, y_test)  # (RMSE, R2 score)

In [None]:
from sklearn.linear_model import SGDRegressor, LinearRegression,Ridge,Lasso
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Noisy samples of 2.5 * exp(-1.3 x) + 0.5 on [0, 4].
x = np.linspace(0, 4, 50)
y = 2.5 * np.exp(-1.3 * x) + 0.5
y += 0.2 * np.random.normal(size=x.size)

# Straight-line fit; the model expects a 2-D input, hence the reshape to one column.
lin = LinearRegression().fit(x.reshape(-1, 1), y)
a, b = lin.coef_, lin.intercept_

# Evaluate the fitted line on a grid and draw it over the samples.
xx = np.linspace(0., 4., 50)
yy = a * xx + b
plt.scatter(x, y)
plt.plot(xx, yy, c='r')

In [None]:
# Expand x into polynomial terms up to degree 27, without a constant column.
poly_f = PolynomialFeatures(degree=27, include_bias=False)
x_poly = poly_f.fit_transform(x.reshape(-1, 1))

# Unregularised fit on the polynomial features.
lin = LinearRegression()
lin.fit(x_poly, y)
y_pred_lin = lin.predict(x_poly)

# Ridge fit (alpha=10) on the same features.
ridge_ = Ridge(alpha=10)
ridge_.fit(x_poly, y)
y_pred_ridge = ridge_.predict(x_poly)

# Each artist carries its own label so the legend cannot drift out of sync with
# the plotting order; plt.show() renders the figure.
plt.scatter(x, y, s=10, label='data')
plt.plot(x, y_pred_lin, c='r', label='linear')
plt.plot(x, y_pred_ridge, c='k', label='ridge')
plt.legend()
plt.show()