### 선형모델 및 Lasso, Ridge 모델 만들고 적용

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [5]:
import pandas as pd

train = pd.read_csv("../data/parking_demand/train_df_errno.csv")
test = pd.read_csv("../data/parking_demand/test_df.csv")
sub = pd.read_csv("../data/parking_demand/sample_submission.csv")
age = pd.read_csv("../data/parking_demand/age_gender_info.csv")

train.shape, test.shape, sub.shape, age.shape

((2896, 15), (1008, 14), (150, 2), (16, 23))

In [10]:
test.isnull().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         180
임대료           180
10분내지하철수       38
10분내버스정류장수      0
단지내주차면수         0
dtype: int64

In [7]:
train.columns, test.columns

(Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수', '등록차량수'],
       dtype='object'),
 Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수'],
       dtype='object'))

In [8]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000,82940,0.0,3.0,624.0,205.0
1,C2515,545,아파트,경상남도,국민임대,39.6,60,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
2,C2515,545,아파트,경상남도,국민임대,39.6,20,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
3,C2515,545,아파트,경상남도,국민임대,46.9,38,17.0,A,18433000,149760,0.0,3.0,624.0,205.0
4,C2515,545,아파트,경상남도,국민임대,46.9,19,17.0,A,18433000,149760,0.0,3.0,624.0,205.0


### 단순선형 회귀 모델

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [24]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '단지내주차면수']
X = train[sel]
y = train['등록차량수']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### 모델 만들기

In [25]:
model = LinearRegression()  # 모델 생성
model.fit(X_train, y_train) # 모델 훈련
pred = model.predict(X_test) # 새로운 데이터로 예측

In [26]:
model.coef_ , model.intercept_

(array([-0.15912372,  0.48962892,  0.0719852 , -6.98404745,  1.09596934]),
 100.64601235358606)

### 모델 평가하기

In [27]:
len(pred)

724

In [28]:
import numpy as np

In [29]:
# mae, mse, rmse
mae_val = np.sum( abs( y_test - pred ) ) / len(pred)
mae_val

149.1293341160225

In [30]:
np.mean( abs( y_test - pred )  )

149.12933411602245

### MSE

In [31]:
mse_val = np.sum( ( y_test - pred )**2 ) / len(pred)
print(mse_val)
mse_val = np.mean( ( y_test - pred )**2  )
print(mse_val)

42701.70133960169
42701.70133960167


### RMSE

In [34]:
# 300 -> 206
rmse_val = mse_val ** 0.5
print( rmse_val )
rmse_val = np.sqrt(mse_val)
print( rmse_val )

206.64389983641342
206.64389983641342


### 피처수 늘리기

In [36]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [38]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '단지내주차면수']
X = train[sel]
y = train['등록차량수']

nor_X = MinMaxScaler().fit_transform(X)  # 입력 데이터 정규화
ex_X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(nor_X)  # 데이터 feature 추가 생성

X_train, X_test, y_train, y_test = train_test_split(ex_X, y, random_state=0)


In [39]:
X_train.shape

(2172, 20)

### LASSO 모델 , Ridge모델 적용하기

In [40]:
from sklearn.linear_model import Lasso, Ridge

In [85]:
model = Lasso(alpha=0.01)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred[0:10]

array([2034.15114024,  368.91598673,  824.17931433,  496.8976281 ,
        121.24109349,  742.33586207,  438.66678347,  291.25489882,
        627.84464153,  432.58897793])

### 평가하기(mae, mse, rmse)

In [86]:
mae_val = np.mean( abs( y_test - pred ) )
print( mae_val )
mse_val = np.mean( (y_test - pred) **2 )
print( mae_val )
rmse_val = mse_val ** 0.5
print( rmse_val )

130.31797654776878
130.31797654776878
185.11148895612024


In [87]:
model.coef_

array([   -7.4164862 ,  1105.73814396,  -318.41251841,   332.2013695 ,
        1599.7262393 ,  -531.41966601, -2057.75867057,  1397.83903698,
         280.18587408,  1060.37516225,  -356.24185484,  6135.54525747,
         372.37258238,  3474.8195112 , -1223.70924978,   312.08301805,
        -161.24172662,  -325.7474799 , -1524.67294704,  -224.01856956])

In [84]:
model = Ridge(alpha=0.01)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred[0:10]

array([2017.26924206,  368.42107124,  821.85083007,  497.6633284 ,
        119.62607584,  743.81368402,  437.08916815,  290.93181935,
        632.97896544,  433.76306719])

In [79]:
mae_val = np.mean( abs( y_test - pred ) )
print( mae_val )
mse_val = np.mean( (y_test - pred) **2 )
print( mae_val )
rmse_val = mse_val ** 0.5
print( rmse_val )

130.5773538490838
130.5773538490838
185.4384395420726


In [80]:
model.coef_

array([  -10.1263555 ,  1190.20115145,  -187.42652318,   330.12348095,
        1604.45785474,  -533.83686613, -2076.54678837,  1329.63865233,
         293.55663998,  1077.94559216,  -452.95657097,  3604.89611386,
         439.68006605,  3542.38639166, -1267.2875676 ,   346.19657   ,
        -101.5081487 ,  -330.71549027, -1541.75256794,  -241.13484863])

In [88]:
a = [1,2,3]


In [89]:
s = pd.Series([1, 2], index=["a", "b"])

In [90]:
s_copy = s.copy()

In [91]:
s = pd.Series([1, 2], index=["a", "b"])

In [95]:
deep = s.copy()
shallow = s.copy(deep=False)
deep1 = s.copy(deep=True)

In [93]:
s is shallow

False

In [94]:
s is deep

False

In [99]:
s.index

Index(['a', 'b'], dtype='object')

In [100]:
s.values

array([1, 2], dtype=int64)

In [97]:
s.values is shallow.values and s.index is shallow.index

True

In [96]:
s is deep1

False

In [98]:
s.values is deep.values or s.index is deep.index

False