In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold

In [3]:
df = pd.read_csv('2013-2018년_가스공급량과_기온2.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,연월일,시간,구분,공급량,year,month,day,weekday,기온
0,0,0,2013-01-01,1,0,2497.129,2013,1,1,1,-6.4
1,1,1,2013-01-01,1,1,2169.093,2013,1,1,1,-6.4
2,2,2,2013-01-01,1,2,226.178,2013,1,1,1,-6.4
3,3,3,2013-01-01,1,3,1434.516,2013,1,1,1,-6.4
4,4,4,2013-01-01,1,4,3272.837,2013,1,1,1,-6.4


In [4]:
# col 선택 : year, month, day, 시간, 구분, 기온
# 타겟 : 공급량

# 데이터 확인 

In [5]:
col = ['year', 'month', 'day', '시간', '기온', '구분', '공급량']
df_col = df[col]
df_col.head()

Unnamed: 0,year,month,day,시간,기온,구분,공급량
0,2013,1,1,1,-6.4,0,2497.129
1,2013,1,1,1,-6.4,1,2169.093
2,2013,1,1,1,-6.4,2,226.178
3,2013,1,1,1,-6.4,3,1434.516
4,2013,1,1,1,-6.4,4,3272.837


# 모델 평가

In [6]:
col = ['month', '구분', '시간']
X = df[col]
y = df['공급량']

### 데이터 분할

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((276066, 3), (92022, 3), (276066,), (92022,))

### 선형회귀 모델 구축

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

### 교차 검증

In [9]:
kfold = KFold(shuffle=True)
scores = cross_val_score(model, X_test, y_test, cv=kfold, scoring='r2')

print(f"훈련 결정계수 = {model.score(X_train, y_train)}")
print(f"테스트 결정계수 = {model.score(X_test, y_test)}")
print(f"결정계수 평균 = {scores.mean()}")

훈련 결정계수 = 0.03674484183915494
테스트 결정계수 = 0.03766013631444798
결정계수 평균 = 0.037429753622295704


### Lasso 모델 구축 및 평가

In [10]:
from sklearn.linear_model import Lasso

In [11]:
lasso = Lasso(alpha=0.01, random_state=0)
lasso.fit(X_train, y_train)

kfold = KFold(shuffle=True)
scores = cross_val_score(lasso, X_test, y_test, cv=kfold, scoring='r2')

print(f"훈련 결정계수 = {lasso.score(X_train, y_train)}")
print(f"테스트 결정계수 = {lasso.score(X_test, y_test)}")
print(f"결정계수 평균 = {scores.mean()}")

훈련 결정계수 = 0.036744841797822225
테스트 결정계수 = 0.037660096922887965
결정계수 평균 = 0.03758034031209308


### Ridge 모델 구축 및 평가

In [12]:
from sklearn.linear_model import Ridge

In [13]:
ridge = Ridge(alpha=0.01, random_state=0)
ridge.fit(X_train, y_train)

kfold = KFold(shuffle=True)
scores = cross_val_score(ridge, X_test, y_test, cv=kfold, scoring='r2')

print(f"훈련 결정계수 = {ridge.score(X_train, y_train)}")
print(f"테스트 결정계수 = {ridge.score(X_test, y_test)}")
print(f"결정계수 평균 = {scores.mean()}")

훈련 결정계수 = 0.03674484183915494
테스트 결정계수 = 0.03766013630974652
결정계수 평균 = 0.03758092041837313


# 기온 데이터 포함 모델 평가

### 데이터 정규화, 표준화

In [14]:
### 정규화, 표준화
col = ['month', '시간', '구분', '기온']
X = df[col]
y = df['공급량']

# 정규화
mmscaler = MinMaxScaler()
m_X = mmscaler.fit_transform(X)
# 표준화
sscaler = StandardScaler()
s_m_X = sscaler.fit_transform(m_X)

### 데이터 분할

In [15]:
X_train, X_test, y_train, y_test = train_test_split(s_m_X, y, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((276066, 4), (92022, 4), (276066,), (92022,))

In [16]:
X_train[:10]

array([[ 1.00800907, -0.79454663, -1.5       , -0.44903736],
       [-0.73191835,  0.07223151,  1.5       ,  0.35597757],
       [ 0.13804536,  1.2279357 , -1.5       ,  1.02682335],
       [ 0.13804536,  0.79454663, -0.5       ,  1.11307495],
       [-0.73191835,  1.37239873, -0.5       , -0.32445172],
       [-0.73191835,  0.79454663, -1.5       , -0.51612194],
       [ 0.13804536, -0.50562058,  0.5       ,  1.03640686],
       [ 1.58798488,  0.65008361,  0.        , -0.8323778 ],
       [-1.02190626, -1.37239873,  1.5       , -1.18696772],
       [ 1.00800907, -1.2279357 ,  1.        ,  0.10680628]])

### 선형회귀 모델 구축

In [17]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

### 교차 검증

In [18]:
kfold = KFold(shuffle=True)
scores = cross_val_score(model, X_test, y_test, cv=kfold, scoring='r2')

print(f"훈련 결정계수 = {model.score(X_train, y_train)}")
print(f"테스트 결정계수 = {model.score(X_test, y_test)}")
print(f"결정계수 평균 = {scores.mean()}")

훈련 결정계수 = 0.26871582014508144
테스트 결정계수 = 0.2669328616443105
결정계수 평균 = 0.2667243233333138


### Lasso 모델 구축 및 평가

In [19]:
lasso = Lasso(alpha=0.01, random_state=0)
lasso.fit(X_train, y_train)

kfold = KFold(shuffle=True)
scores = cross_val_score(lasso, X_test, y_test, cv=kfold, scoring='r2')

print(f"훈련 결정계수 = {lasso.score(X_train, y_train)}")
print(f"테스트 결정계수 = {lasso.score(X_test, y_test)}")
print(f"결정계수 평균 = {scores.mean()}")

훈련 결정계수 = 0.26871581969393865
테스트 결정계수 = 0.26693281459126084
결정계수 평균 = 0.2665266806476183


### Ridge 모델 구축 및 평가

In [20]:
ridge = Ridge(alpha=0.01, random_state=0)
ridge.fit(X_train, y_train)

kfold = KFold(shuffle=True)
scores = cross_val_score(ridge, X_test, y_test, cv=kfold, scoring='r2')

print(f"훈련 결정계수 = {ridge.score(X_train, y_train)}")
print(f"테스트 결정계수 = {ridge.score(X_test, y_test)}")
print(f"결정계수 평균 = {scores.mean()}")

훈련 결정계수 = 0.2687158201450809
테스트 결정계수 = 0.2669328617160599
결정계수 평균 = 0.26658412126434766
