# 영암군 Data

In [None]:
import numpy as np                                                 # 넘파이 패키지
import pandas as pd                                                 # 판다스 패키지
import matplotlib.pyplot as plt                                   # 그래프 패키지
from sklearn.model_selection import train_test_split       # 데이터 셋 분할

from sklearn.linear_model import LinearRegression        # 선형 회귀 패키지
from sklearn.linear_model import Ridge                       # 릿지 선형 회귀 패키지
from sklearn.preprocessing import PolynomialFeatures   # 특성 변환기 패키지

from sklearn.metrics import mean_absolute_error          # 평균 절대값 에러(모델 평가 지수)
from sklearn.metrics import mean_squared_error          # 평균 제곱근 에러(모델 평가 지수)

In [None]:
# 코랩과 구글 드라이브 연동
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/mokpo_data.csv')
print(data.info())                       # 데이터 셋 정보
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30624 entries, 0 to 30623
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   기온      30624 non-null  float64
 1   풍속      30624 non-null  float64
 2   습도      30624 non-null  int64  
 3   기압      30624 non-null  float64
 4   일조      30624 non-null  float64
 5   일사      30624 non-null  float64
 6   전운량     30624 non-null  int64  
 7   발전량     30624 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 1.9 MB
None
    기온   풍속  습도      기압   일조  일사   전운량  발전량
0 -3.6  1.9  66  1027.7  0.0  0.0    9  0.0
1 -4.4  2.3  66  1027.6  0.0  0.0    6  0.0
2 -4.1  2.1  65  1028.1  0.0  0.0    9  0.0
3 -4.0  1.1  63  1028.0  0.0  0.0    9  0.0
4 -4.2  1.3  63  1027.5  0.0  0.0    9  0.0


In [None]:
data = np.array(data)

# 정규화
data = (data - np.min(data, 0)) / (np.max(data, 0) - np.min(data, 0))
print(data.shape)
print(data[0 : 5])
print(len(data) * 0.8)


(30624, 8)
[[0.1991342  0.08920188 0.61363636 0.89668616 0.         0.
  0.9        0.        ]
 [0.18181818 0.10798122 0.61363636 0.89473684 0.         0.
  0.6        0.        ]
 [0.18831169 0.09859155 0.60227273 0.90448343 0.         0.
  0.9        0.        ]
 [0.19047619 0.05164319 0.57954545 0.90253411 0.         0.
  0.9        0.        ]
 [0.18614719 0.06103286 0.57954545 0.89278752 0.         0.
  0.9        0.        ]]
24499.2


In [None]:
# 배열[행 , 열]로 인덱싱한다
x_data = data[ : -1, : ]
# -1 번째 행(1시간 후 발전량이 없는 데이터) 제거

y_data = data[1 : , -1]
# 2번째 행부터 마지막 열만 인덱싱(1시간 후 발전량부터 시작)

print(x_data.shape)
print(y_data.shape)

(30623, 8)
(30623,)


In [None]:
# 데이터 셋 만들기
x_train = x_data[ : 24499 ,  : ]
y_train = y_data[ : 24499]

x_test = x_data[24499 : ,  : ]
y_test = y_data[24499 :]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(24499, 8)
(24499,)
(6124, 8)
(6124,)


## 문제 1 : 다중 회귀

### 선형 회귀

In [47]:
lr = LinearRegression()
lr.fit(x_train , y_train)

print(f"Train score is {lr.score(x_train , y_train)}")
print(f"Test Score is {lr.score(x_test , y_test)}")

Train score is 0.8794741049746468
Test Score is 0.8716889678746736


In [48]:
# 테스트 세트
test_predict = lr.predict(x_test)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5

print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr.predict(x_train)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.05590270487128472
mse for test set is 0.008161603057721883
rmse for test set is 0.0903415909629772


mae for train set is 0.04488559465106675
mse for train set is 0.00619308875572293
rmse for train set is 0.07869618005801127


##특성 공학

### degree = 2

In [49]:
# degree = 2

poly = PolynomialFeatures(include_bias = False) # 특성 변환기 객체

poly.fit(x_train)

train_poly = poly.transform(x_train)
test_poly = poly.transform(x_test)

poly.get_feature_names_out() # 변환 특성 확인
print(train_poly.shape)
print(test_poly.shape)


(24499, 44)
(6124, 44)


In [50]:
lr1 = LinearRegression()
lr1.fit(train_poly , y_train)
print(f"train score : {lr1.score(train_poly , y_train)}")
print(f"test score : {lr1.score(test_poly , y_test)}")

train score : 0.8969992717449276
test score : 0.8927687236177932


In [51]:
# 테스트 세트
test_predict = lr1.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr1.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.04777431921468522
mse for test set is 0.006820762787954337
rmse for test set is 0.0825879094538319


mae for train set is 0.03987738121357117
mse for train set is 0.005292577597980739
rmse for train set is 0.07275010376611664


### degree = 5

In [52]:
# degree 5
poly = PolynomialFeatures(degree = 5 , include_bias = False) # 특성 변환기 객체

poly.fit(x_train)

train_poly = poly.transform(x_train)
test_poly = poly.transform(x_test)

poly.get_feature_names_out() # 변환 특성 확인
print(train_poly.shape)
print(test_poly.shape)


(24499, 1286)
(6124, 1286)


In [53]:
lr = LinearRegression()
lr.fit(train_poly , y_train)
print(f"train score : {lr.score(train_poly , y_train)}")
print(f"test score : {lr.score(test_poly , y_test)}")

train score : 0.9247809189180961
test score : 0.8586134747887589


In [54]:
# 테스트 세트
test_predict = lr.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.04646922291971118
mse for test set is 0.008993308504896436
rmse for test set is 0.09483305597151467


mae for train set is 0.03183356877724959
mse for train set is 0.003865048628480701
rmse for train set is 0.06216951526657338


## 릿지

In [55]:
ridge = Ridge()
ridge.fit(train_poly , y_train)

print(f"train score : {ridge.score(train_poly , y_train)}")
print(f"test score : {ridge.score(test_poly , y_test)}")

train score : 0.9076175478929601
test score : 0.9039611136363551


In [56]:
# 테스트 세트
test_predict = lr.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.04646922291971118
mse for test set is 0.008993308504896436
rmse for test set is 0.09483305597151467


mae for train set is 0.03183356877724959
mse for train set is 0.003865048628480701
rmse for train set is 0.06216951526657338


In [57]:
train_predict = lr.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(mae)
print(mse)
print(rmse)

0.03183356877724959
0.003865048628480701
0.06216951526657338


라쏘

In [58]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.0001)
lasso.fit(train_poly , y_train)
print(f"train score = {lasso.score(train_poly , y_train)}")
print(f"test score = {lasso.score(test_poly , y_test)}")

train score = 0.8916799414500155
test score = 0.8868284107134496


In [59]:
# 테스트 세트
test_predict = lasso.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lasso.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.04897836177003491
mse for test set is 0.007198613976280532
rmse for test set is 0.08484464612620252


mae for train set is 0.04030819432714047
mse for train set is 0.0055659054552886335
rmse for train set is 0.07460499618181501


# 수원 데이터

In [60]:
data = pd.read_csv('/content/drive/MyDrive/suwon_data.csv', encoding='cp949')
data = data.fillna(0)
print(data.info())                     # 데이터 셋 정보
print(data.head())                   # 처음 5개 샘플 보기

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30624 entries, 0 to 30623
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   기온      30624 non-null  float64
 1   풍속      30624 non-null  float64
 2   습도      30624 non-null  float64
 3   기압      30624 non-null  float64
 4   일조      30624 non-null  float64
 5   일사      30624 non-null  float64
 6   전운량     30624 non-null  float64
 7   발전량     30624 non-null  float64
dtypes: float64(8)
memory usage: 1.9 MB
None
    기온   풍속    습도      기압   일조  일사   전운량  발전량
0 -6.3  2.4  74.0  1028.6  0.0  0.0  9.0  0.0
1 -6.2  1.6  70.0  1028.2  0.0  0.0  6.0  0.0
2 -5.8  1.6  65.0  1028.3  0.0  0.0  7.0  0.0
3 -5.4  1.3  64.0  1028.3  0.0  0.0  9.0  0.0
4 -5.1  1.0  67.0  1027.6  0.0  0.0  9.0  0.0


In [61]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
data = np.array(data)

data = (data - np.min(data, 0)) / (np.max(data, 0) - np.min(data, 0))
print(data.shape)
print(data[0 : 5])
print(len(data) * 0.8)


(30624, 8)
[[0.21978022 0.24742268 0.70454545 0.99516254 0.         0.
  0.9        0.        ]
 [0.22161172 0.16494845 0.65909091 0.99477554 0.         0.
  0.6        0.        ]
 [0.22893773 0.16494845 0.60227273 0.99487229 0.         0.
  0.7        0.        ]
 [0.23626374 0.13402062 0.59090909 0.99487229 0.         0.
  0.9        0.        ]
 [0.24175824 0.10309278 0.625      0.99419505 0.         0.
  0.9        0.        ]]
24499.2


In [63]:
x_data = data[ : -1, : ]
y_data = data[1 : , -1]
print(x_data.shape)
print(y_data.shape)


(30623, 8)
(30623,)


In [64]:
# 데이터 셋 만들기
x_train = x_data[ : 24499 ,  : ]
y_train = y_data[ : 24499]

x_test = x_data[24499 : ,  : ]
y_test = y_data[24499 :]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(24499, 8)
(24499,)
(6124, 8)
(6124,)


## 다중 회귀

In [65]:
lr = LinearRegression()
lr.fit(x_train , y_train)

print(f"Train score is {lr.score(x_train , y_train)}")
print(f"Test Score is {lr.score(x_test , y_test)}")

Train score is 0.8551717538717083
Test Score is 0.8436807312209164


In [66]:
# 테스트 세트
test_predict = lr.predict(x_test)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr.predict(x_train)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.05086376630546506
mse for test set is 0.007012839647720558
rmse for test set is 0.08374269907114625


mae for train set is 0.04260189930757277
mse for train set is 0.006052628816752102
rmse for train set is 0.07779864276934464


## 특성 공학

### degree = 2

In [67]:
# degree = 2

poly = PolynomialFeatures(include_bias = False) # 특성 변환기 객체 , default degree = 2

poly.fit(x_train)

train_poly = poly.transform(x_train)
test_poly = poly.transform(x_test)

poly.get_feature_names_out() # 변환 특성 확인
print(train_poly.shape)
print(test_poly.shape)


(24499, 44)
(6124, 44)


In [68]:
lr1 = LinearRegression()
lr1.fit(train_poly , y_train)
print(f"train score : {lr1.score(train_poly , y_train)}")
print(f"test score : {lr1.score(test_poly , y_test)}")

train score : 0.867903819038744
test score : 0.8583150229104903


In [69]:
# 테스트 세트
test_predict = lr1.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr1.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.044170426587809175
mse for test set is 0.006356311877481371
rmse for test set is 0.0797264816574855


mae for train set is 0.03787475927461138
mse for train set is 0.005520533271947239
rmse for train set is 0.07430029119692089


### degree = 5

In [70]:
# degree 5
poly = PolynomialFeatures(degree = 5 , include_bias = False) # 특성 변환기 객체

poly.fit(x_train)

train_poly = poly.transform(x_train)
test_poly = poly.transform(x_test)

poly.get_feature_names_out() # 변환 특성 확인
print(train_poly.shape)
print(test_poly.shape)


(24499, 1286)
(6124, 1286)


In [71]:
lr = LinearRegression()
lr.fit(train_poly , y_train)
print(f"train score : {lr.score(train_poly , y_train)}")
print(f"test score : {lr.score(test_poly , y_test)}")

train score : 0.9029775669690199
test score : 0.8004826239523307


In [72]:
# 테스트 세트
test_predict = lr.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.0463675673226442
mse for test set is 0.008950805464255633
rmse for test set is 0.09460869655721736


mae for train set is 0.031121974065586488
mse for train set is 0.004054739249652462
rmse for train set is 0.06367683448203484


## 릿지

In [73]:
ridge = Ridge()
ridge.fit(train_poly , y_train)

print(f"train score : {ridge.score(train_poly , y_train)}")
print(f"test score : {ridge.score(test_poly , y_test)}")

train score : 0.8838487560559837
test score : 0.8688771226131586


In [74]:
# 테스트 세트
test_predict = lr.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lr.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.0463675673226442
mse for test set is 0.008950805464255633
rmse for test set is 0.09460869655721736


mae for train set is 0.031121974065586488
mse for train set is 0.004054739249652462
rmse for train set is 0.06367683448203484


In [75]:
train_predict = lr.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(mae)
print(mse)
print(rmse)

0.031121974065586488
0.004054739249652462
0.06367683448203484


## 라쏘

In [76]:
lasso = Lasso(alpha = 0.0001)
lasso.fit(train_poly , y_train)
print(f"train score = {lasso.score(train_poly , y_train)}")
print(f"test score = {lasso.score(test_poly , y_test)}")

train score = 0.8674020015208145
test score = 0.8572934601151303


  model = cd_fast.enet_coordinate_descent(


In [77]:
# 테스트 세트
test_predict = lasso.predict(test_poly)

mae = mean_absolute_error(y_test , test_predict)
mse = mean_squared_error(y_test , test_predict)
rmse = mse ** 0.5
print(f"mae for test set is {mae}")
print(f"mse for test set is {mse}")
print(f"rmse for test set is {rmse}")

print("\n")
# 훈련 세트
train_predict = lasso.predict(train_poly)

mae = mean_absolute_error(y_train , train_predict)
mse = mean_squared_error(y_train , train_predict)
rmse = mse ** 0.5

print(f"mae for train set is {mae}")
print(f"mse for train set is {mse}")
print(f"rmse for train set is {rmse}")

mae for test set is 0.044414170107962024
mse for test set is 0.006402141519149291
rmse for test set is 0.08001338337521599


mae for train set is 0.03725224487651596
mse for train set is 0.0055415051144639335
rmse for train set is 0.07444128635685934
