In [146]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Flatten, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Use a Korean-capable font so Hangul text in plots does not render as broken glyphs
plt.rcParams['font.family'] = 'NanumGothic' 

In [147]:
'''
다 함께 하는 작업 공간이기 때문에 경로나 파일 명 등을 통일 해야 할 것 같습니다.
그렇게 하지 않으면 경로 때문에 매번 새로운 커밋이 생성 됩니다.
'''



# Show the current working directory
current_directory = os.getcwd()
print("Current directory:", current_directory)

# Directory that holds the CSV. Each collaborator can point the notebook at
# their own copy by setting the K_PROJECT_DATA_DIR environment variable, so the
# notebook no longer has to be edited (and re-committed) per machine. The
# original shared path remains the default, so existing setups keep working.
target_directory = os.path.abspath(
    os.environ.get("K_PROJECT_DATA_DIR", "C:/k-project/raw_datasets/국가")
)
os.chdir(target_directory)  # kept so any later relative paths resolve as before

# Load the CSV through an explicit path rather than relying on the working
# directory. The file name must be identical for every collaborator.
csv_filename = "World_Data.csv"
df = pd.read_csv(os.path.join(target_directory, csv_filename))


df.head()

Current directory: C:\k-project\raw_datasets\국가


Unnamed: 0,COUNTRY,Composite_Indicators,1980,1981,1982,1983,1984,1985,1986,1987,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,World,Biomass and waste (billion kWh),17.725897,17.888914,27.192153,28.180857,31.165802,31.65251,34.259224,38.151013,...,407.145946,445.637929,479.654452,507.591876,547.176101,582.604904,601.204395,632.151593,637.50517,639.96902
1,World,CO2 emissions (MMtonnes CO2),18719.16226,18345.18264,18298.31656,18486.86527,19614.01552,20038.50142,20520.66734,21169.27187,...,34397.05219,34819.96544,34869.28878,34810.52097,34630.11177,35003.04482,35617.66427,35655.03069,33679.78865,35462.72411
2,World,Coal (quad Btu),78.694561,79.018341,80.468585,82.629641,86.399083,89.446988,90.460451,94.053384,...,171.364294,173.359051,172.359968,167.54428,163.234002,164.109245,165.807331,164.191431,158.756977,166.720467
3,World,Coal and coke (MMtonnes CO2),7491.61815,7518.337871,7651.370707,7855.808639,8212.132553,8503.411904,8601.810772,8936.353445,...,16274.36294,16460.02908,16363.96638,15989.43298,15580.29052,15628.00181,15827.46674,15741.55202,15251.92078,15966.05733
4,World,Consumed natural gas (MMtonnes CO2),2842.337165,2843.874298,2854.973114,2913.476944,3214.469613,3349.5177,3395.116049,3571.676242,...,6662.05692,6759.120316,6783.647503,6883.268117,7023.602021,7241.122517,7601.449578,7726.427466,7577.037734,7948.649507


In [148]:
# List the distinct indicator names found in the 'Composite_Indicators' column
df['Composite_Indicators'].unique()

array(['Biomass and waste (billion kWh)', 'CO2 emissions (MMtonnes CO2)',
       'Coal (quad Btu)', 'Coal and coke (MMtonnes CO2)',
       'Consumed natural gas (MMtonnes CO2)', 'Consumption (quad Btu)',
       'Fossil fuels (billion kWh)', 'GDP', 'Generation (billion kWh)',
       'Geothermal (billion kWh)',
       'Hydroelectric pumped storage (billion kWh)',
       'Hydroelectricity (billion kWh)', 'Natural gas (quad Btu)',
       'Non-hydroelectric renewables (billion kWh)',
       'Nuclear (billion kWh)', 'Nuclear (quad Btu)',
       'Nuclear, renewables, and other (quad Btu)',
       'Petroleum and other liquids (MMtonnes CO2)',
       'Petroleum and other liquids (quad Btu)', 'Population',
       'Renewables (billion kWh)', 'Renewables and other (quad Btu)',
       'Solar (billion kWh)',
       'Solar, tide, wave, fuel cell (billion kWh)',
       'Tide and wave (billion kWh)', 'Wind (billion kWh)'], dtype=object)

In [149]:
# Keep only the rows whose indicator is total CO2 emissions
seq = df.loc[df['Composite_Indicators'].eq('CO2 emissions (MMtonnes CO2)')]


In [150]:
# Inspect the column labels of the filtered frame
seq.columns

Index(['COUNTRY', 'Composite_Indicators', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
       '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021'],
      dtype='object')

In [151]:
# Remove the two label columns so only the per-year value columns remain
seq = seq.drop(columns=['Composite_Indicators', 'COUNTRY'])
seq

Unnamed: 0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
1,18719.16226,18345.18264,18298.31656,18486.86527,19614.01552,20038.50142,20520.66734,21169.27187,21846.14316,22164.92996,...,34397.05219,34819.96544,34869.28878,34810.52097,34630.11177,35003.04482,35617.66427,35655.03069,33679.78865,35462.72411


In [152]:
# Flip the frame so each year becomes a row (one value column)
seq = seq.transpose()
seq

Unnamed: 0,1
1980,18719.16226
1981,18345.18264
1982,18298.31656
1983,18486.86527
1984,19614.01552
1985,20038.50142
1986,20520.66734
1987,21169.27187
1988,21846.14316
1989,22164.92996


In [153]:
# Inspect the column labels of seq at this point
seq.columns

Int64Index([1], dtype='int64')

In [154]:
# Sequence-splitting helper
def split_sequences(sequences, n_steps_in, n_steps_out):
    """Cut a sequence into sliding (input window, output window) pairs.

    Window ``i`` uses rows ``i .. i + n_steps_in - 1`` as input and the
    following ``n_steps_out`` rows as the target. Windows advance one row at a
    time and only complete windows are produced.

    Parameters
    ----------
    sequences : array-like
        Ordered observations along the first axis (list, ndarray, Series or
        DataFrame). It is converted with ``np.asarray`` once up front.
    n_steps_in : int
        Number of consecutive rows in each input window (must be >= 1).
    n_steps_out : int
        Number of consecutive rows in each target window (must be >= 1).

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` has shape ``(n_windows, n_steps_in, *row_shape)`` and ``y`` has
        shape ``(n_windows, n_steps_out, *row_shape)``. When the input is too
        short for a single window, both arrays have ``n_windows == 0`` but keep
        the remaining dimensions, so downstream ``.shape[1]`` access still works.

    Raises
    ------
    ValueError
        If ``n_steps_in`` or ``n_steps_out`` is smaller than 1.
    """
    if n_steps_in < 1 or n_steps_out < 1:
        raise ValueError("n_steps_in and n_steps_out must be positive integers")

    # Convert once: slicing an ndarray is positional and avoids building a
    # pandas object for every window.
    data = np.asarray(sequences)
    n_windows = len(data) - n_steps_in - n_steps_out + 1

    if n_windows <= 0:
        # Not enough rows for even one window: return empty, correctly shaped arrays.
        row_shape = data.shape[1:]
        return (
            np.empty((0, n_steps_in) + row_shape, dtype=data.dtype),
            np.empty((0, n_steps_out) + row_shape, dtype=data.dtype),
        )

    X = np.stack([data[i:i + n_steps_in] for i in range(n_windows)])
    y = np.stack(
        [data[i + n_steps_in:i + n_steps_in + n_steps_out] for i in range(n_windows)]
    )
    return X, y

# Window configuration: 3 consecutive rows in, 1 following row out
n_steps_in = 3
n_steps_out = 1
X, y = split_sequences(seq, n_steps_in=n_steps_in, n_steps_out=n_steps_out)

In [155]:
# Show the input windows built by split_sequences
# NOTE(review): this prints the entire array; X.shape plus a short slice may be enough
print(X)

[[[18719.16226]
  [18345.18264]
  [18298.31656]]

 [[18345.18264]
  [18298.31656]
  [18486.86527]]

 [[18298.31656]
  [18486.86527]
  [19614.01552]]

 [[18486.86527]
  [19614.01552]
  [20038.50142]]

 [[19614.01552]
  [20038.50142]
  [20520.66734]]

 [[20038.50142]
  [20520.66734]
  [21169.27187]]

 [[20520.66734]
  [21169.27187]
  [21846.14316]]

 [[21169.27187]
  [21846.14316]
  [22164.92996]]

 [[21846.14316]
  [22164.92996]
  [22145.41295]]

 [[22164.92996]
  [22145.41295]
  [21970.53907]]

 [[22145.41295]
  [21970.53907]
  [21756.10964]]

 [[21970.53907]
  [21756.10964]
  [21883.66645]]

 [[21756.10964]
  [21883.66645]
  [22052.3184 ]]

 [[21883.66645]
  [22052.3184 ]
  [22542.116  ]]

 [[22052.3184 ]
  [22542.116  ]
  [23034.64935]]

 [[22542.116  ]
  [23034.64935]
  [23103.44865]]

 [[23034.64935]
  [23103.44865]
  [23150.37062]]

 [[23103.44865]
  [23150.37062]
  [23492.72702]]

 [[23150.37062]
  [23492.72702]
  [24249.34033]]

 [[23492.72702]
  [24249.34033]
  [24460.77055]]



In [156]:
# Show the target windows built by split_sequences
# NOTE(review): this prints the entire array; y.shape plus a short slice may be enough
print(y)

[[[18486.86527]]

 [[19614.01552]]

 [[20038.50142]]

 [[20520.66734]]

 [[21169.27187]]

 [[21846.14316]]

 [[22164.92996]]

 [[22145.41295]]

 [[21970.53907]]

 [[21756.10964]]

 [[21883.66645]]

 [[22052.3184 ]]

 [[22542.116  ]]

 [[23034.64935]]

 [[23103.44865]]

 [[23150.37062]]

 [[23492.72702]]

 [[24249.34033]]

 [[24460.77055]]

 [[25004.8652 ]]

 [[26148.25371]]

 [[27588.74155]]

 [[28674.60336]]

 [[29620.55968]]

 [[30070.39408]]

 [[30778.78774]]

 [[30600.03669]]

 [[32501.91555]]

 [[33612.62686]]

 [[34397.05219]]

 [[34819.96544]]

 [[34869.28878]]

 [[34810.52097]]

 [[34630.11177]]

 [[35003.04482]]

 [[35617.66427]]

 [[35655.03069]]

 [[33679.78865]]

 [[35462.72411]]]


In [157]:
# Fix TensorFlow's global seed so weight initialisation and dropout are repeatable
tf.random.set_seed(42)

# Split the data and reshape it into the layout the LSTM expects.
# The windows come from a time series, so the split is chronological
# (shuffle=False): the last 20% of windows form the test set. A shuffled split
# would place windows from later years in the training set and leak the future
# into training, making the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# Make the (samples, timesteps, 1 feature) layout explicit
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build and train the LSTM model
model = Sequential()
model.add(LSTM(200, activation='relu', return_sequences=True, input_shape=(n_steps_in, 1)))  # first LSTM layer with 200 units
model.add(Dropout(0.2))  # 20% dropout layer
model.add(LSTM(200, activation='relu'))

model.add(Dense(n_steps_out))
model.compile(optimizer='adam', loss='mse')

# # Early stopping (currently disabled)
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model; validation_split holds out the last 10% of the training samples
model.fit(X_train, y_train, epochs=500, verbose=0, validation_split=0.1)


<keras.callbacks.History at 0x25caa2e5d08>

In [158]:
# Predict on the test inputs (result evaluation is not performed in this cell)
y_pred = model.predict(X_test)

