<h3>Tensor Flow를 활용한 데이터 회귀 분석</h3>

텐서플로우(TensorFlow)를 사용하여 주간 뉴스 기사 수 데이터를 기반으로 기후(기상) 변수를 예측하는 코드

In [1]:
# # Define the list of required packages (this whole cell is commented out / disabled)
# # NOTE(review): "scikit-learn" and "sklearn" look like the same library listed twice
# # (pip distribution name vs. import name) - confirm before re-enabling.
# packages = [
#     "numpy",
#     "pandas",
#     "tensorflow",
#     "scikit-learn",
#     "sklearn"
# ]

In [2]:
# # Install required packages (this whole cell is commented out / disabled)
# # For each entry the helper strips any "==version" suffix, tries to import the
# # remaining name, and falls back to `pip install` through the running interpreter.
# # NOTE(review): the pip name is passed to __import__ as-is, so an entry such as
# # "scikit-learn" (import name differs from pip name) presumably always takes the
# # install path - verify before re-enabling.
# import subprocess
# import sys

# def check_and_install_packages(package_list):
#     for package in package_list:
#         package_name = package.split("==")[0]
#         try:
#             __import__(package_name)
#             print(f"✔ '{package}' is already installed.")
#         except ImportError:
#             print(f"✘ '{package}' is not installed. Installing now...")
#             try:
#                 subprocess.check_call([sys.executable, "-m", "pip", "install", package])
#                 print(f"✔ '{package}' installed successfully.")
#             except subprocess.CalledProcessError:
#                 print(f"✘ Failed to install '{package}'. Please install it manually.")

# check_and_install_packages(packages)

In [3]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import LogCosh
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the merged dataset.
# The CSV path is built relative to the current working directory
# (<cwd>/../data_preprocessing/merged_final_data.csv), so the result depends
# on where the kernel was started.
datasPath = os.path.join(os.getcwd(), "..", "data_preprocessing", "merged_final_data.csv")
print(f"file path : {datasPath}")
data = pd.read_csv(datasPath, encoding='utf-8')

# Remove the Week_Num attribute; it is not used as a feature or a target.
data = data.drop(columns=['Week_Num'])

# Keep the remaining column names for reference and show them.
data_vars_list = list(data.columns)

print(data_vars_list)

file path : C:\Users\limul\Desktop\College\2_grade\2_semester\computational_statistics\teamProject\2024-2-CSP\AI\made_by_LJW\..\data_preprocessing\merged_final_data.csv
['datetime', 'Article_Num', 'Polution', 'Enviroment_Polution', 'Biodiversity_Loss', 'Acid_Rain', 'Water_Pollution', 'Climate_Crisis', 'Accelerated_Global_Warming', 'Ozone_Layer_Depletion', 'Hazardous_Substance_Leakage', 'Carbon_Dioxide', 'Weekly_News_Count', 'News_Ratio', 'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'windspeed', 'sealevelpressure', 'moonphase']


In [5]:
# Input (feature) and output (target) column definitions.
# Features are the date plus the news-count columns; targets are the weather columns.
X_columns = ['datetime', 'Article_Num', 'Polution', 'Enviroment_Polution', 'Biodiversity_Loss',
             'Acid_Rain', 'Water_Pollution', 'Climate_Crisis', 'Accelerated_Global_Warming',
             'Ozone_Layer_Depletion', 'Hazardous_Substance_Leakage', 'Carbon_Dioxide',
             'Weekly_News_Count', 'News_Ratio']
y_columns = ['tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 
             'windspeed', 'sealevelpressure', 'moonphase']


# Convert the string-valued datetime column to a Unix timestamp (seconds).
# The conversion is guarded so that re-running this cell is harmless: once the
# column already holds numbers, pd.to_datetime would interpret them as
# nanoseconds since the epoch and silently collapse every date to 1970-01-01.
if not pd.api.types.is_numeric_dtype(data['datetime']):
    data['datetime'] = pd.to_datetime(data['datetime'])  # parse strings into datetime objects
    data['datetime'] = data['datetime'].map(pd.Timestamp.timestamp)  # datetime -> Unix timestamp

# Disabled alternative kept for reference: MinMax-scale every column except
# datetime on the full dataset, then re-attach datetime.
# datetime_tmp = data['datetime']
# data = data.drop(columns='datetime')
# scaler = MinMaxScaler()
# scaled_data = scaler.fit_transform(data)
# scaled_data = pd.DataFrame(scaled_data, columns=data.columns)
# data = pd.concat([datetime_tmp, scaled_data], axis=1)
print(data.head())

# Split the frame into features and targets
X = data[X_columns]
y = data[y_columns]

# Train/test split (10% held out, fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Scaling: each scaler is fitted on the training split only and then applied
# to the test split, so test statistics never influence the fitted ranges.
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

X_test = x_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)


# Show the first few scaled rows as a sanity check
print("X_train")
print(X_train[:5])
print()
print("y_train")
print(y_train[:5])

      datetime  Article_Num  Polution  Enviroment_Polution  Biodiversity_Loss  \
0  946771200.0           10        12                    0                  0   
1  947376000.0           28        33                    0                  0   
2  947980800.0           16        14                    0                  0   
3  948585600.0           19        24                    0                  0   
4  949190400.0           10        16                    1                  0   

   Acid_Rain  Water_Pollution  Climate_Crisis  Accelerated_Global_Warming  \
0          5                0               0                           0   
1          4                0               0                           0   
2          1                0               0                           0   
3          2                0               0                           0   
4          3                0               0                           0   

   Ozone_Layer_Depletion  ...  News_Ratio    tempm

In [6]:
# Training configuration
epoch = 1000
lr = 0.0001
seed = 42  # same value as the random_state used for the train/test split

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable when the
# notebook is run top to bottom on a fresh kernel.
tf.keras.utils.set_random_seed(seed)

# Optimizer and loss function.
# Log-Cosh is used because of outliers: it behaves similarly to MAE but
# treats large errors more smoothly.
optimizer = Adam(learning_rate=lr)  # Adam optimizer; learning rate is `lr` (0.0001)
loss_function = LogCosh()

# Build the model: three ReLU hidden layers (128 -> 64 -> 32), each followed by
# 20% dropout, and a linear output layer.
model = tf.keras.Sequential([
    # Explicit Input object instead of `input_shape=` on the first Dense layer
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    Dropout(0.2),
    tf.keras.layers.Dense(len(y_columns))  # one output node per column in y_columns
])

# Compile the model; MAE is tracked as an additional metric
model.compile(optimizer=optimizer, loss=loss_function, metrics=['mae'])

In [7]:
# Train the model. validation_split=0.2 reserves 20% of the training arrays
# for validation; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epoch,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [8]:
# Evaluate the model on the test split; evaluate() returns the loss followed by
# the compiled metric (MAE), both computed on the values passed in here.
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}")

# Run predictions
y_pred = model.predict(X_test)

# Undo the target scaling with the scaler fitted for y_columns
y_test_inverse = y_scaler.inverse_transform(y_test)  # restore actual values
y_pred_inverse = y_scaler.inverse_transform(y_pred)  # restore predicted values

# Print predictions for the first (up to) five samples.
# Slicing with [:5] instead of indexing through range(5) avoids an IndexError
# when the test split contains fewer than five rows.
print("\nSample Predictions:")
for i, (true_values, predicted_values) in enumerate(zip(y_test_inverse[:5], y_pred_inverse[:5])):
    print(f"Sample {i+1}:")
    for attr, true_val, pred_val in zip(y_columns, true_values, predicted_values):
        print(f"  {attr:<18} | True: {true_val} | Predicted: {pred_val}")
    print()

Test Loss: 0.0230, Test MAE: 0.1713

Sample Predictions:
Sample 1:
  tempmax            | True: 44.111428571428576 | Predicted: 72.03106689453125
  tempmin            | True: 27.32 | Predicted: 56.58017349243164
  temp               | True: 35.75428571428572 | Predicted: 63.81404495239258
  dew                | True: -6.6000000000000005 | Predicted: 26.908414840698242
  humidity           | True: 54.657142857142865 | Predicted: 66.57965850830078
  precip             | True: 0.006571428571428501 | Predicted: 2.0579066276550293
  windspeed          | True: 13.457142857142854 | Predicted: 12.278884887695312
  sealevelpressure   | True: 1024.1285714285716 | Predicted: 1012.8400268554688
  moonphase          | True: 0.4957142857142857 | Predicted: 0.4993639886379242

Sample 2:
  tempmax            | True: 83.87142857142858 | Predicted: 67.19985961914062
  tempmin            | True: 66.85714285714286 | Predicted: 49.782100677490234
  temp               | True: 75.28571428571429 | Predicted: 

In [9]:
# # Disabled sanity check: round-trips y_test through a MinMaxScaler
# # (transform followed by inverse_transform) and prints original, scaled and restored values.
# # NOTE(review): if re-enabled, this rebinds X_train/X_test/y_train/y_test and y_scaler
# # using test_size=0.2, whereas the active split earlier in the notebook uses test_size=0.1.
# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create the scaler and fit it on y_train
# y_scaler = MinMaxScaler()
# y_train_scaled = y_scaler.fit_transform(y_train)  # scale using y_train as the reference

# # Transform y_test with the same scaler
# y_test_scaled = y_scaler.transform(y_test)

# # Restore y_test (inverse transform)
# y_test_restored = y_scaler.inverse_transform(y_test_scaled)

# # Print the results
# print("Original y_test:\n", y_test.head())
# print("\nScaled y_test:\n", y_test_scaled[:5])
# print("\nRestored y_test:\n", y_test_restored[:5])

Original y_test:
         tempmax    tempmin       temp        dew   humidity    precip  \
1208  44.111429  27.320000  35.754286  -6.600000  54.657143  0.006571   
868   83.871429  66.857143  75.285714  62.971429  67.585714  0.262857   
532   33.100000  16.571429  24.957143   5.871429  47.528571  0.000000   
344   80.671429  71.671429  75.971429  73.057143  91.214286  3.193571   
405   79.742857  65.285714  72.142857  66.442857  83.642857  0.721143   

      windspeed  sealevelpressure  moonphase  
1208  13.457143       1024.128571   0.495714  
868   12.457143       1004.971429   0.597143  
532   10.600000       1025.971429   0.665714  
344   15.185714       1006.728571   0.572857  
405   10.300000       1011.085714   0.195714  

Scaled y_test:
 [[2.84697373e-01 3.06382979e-01 2.88171635e-01 7.95996187e-02
  2.93580446e-01 1.41879409e-04 5.69633508e-01 7.37115634e-01
  5.00768049e-01]
 [8.14427103e-01 8.14013206e-01 8.13556104e-01 8.53352399e-01
  5.36663981e-01 5.67517635e-03 4.963350