In [747]:
import json
import pandas as pd
import numpy as np

# JSON 파일 로드
with open('data/wildfire_data.json') as wf:
    data = json.load(wf)

# 데이터 가져오기
x_data = np.array(data['U']['data'])  # 2D 배열
y_data = np.array(data['v']['data']).reshape(-1, 1)  # 1D → 2D 변환

# 데이터 결합
combined_data = np.concatenate((x_data, y_data), axis=1)  # 2D + 2D 결합

# DataFrame 생성
columns = ['X_locate', 'Y_locate', 'Month', 'FFMC', 'Temperature', 'Wind', 'Rain', 'Area']
df = pd.DataFrame(data=combined_data, columns=columns)

# 출력
print(df)


     X_locate  Y_locate  Month  FFMC  Temperature  Wind  Rain      Area
0         6.0       3.0    9.0  88.6         20.6   1.8   0.0  0.000000
1         6.0       5.0    8.0  95.2         27.4   4.0   0.0  0.641854
2         3.0       4.0   12.0  85.4          4.6   8.5   0.0  2.462150
3         7.0       4.0    8.0  90.2         19.5   5.8   0.0  0.000000
4         5.0       4.0    9.0  94.3         20.1   4.9   0.0  0.900161
..        ...       ...    ...   ...          ...   ...   ...       ...
512       8.0       6.0    6.0  91.2         19.6   4.9   0.0  0.000000
513       2.0       4.0    8.0  81.6         21.9   5.8   0.0  4.012592
514       6.0       5.0    9.0  91.9         21.1   2.7   0.0  1.894617
515       3.0       4.0    3.0  88.1         15.8   7.6   0.0  0.000000
516       4.0       3.0    9.0  91.6          9.8   1.8   0.0  0.000000

[517 rows x 8 columns]


In [748]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

train_set = pd.DataFrame(train_set, columns=columns)
test_set = pd.DataFrame(test_set, columns=columns)

train_set_x = train_set.drop(columns=train_set.columns[-1])  # 마지막 열 제외
test_set_x = test_set.drop(columns=test_set.columns[-1])
train_set_y = train_set.iloc[:, -1].values.reshape(-1, 1)
test_set_y = test_set.iloc[:, -1].values.reshape(-1, 1)

print(train_set_x.shape)
print(train_set_y.shape)
print(test_set_x.shape)
print(test_set_y.shape)

(413, 7)
(413, 1)
(104, 7)
(104, 1)


In [749]:
from sklearn.preprocessing import StandardScaler

cols_to_exclude = ['Month', 'Area']
cols_to_scale = [col for col in df.columns if col not in cols_to_exclude]

train_set_scaled = train_set_x.copy().reset_index(drop=True)
test_set_scaled = test_set_x.copy().reset_index(drop=True)

scaler = StandardScaler()
train_set_scaled[cols_to_scale] = scaler.fit_transform(train_set_x[cols_to_scale])
test_set_scaled[cols_to_scale] = scaler.transform(test_set_x[cols_to_scale])

print(train_set_scaled)
print(test_set_scaled)

     X_locate  Y_locate  Month       FFMC  Temperature      Wind      Rain
0   -0.274128  0.602856    1.0 -12.943816    -2.365455 -1.752069 -0.073328
1    1.454346  1.419183    8.0   0.261277     1.153681 -1.238348 -0.073328
2    1.454346  0.602856    8.0   0.441183     1.379044 -0.496307 -0.073328
3    0.590109  0.602856    9.0   0.045390     0.425584 -1.010027 -0.073328
4    0.157990 -0.213470    8.0   0.747023     0.252227 -0.724627 -0.073328
..        ...       ...    ...        ...          ...       ...       ...
408 -0.706247 -0.213470    9.0  -0.188487     0.858975 -0.210906 -0.073328
409 -0.274128 -0.213470    8.0   0.800995     1.708421  0.017414 -0.073328
410 -1.138366  0.602856    8.0   0.818986     0.928317 -0.724627 -0.073328
411 -1.570484 -0.213470    7.0   0.297259     0.789632  0.816536 -0.073328
412  0.590109 -1.029797    9.0   0.387212     0.009528  1.843977 -0.073328

[413 rows x 7 columns]
     X_locate  Y_locate  Month      FFMC  Temperature      Wind      Rain
0 

In [750]:
from sklearn.preprocessing import OneHotEncoder

month = list(range(1, 13))
column_names = [f'Month_{i}' for i in month]

encoder = OneHotEncoder(categories=[month], sparse_output=False)
train_month_encoded = encoder.fit_transform(train_set_scaled[['Month']])
test_month_encoded = encoder.transform(test_set_scaled[['Month']])

train_month_df = pd.DataFrame(train_month_encoded, columns=column_names)
test_month_df = pd.DataFrame(test_month_encoded, columns=column_names)

train_set_scaled_month = train_set_scaled.drop(columns=['Month'])
test_set_scaled_month = test_set_scaled.drop(columns=['Month'])

print(train_month_df)
print(test_month_df)

     Month_1  Month_2  Month_3  Month_4  Month_5  Month_6  Month_7  Month_8  \
0        1.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1        0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
2        0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
3        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
4        0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
..       ...      ...      ...      ...      ...      ...      ...      ...   
408      0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
409      0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
410      0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
411      0.0      0.0      0.0      0.0      0.0      0.0      1.0      0.0   
412      0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

     Month_9  Month_10  Month_11  Month_12  
0     

In [751]:
train_data_set_x = pd.concat([train_set_scaled_month, train_month_df], ignore_index=True, axis=1)
test_data_set_x = pd.concat([test_set_scaled_month, test_month_df], ignore_index=True, axis=1)

train_data_ones = np.ones(len(train_data_set_x))
test_data_ones = np.ones(len(test_data_set_x))

train_data_ones = pd.DataFrame(data=train_data_ones, columns=['bias'])
test_data_ones = pd.DataFrame(data=test_data_ones, columns=['bias'])

train_data_set = pd.concat([train_data_set_x, train_data_ones], ignore_index=True, axis=1)
test_data_set = pd.concat([test_data_set_x, test_data_ones], ignore_index=True, axis=1)

print(train_data_set)
print(test_data_set)

           0         1          2         3         4         5    6    7   \
0   -0.274128  0.602856 -12.943816 -2.365455 -1.752069 -0.073328  1.0  0.0   
1    1.454346  1.419183   0.261277  1.153681 -1.238348 -0.073328  0.0  0.0   
2    1.454346  0.602856   0.441183  1.379044 -0.496307 -0.073328  0.0  0.0   
3    0.590109  0.602856   0.045390  0.425584 -1.010027 -0.073328  0.0  0.0   
4    0.157990 -0.213470   0.747023  0.252227 -0.724627 -0.073328  0.0  0.0   
..        ...       ...        ...       ...       ...       ...  ...  ...   
408 -0.706247 -0.213470  -0.188487  0.858975 -0.210906 -0.073328  0.0  0.0   
409 -0.274128 -0.213470   0.800995  1.708421  0.017414 -0.073328  0.0  0.0   
410 -1.138366  0.602856   0.818986  0.928317 -0.724627 -0.073328  0.0  0.0   
411 -1.570484 -0.213470   0.297259  0.789632  0.816536 -0.073328  0.0  0.0   
412  0.590109 -1.029797   0.387212  0.009528  1.843977 -0.073328  0.0  0.0   

      8    9    10   11   12   13   14   15   16   17   18  
0 

In [752]:
def RidgeRegression(x, y, weight, learning_rate=0.001, lambdas=np.logspace(-1, 5, 100).reshape(1, -1), iterations=10):
    cost = []
    lambdas = np.repeat(lambdas, x.shape[1], axis=0)
    for i in range(iterations):
        y_hat = np.dot(x, weight)
        error = y_hat - y
        gradient = np.dot(x.T, error) + weight * lambdas

        weight -= learning_rate * gradient

        cost = np.sqrt(np.mean(error ** 2, axis=0) / x.shape[0])

        #print(f"Iteration: {i}, Error: {cost}")

    return cost, weight, lambdas

In [753]:
train_weight = np.ones((train_data_set.shape[1], 100))

train_errors, train_weight, train_lambdas = RidgeRegression(train_data_set, train_set_y, train_weight)
fin_lambda = train_lambdas[0][np.argmin(train_errors)]
fin_weight = train_weight[0][np.argmin(train_errors)]
print(f"가장 작은 에러값: {np.min(train_errors)}")
print(f"가장 fit한 lambda: {fin_lambda}")
print(f"가장 fit한 weight: {fin_weight}")

가장 작은 에러값: 0.06822482892918835
가장 fit한 lambda: 15.199110829529332
가장 fit한 weight: 0.10952311587189952


In [754]:
def test_cost_function(x, y, weight):
    y_hat = np.dot(x, weight)
    error = y_hat - y
    cost = np.sqrt((error ** 2).mean() / x.shape[0])
    return cost

In [755]:
test_cost = test_cost_function(test_data_set, test_set_y, fin_weight)
print(test_cost)

0.16591825392068343


In [756]:
def sinusoidal_embedding(month):
    x = np.sin(2 * np.pi * month / 12)
    y = np.cos(2 * np.pi * month / 12)

    x_df = x.to_frame(name='Month_x')
    y_df = y.to_frame(name='Month_y')

    return x_df, y_df

In [757]:
train_si_scaled_x, train_si_scaled_y = sinusoidal_embedding(train_set_scaled['Month'])
test_si_scaled_x, test_si_scaled_y = sinusoidal_embedding(test_set_scaled['Month'])

train_set_si = train_set_scaled.copy()
test_set_si = test_set_scaled.copy()

month_pos = df.columns.get_loc('Month')

train_set_si.insert(month_pos, 'Month_x', train_si_scaled_x)
train_set_si.insert(month_pos + 1, 'Month_y', train_si_scaled_y)
test_set_si.insert(month_pos, 'Month_x', test_si_scaled_x)
test_set_si.insert(month_pos + 1, 'Month_y', test_si_scaled_y)

train_set_si.drop(['Month'], axis=1, inplace=True)
test_set_si.drop(['Month'], axis=1, inplace=True)

train_set_ones = pd.DataFrame(np.ones(len(train_set_si)), columns=['Bias'])
test_set_ones = pd.DataFrame(np.ones(len(test_set_si)), columns=['Bias'])

train_set_si = pd.concat([train_set_si, train_set_ones], axis=1)
test_set_si = pd.concat([test_set_si, test_set_ones], axis=1)

print(train_set_si)
print(test_set_si)

     X_locate  Y_locate   Month_x       Month_y       FFMC  Temperature  \
0   -0.274128  0.602856  0.500000  8.660254e-01 -12.943816    -2.365455   
1    1.454346  1.419183 -0.866025 -5.000000e-01   0.261277     1.153681   
2    1.454346  0.602856 -0.866025 -5.000000e-01   0.441183     1.379044   
3    0.590109  0.602856 -1.000000 -1.836970e-16   0.045390     0.425584   
4    0.157990 -0.213470 -0.866025 -5.000000e-01   0.747023     0.252227   
..        ...       ...       ...           ...        ...          ...   
408 -0.706247 -0.213470 -1.000000 -1.836970e-16  -0.188487     0.858975   
409 -0.274128 -0.213470 -0.866025 -5.000000e-01   0.800995     1.708421   
410 -1.138366  0.602856 -0.866025 -5.000000e-01   0.818986     0.928317   
411 -1.570484 -0.213470 -0.500000 -8.660254e-01   0.297259     0.789632   
412  0.590109 -1.029797 -1.000000 -1.836970e-16   0.387212     0.009528   

         Wind      Rain  Bias  
0   -1.752069 -0.073328   1.0  
1   -1.238348 -0.073328   1.0  
2  

In [758]:
train_si_weight = np.ones((train_set_si.shape[1], 100))
test_si_weight = np.ones((test_set_si.shape[1], 100))

train_errors, train_weight, train_lambdas = RidgeRegression(train_data_set, train_set_y, train_weight)
fin_lambda = train_lambdas[0][np.argmin(train_errors)]
fin_weight = train_weight[0][np.argmin(train_errors)]
print(f"가장 작은 에러값: {np.min(train_errors)}")
print(f"가장 fit한 lambda: {fin_lambda}")
print(f"가장 fit한 weight: {fin_weight}")

가장 작은 에러값: 0.06804206996336985
가장 fit한 lambda: 3.7649358067924674
가장 fit한 weight: 0.12625506095359504


In [759]:
test_cost = test_cost_function(test_data_set, test_set_y, fin_weight)
print(test_cost)

0.16582510908493872
