In [398]:
import json
import pandas as pd
import numpy as np

# JSON 파일 로드
with open('data/wildfire_data.json') as wf:
    data = json.load(wf)

# 데이터 가져오기
x_data = np.array(data['U']['data'])  # 2D 배열
y_data = np.array(data['v']['data']).reshape(-1, 1)  # 1D → 2D 변환

# 데이터 결합
combined_data = np.concatenate((x_data, y_data), axis=1)  # 2D + 2D 결합

# DataFrame 생성
columns = ['X_locate', 'Y_locate', 'Month', 'FFMC', 'Temperature', 'Wind', 'Rain', 'Area']
df = pd.DataFrame(data=combined_data, columns=columns)

# 출력
print(df)


     X_locate  Y_locate  Month  FFMC  Temperature  Wind  Rain      Area
0         6.0       3.0    9.0  88.6         20.6   1.8   0.0  0.000000
1         6.0       5.0    8.0  95.2         27.4   4.0   0.0  0.641854
2         3.0       4.0   12.0  85.4          4.6   8.5   0.0  2.462150
3         7.0       4.0    8.0  90.2         19.5   5.8   0.0  0.000000
4         5.0       4.0    9.0  94.3         20.1   4.9   0.0  0.900161
..        ...       ...    ...   ...          ...   ...   ...       ...
512       8.0       6.0    6.0  91.2         19.6   4.9   0.0  0.000000
513       2.0       4.0    8.0  81.6         21.9   5.8   0.0  4.012592
514       6.0       5.0    9.0  91.9         21.1   2.7   0.0  1.894617
515       3.0       4.0    3.0  88.1         15.8   7.6   0.0  0.000000
516       4.0       3.0    9.0  91.6          9.8   1.8   0.0  0.000000

[517 rows x 8 columns]


In [399]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

train_set = pd.DataFrame(train_set, columns=columns).reset_index(drop=True)
test_set = pd.DataFrame(test_set, columns=columns).reset_index(drop=True)

#feature와 y 분리
train_set_x = train_set.drop(columns=train_set.columns[-1])
test_set_x = test_set.drop(columns=test_set.columns[-1])
train_set_y = train_set.iloc[:, -1].values.reshape(-1, 1)
test_set_y = test_set.iloc[:, -1].values.reshape(-1, 1)

print(train_set_x.shape)
print(train_set_y.shape)
print(test_set_x.shape)
print(test_set_y.shape)

(413, 7)
(413, 1)
(104, 7)
(104, 1)


In [400]:
from sklearn.preprocessing import StandardScaler

cols_to_exclude = ['Month', 'Area']
cols_to_scale = [col for col in df.columns if col not in cols_to_exclude]

train_set_scaled = train_set_x.copy()
test_set_scaled = test_set_x.copy()

scaler = StandardScaler()
train_set_scaled[cols_to_scale] = scaler.fit_transform(train_set_x[cols_to_scale])
test_set_scaled[cols_to_scale] = scaler.transform(test_set_x[cols_to_scale])

print(train_set_scaled)
print(test_set_scaled)

     X_locate  Y_locate  Month       FFMC  Temperature      Wind      Rain
0   -0.274128  0.602856    1.0 -12.943816    -2.365455 -1.752069 -0.073328
1    1.454346  1.419183    8.0   0.261277     1.153681 -1.238348 -0.073328
2    1.454346  0.602856    8.0   0.441183     1.379044 -0.496307 -0.073328
3    0.590109  0.602856    9.0   0.045390     0.425584 -1.010027 -0.073328
4    0.157990 -0.213470    8.0   0.747023     0.252227 -0.724627 -0.073328
..        ...       ...    ...        ...          ...       ...       ...
408 -0.706247 -0.213470    9.0  -0.188487     0.858975 -0.210906 -0.073328
409 -0.274128 -0.213470    8.0   0.800995     1.708421  0.017414 -0.073328
410 -1.138366  0.602856    8.0   0.818986     0.928317 -0.724627 -0.073328
411 -1.570484 -0.213470    7.0   0.297259     0.789632  0.816536 -0.073328
412  0.590109 -1.029797    9.0   0.387212     0.009528  1.843977 -0.073328

[413 rows x 7 columns]
     X_locate  Y_locate  Month      FFMC  Temperature      Wind      Rain
0 

In [401]:
from sklearn.preprocessing import OneHotEncoder

month = list(range(1, 13))
column_names = [f'Month_{i}' for i in month]

encoder = OneHotEncoder(categories=[month], sparse_output=False)
train_month_encoded = encoder.fit_transform(train_set_scaled[['Month']])
test_month_encoded = encoder.transform(test_set_scaled[['Month']])

train_month_df = pd.DataFrame(train_month_encoded, columns=column_names)
test_month_df = pd.DataFrame(test_month_encoded, columns=column_names)

print(train_month_df)
print(test_month_df)

     Month_1  Month_2  Month_3  Month_4  Month_5  Month_6  Month_7  Month_8  \
0        1.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1        0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
2        0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
3        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
4        0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
..       ...      ...      ...      ...      ...      ...      ...      ...   
408      0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
409      0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
410      0.0      0.0      0.0      0.0      0.0      0.0      0.0      1.0   
411      0.0      0.0      0.0      0.0      0.0      0.0      1.0      0.0   
412      0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

     Month_9  Month_10  Month_11  Month_12  
0     

In [402]:
#Month 제외
train_set_scaled_month = train_set_scaled.drop(columns=['Month'])
test_set_scaled_month = test_set_scaled.drop(columns=['Month'])

train_data_set_x = pd.concat([train_set_scaled_month, train_month_df], axis=1)
test_data_set_x = pd.concat([test_set_scaled_month, test_month_df], axis=1)

train_data_ones = np.ones(len(train_data_set_x))
test_data_ones = np.ones(len(test_data_set_x))

train_data_ones = pd.DataFrame(data=train_data_ones, columns=['bias'])
test_data_ones = pd.DataFrame(data=test_data_ones, columns=['bias'])

train_data_set = pd.concat([train_data_set_x, train_data_ones], axis=1)
test_data_set = pd.concat([test_data_set_x, test_data_ones], axis=1)

print(train_data_set)
print(test_data_set)

     X_locate  Y_locate       FFMC  Temperature      Wind      Rain  Month_1  \
0   -0.274128  0.602856 -12.943816    -2.365455 -1.752069 -0.073328      1.0   
1    1.454346  1.419183   0.261277     1.153681 -1.238348 -0.073328      0.0   
2    1.454346  0.602856   0.441183     1.379044 -0.496307 -0.073328      0.0   
3    0.590109  0.602856   0.045390     0.425584 -1.010027 -0.073328      0.0   
4    0.157990 -0.213470   0.747023     0.252227 -0.724627 -0.073328      0.0   
..        ...       ...        ...          ...       ...       ...      ...   
408 -0.706247 -0.213470  -0.188487     0.858975 -0.210906 -0.073328      0.0   
409 -0.274128 -0.213470   0.800995     1.708421  0.017414 -0.073328      0.0   
410 -1.138366  0.602856   0.818986     0.928317 -0.724627 -0.073328      0.0   
411 -1.570484 -0.213470   0.297259     0.789632  0.816536 -0.073328      0.0   
412  0.590109 -1.029797   0.387212     0.009528  1.843977 -0.073328      0.0   

     Month_2  Month_3  Month_4  Month_5

In [403]:
def cost_function(x, y, weight = 1):
    y_hat = np.dot(x, weight)
    error = y - y_hat
    cost = np.sqrt(np.mean((error / x.shape[0]) ** 2, axis = 0))
    return cost

In [404]:
def RidgeRegression(x, y, weight, learning_rate=0.0001, lambdas=np.logspace(-1, 5, 100).reshape(-1, 1), iterations=50):
    cost = []
    lambdas = lambdas.T
    for i in range(iterations):
        y_hat = np.dot(x, weight)
        error = y - y_hat

        gradient = np.dot(x.T, error) - weight * lambdas

        weight += learning_rate * gradient

        if  i % 10 == 0:
            cost.append(cost_function(x, y, weight))
            print(f"Iteration: {i}, Error: {cost[-1]}")

    return cost[-1], weight, lambdas

In [405]:
train_weight = np.ones((train_data_set.shape[1], 100))

train_errors, train_weight, train_lambdas = RidgeRegression(train_data_set, train_set_y, train_weight)

min_index = np.argmin(train_errors)

fin_lambda = train_lambdas[0][np.argmin(train_errors)]
fin_weight = train_weight[:, np.argmin(train_errors)]
print(f"가장 작은 에러값: {np.min(train_errors)}")
print(f"가장 fit한 lambda: {fin_lambda}")
print(f"가장 fit한 weight: {fin_weight}")

Iteration: 0, Error: [0.00713654 0.00713653 0.00713651 0.0071365  0.00713648 0.00713647
 0.00713645 0.00713642 0.00713639 0.00713636 0.00713633 0.00713628
 0.00713623 0.00713618 0.00713612 0.00713604 0.00713596 0.00713586
 0.00713575 0.00713562 0.00713547 0.0071353  0.00713511 0.00713488
 0.00713463 0.00713433 0.00713399 0.0071336  0.00713315 0.00713263
 0.00713203 0.00713135 0.00713056 0.00712966 0.00712862 0.00712742
 0.00712605 0.00712447 0.00712265 0.00712056 0.00711816 0.0071154
 0.00711223 0.00710858 0.00710439 0.00709957 0.00709403 0.00708766
 0.00708034 0.00707193 0.00706227 0.00705116 0.0070384  0.00702375
 0.00700691 0.00698758 0.00696538 0.00693989 0.00691064 0.00687708
 0.00683859 0.00679446 0.00674389 0.00668598 0.0066197  0.00654392
 0.00645738 0.00635867 0.00624629 0.00611863 0.00597402 0.00581084
 0.00562767 0.00542353 0.00519838 0.00495389 0.00469475 0.00443094
 0.00418128 0.00397842 0.00387353 0.00393506 0.00423367 0.00481899
 0.00571071 0.00691135 0.00842335 0.010258

In [406]:
test_cost = cost_function(test_data_set, test_set_y, fin_weight)
print(test_cost[min_index])

0.013308249758721772


In [407]:
def sinusoidal_embedding(month):
    x = np.sin(2 * np.pi * month / 12)
    y = np.cos(2 * np.pi * month / 12)

    x_df = pd.DataFrame(x, columns=['Month_x'])
    y_df = pd.DataFrame(y, columns=['Month_y'])

    return x_df, y_df

In [408]:
train_si_scaled_x, train_si_scaled_y = sinusoidal_embedding(train_set_scaled['Month'])
test_si_scaled_x, test_si_scaled_y = sinusoidal_embedding(test_set_scaled['Month'])

train_set_si = train_set_scaled.copy()
test_set_si = test_set_scaled.copy()

month_pos = df.columns.get_loc('Month')

train_set_si.insert(month_pos, 'Month_x', train_si_scaled_x)
train_set_si.insert(month_pos + 1, 'Month_y', train_si_scaled_y)
test_set_si.insert(month_pos, 'Month_x', test_si_scaled_x)
test_set_si.insert(month_pos + 1, 'Month_y', test_si_scaled_y)

train_set_si.drop(['Month'], axis=1, inplace=True)
test_set_si.drop(['Month'], axis=1, inplace=True)

train_set_ones = pd.DataFrame(np.ones(len(train_set_si)), columns=['Bias'])
test_set_ones = pd.DataFrame(np.ones(len(test_set_si)), columns=['Bias'])

train_set_si = pd.concat([train_set_si, train_set_ones], axis=1)
test_set_si = pd.concat([test_set_si, test_set_ones], axis=1)

print(train_set_si)
print(test_set_si)

     X_locate  Y_locate Month_x Month_y       FFMC  Temperature      Wind  \
0   -0.274128  0.602856     NaN     NaN -12.943816    -2.365455 -1.752069   
1    1.454346  1.419183     NaN     NaN   0.261277     1.153681 -1.238348   
2    1.454346  0.602856     NaN     NaN   0.441183     1.379044 -0.496307   
3    0.590109  0.602856     NaN     NaN   0.045390     0.425584 -1.010027   
4    0.157990 -0.213470     NaN     NaN   0.747023     0.252227 -0.724627   
..        ...       ...     ...     ...        ...          ...       ...   
408 -0.706247 -0.213470     NaN     NaN  -0.188487     0.858975 -0.210906   
409 -0.274128 -0.213470     NaN     NaN   0.800995     1.708421  0.017414   
410 -1.138366  0.602856     NaN     NaN   0.818986     0.928317 -0.724627   
411 -1.570484 -0.213470     NaN     NaN   0.297259     0.789632  0.816536   
412  0.590109 -1.029797     NaN     NaN   0.387212     0.009528  1.843977   

         Rain  Bias  
0   -0.073328   1.0  
1   -0.073328   1.0  
2   -0.07

In [409]:
train_si_weight = np.ones((train_set_si.shape[1], 100))
test_si_weight = np.ones((test_set_si.shape[1], 100))

train_errors, train_weight, train_lambdas = RidgeRegression(train_data_set, train_set_y, train_weight)
fin_lambda = train_lambdas[0][np.argmin(train_errors)]
fin_weight = train_weight[:, np.argmin(train_errors)]
print(f"가장 작은 에러값: {np.min(train_errors)}")
print(f"가장 fit한 lambda: {fin_lambda}")
print(f"가장 fit한 weight: {fin_weight}")

Iteration: 0, Error: [3.39644248e-03 3.39643161e-03 3.39641912e-03 3.39640477e-03
 3.39638827e-03 3.39636932e-03 3.39634754e-03 3.39632252e-03
 3.39629378e-03 3.39626076e-03 3.39622285e-03 3.39617931e-03
 3.39612933e-03 3.39607196e-03 3.39600613e-03 3.39593061e-03
 3.39584401e-03 3.39574474e-03 3.39563101e-03 3.39550077e-03
 3.39535171e-03 3.39518125e-03 3.39498646e-03 3.39476408e-03
 3.39451050e-03 3.39422169e-03 3.39389326e-03 3.39352042e-03
 3.39309801e-03 3.39262058e-03 3.39208246e-03 3.39147792e-03
 3.39080140e-03 3.39004782e-03 3.38921304e-03 3.38829446e-03
 3.38729189e-03 3.38620859e-03 3.38505277e-03 3.38383935e-03
 3.38259230e-03 3.38134748e-03 3.38015606e-03 3.37908858e-03
 3.37823956e-03 3.37773253e-03 3.37772526e-03 3.37841459e-03
 3.38004023e-03 3.38288661e-03 3.38728131e-03 3.39358884e-03
 3.40219832e-03 3.41350395e-03 3.42787819e-03 3.44563883e-03
 3.46701310e-03 3.49210373e-03 3.52086338e-03 3.55308390e-03
 3.58840485e-03 3.62634176e-03 3.66632956e-03 3.70777181e-03
 3.

In [410]:
test_cost = cost_function(test_data_set, test_set_y, fin_weight)
print(test_cost[min_index])

0.013210122850806303
