In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# 라이브러리
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from tqdm import tqdm
import random

# TS test & plot
import statsmodels.graphics.tsaplots as sgt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm

# 모델 fitting
from tqdm import tqdm
from sklearn.covariance import LedoitWolf
from scipy.stats import multivariate_normal

# Warnings
import warnings
warnings.filterwarnings('ignore')

#### 데이터 불러오기

In [None]:
data_iter = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/기상/기상보간iter.csv', encoding='UTF-8')
data_linear = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/기상/기상보간linear.csv', encoding='UTF-8')

In [None]:
data_iter_index = data_iter.set_index('일시').copy()
data_linear_index = data_linear.set_index('일시').copy()

In [None]:
data_pear = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/개화/배 개화 데이터.csv',encoding = 'cp949')
data_peach = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/개화/복숭아 개화 데이터.csv',encoding = 'cp949')

#### 전처리

In [None]:
data_iter_fin = data_iter.copy()
data_linear_index_fin = data_linear_index.copy()

In [None]:
data_iter_fin = data_iter[data_iter['지점명'].isin(data_pear['지점'])]
data_linear_index_fin = data_linear_index[data_linear_index['지점명'].isin(data_pear['지점'])]

In [None]:
columns_to_keep = [
    '일시','지점명','일 최심적설(cm)', '합계 3시간 신적설(cm)', '일강수량(mm)', '일 최심신적설(cm)', '평균기온(°C)',
    '최저기온(°C)', '최고기온(°C)', '최대 순간 풍속(m/s)', '최대 풍속(m/s)', '평균 풍속(m/s)', '평균 이슬점온도(°C)',
    '최소 상대습도(%)', '평균 상대습도(%)', '평균 증기압(hPa)', '평균 현지기압(hPa)', '합계 일조시간(hr)',
    '평균 전운량(1/10)', '평균 지면온도(°C)', '최저 초상온도(°C)'
]

data_iter_fin = data_iter_fin[columns_to_keep]
#data_linear_index_fin = data_linear_index_fin[columns_to_keep]

In [None]:
data_iter_fin.지점명.unique()

array(['속초', '철원', '대관령', '춘천', '강릉', '서울', '인천', '원주', '울릉도', '수원', '충주',
       '서산', '울진', '청주', '대전', '추풍령', '안동', '포항', '군산', '대구', '전주', '울산',
       '창원', '광주', '부산', '통영', '목포', '여수', '완도', '제주', '고산', '성산', '서귀포',
       '진주', '강화', '양평', '이천', '인제', '홍천', '삼척', '태백', '제천', '보은', '천안',
       '보령', '부여', '금산', '부안', '임실', '정읍', '남원', '장수', '주암', '장흥', '해남',
       '고흥', '성산포', '봉화', '영주', '문경', '영덕', '의성', '구미', '영천', '거창', '합천',
       '밀양', '산청', '거제', '남해', '동해', '무안', '동두천', '영월', '흑산도', '파주',
       '백령도', '상주', '진도(첨찰산)', '북강릉', '고창', '순천', '고창군', '북춘천', '대구(기)',
       '홍성'], dtype=object)

#### 모델링

In [None]:
# 월 구분 없이
def simulate_multivariate_data_all_month(data , n_samples):
    random.seed(2023)

    unique_locations = data['지점명'].unique()
    simulated_data_dict = {}

    for location in tqdm(unique_locations, desc="Simulating Data by Location"):
        location_data = data[data['지점명'] == location]

        # Initialize a list to store residuals for each variable
        residuals_list = []

        for column in location_data.columns.drop(['지점명']):
            decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
            residuals = decomposition.resid
            residuals = residuals.dropna()
            residuals_list.append(residuals)

        all_residuals = pd.concat(residuals_list, axis=1)

        # Fit a multivariate normal distribution to the combined residuals
        means = all_residuals.mean()
        covariance_estimator = LedoitWolf()
        covariance_matrix = covariance_estimator.fit(all_residuals.values).covariance_

        # Define the multivariate normal distribution
        mvn = multivariate_normal(mean = means, cov = covariance_matrix)
        simulated_data = mvn.rvs(size = n_samples)

        # Create a DataFrame for the simulated data for this location
        simulated_location_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['지점명']))

        # Add back trend and seasonality to obtain simulated data for each variable
        for column in location_data.columns.drop(['지점명']):
            decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
            simulated_location_data_df[column] += decomposition.seasonal.values[:n_samples] + \
                decomposition.trend.values[:n_samples]

        # Add the '지점명' column back to the DataFrame
        simulated_location_data_df['지점명'] = location
        simulated_location_data_df = simulated_location_data_df.dropna()

        simulated_data_dict[location] = simulated_location_data_df

    return simulated_data_dict

In [None]:
simulated_data_dict = simulate_multivariate_data_all_month(data_iter_fin.drop(['일시'],axis=1), 11) # 월하는 거보다 하나 더 넣어야함
simulated_data_dict

Simulating Data by Location: 100%|██████████| 86/86 [00:11<00:00,  7.76it/s]


{'속초':     일 최심적설(cm)  합계 3시간 신적설(cm)  일강수량(mm)  일 최심신적설(cm)  평균기온(°C)   최저기온(°C)  \
 1    -0.004163        0.135975  2.962407    -0.038219 -3.145459  -7.038291   
 2     0.064235       -1.793057  0.066137    -1.637647 -3.328829  -7.004410   
 3     2.417812        2.812059  2.316177     2.973130 -2.901958  -8.401620   
 4     4.104500        2.885796  1.686721     3.046317 -1.204540  -3.612627   
 5     3.359558        2.726147  1.994342     3.003833 -1.317858  -4.023068   
 6     2.700458        1.942242  6.759952     1.597739 -4.715364 -10.301048   
 7     1.602048        0.997013  2.744728     1.025730 -5.214525  -7.730810   
 8    -3.392823       -3.144476 -6.266542    -2.592744  0.798038  -4.728555   
 9    -0.332825       -0.969178 -0.291464    -0.967095 -3.041345  -6.386182   
 10   -1.071572       -3.168570 -6.912474    -3.074016  3.028680  -2.277940   
 
     최고기온(°C)  최대 순간 풍속(m/s)  최대 풍속(m/s)  평균 풍속(m/s)  평균 이슬점온도(°C)  최소 상대습도(%)  \
 1   2.560038      24.265738   13.414766 

In [None]:
from tqdm import tqdm
import statsmodels.api as sm
from sklearn.covariance import LedoitWolf
from scipy.stats import multivariate_normal

def simulate_multivariate_data_custom_month(data, n_samples):
    random.seed(2023)

    unique_locations = data['지점명'].unique()
    unique_months = data['일시'].str.split('-', expand=True)[1].unique()
    simulated_data_dict = {}

    for month in tqdm(unique_months, desc="Simulating Data by Month"):
        for location in unique_locations:
            location_data = data[(data['지점명'] == location) & (data['일시'].str.split('-', expand=True)[1] == month)]

            # Initialize a list to store residuals for each variable
            residuals_list = []

            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                residuals = decomposition.resid
                residuals = residuals.dropna()
                residuals_list.append(residuals)

            all_residuals = pd.concat(residuals_list, axis=1)

            # Fit a multivariate normal distribution to the combined residuals
            means = all_residuals.mean()
            covariance_estimator = LedoitWolf()
            covariance_matrix = covariance_estimator.fit(all_residuals.values).covariance_

            # Define the multivariate normal distribution
            mvn = multivariate_normal(mean=means, cov=covariance_matrix)
            simulated_data = mvn.rvs(size=n_samples)

            # Create a DataFrame for the simulated data for this location and month
            simulated_location_month_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['지점명', '일시']))

            # Add back trend and seasonality to obtain simulated data for each variable
            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                simulated_location_month_data_df[column] += decomposition.seasonal.values[:len(simulated_location_month_data_df)] + \
                    decomposition.trend.values[:len(simulated_location_month_data_df)]

            # Add the '지점명' column back to the DataFrame
            simulated_location_month_data_df['지점명'] = location
            simulated_location_month_data_df['month'] = month
            simulated_location_month_data_df = simulated_location_month_data_df.dropna()

            # Store the simulated data in the dictionary
            key = f"{location}_{month}"
            simulated_data_dict[key] = simulated_location_month_data_df

    return simulated_data_dict

# Usage:
n_samples = 30  # Specify the number of samples
simulated_data_dict_monthly = simulate_multivariate_data_custom_month(data_iter_fin, n_samples)

Simulating Data by Month: 100%|██████████| 3/3 [04:21<00:00, 87.19s/it]


In [None]:
# Customize input
def simulate_multivariate_data_custom_month(data, n_samples):
    random.seed(2023)

    unique_locations = data['지점명'].unique()
    unique_months = data['일시'].str.split('-', expand=True)[1].unique()
    simulated_data_dict = {}

    for month in tqdm(unique_months, desc="Simulating Data by Month"):
        for location in unique_locations:
            location_data = data[(data['지점명'] == location) & (data['일시'].str.split('-', expand=True)[1] == month)]

            # Initialize a list to store residuals for each variable
            residuals_list = []

            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                residuals = decomposition.resid
                residuals = residuals.dropna()
                residuals_list.append(residuals)

            all_residuals = pd.concat(residuals_list, axis=1)

            # Fit a multivariate normal distribution to the combined residuals
            means = all_residuals.mean()
            covariance_estimator = LedoitWolf()
            covariance_matrix = covariance_estimator.fit(all_residuals.values).covariance_

            # Define the multivariate normal distribution
            mvn = multivariate_normal(mean=means, cov=covariance_matrix)
            simulated_data = mvn.rvs(size=n_samples)

            # Create a DataFrame for the simulated data for this location and month
            simulated_location_month_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['지점명', '일시']))

            # Add back trend and seasonality to obtain simulated data for each variable
            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                simulated_location_month_data_df[column] += decomposition.seasonal.values[:len(simulated_location_month_data_df)] + \
                    decomposition.trend.values[:len(simulated_location_month_data_df)]

            # Add the '지점명' column back to the DataFrame
            simulated_location_month_data_df['지점명'] = location
            simulated_location_month_data_df['month'] = month
            simulated_location_month_data_df = simulated_location_month_data_df.dropna()

            # Store the simulated data in the dictionary
            key = f"{location}_{month}"
            simulated_data_dict[key] = simulated_location_month_data_df

    return simulated_data_dict

# Usage:
n_samples = 31  # Specify the number of samples
simulated_data_dict_monthly = simulate_multivariate_data_custom_month(data_iter_fin, n_samples)

In [None]:
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import multivariate_normal
from sklearn.covariance import LedoitWolf
import statsmodels.api as sm

def simulate_multivariate_data_custom_month(data, n_samples):
    random.seed(2023)

    unique_locations = data['지점명'].unique()
    unique_months = data['일시'].str.split('-', expand=True)[1].unique()
    simulated_data_dict = {}

    for month in tqdm(unique_months, desc="Simulating Data by Month"):
        for location in unique_locations:
            location_data = data[(data['지점명'] == location) & (data['일시'].str.split('-', expand=True)[1] == month)]

            # Initialize a list to store residuals for each variable
            residuals_list = []

            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                residuals = decomposition.resid
                residuals = residuals.dropna()
                residuals_list.append(residuals)

            all_residuals = pd.concat(residuals_list, axis = 1)

            # Fit a multivariate normal distribution to the combined residuals
            means = all_residuals.mean()
            covariance_estimator = LedoitWolf()
            covariance_matrix = covariance_estimator.fit(all_residuals.values).covariance_

            # Define the multivariate normal distribution
            mvn = multivariate_normal(mean=means, cov=covariance_matrix)
            simulated_data = mvn.rvs(size=n_samples)

            # Create a DataFrame for the simulated data for this location and month
            simulated_location_month_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['지점명', '일시']))

            # Customize the '최저기온' variable
            original_mean = location_data['최저기온(°C)'].mean()
            added_value = 0.2
            for i in range(n_samples):
                if added_value <= 1.0:
                    simulated_location_month_data_df['최저기온'][i] += original_mean + added_value
                    added_value += 0.2

            # Add back trend and seasonality to obtain simulated data for each variable
            for column in location_data.columns.drop(['지점명', '일시', '최저기온']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                simulated_location_month_data_df[column] += decomposition.seasonal.values[:len(simulated_location_month_data_df)] + \
                    decomposition.trend.values[:len(simulated_location_month_data_df)]

            # Add the '지점명' column back to the DataFrame
            simulated_location_month_data_df['지점명'] = location
            simulated_location_month_data_df['month'] = month
            simulated_location_month_data_df = simulated_location_month_data_df.dropna()

            # Store the simulated data in the dictionary
            key = f"{location}_{month}"
            simulated_data_dict[key] = simulated_location_month_data_df

    return simulated_data_dict

# Usage:
n_samples = 31  # Specify the number of samples
simulated_data_dict_monthly = simulate_multivariate_data_custom_month(data_iter_fin, n_samples)
simulated_data_dict_monthly

In [None]:
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import multivariate_normal
from sklearn.covariance import LedoitWolf
import statsmodels.api as sm

def simulate_multivariate_data_custom_month(data, n_samples):
    random.seed(2023)

    unique_locations = data['지점명'].unique()
    unique_months = data['일시'].str.split('-', expand=True)[1].unique()
    simulated_data_dict = {}

    mean_temp_counter = 1  # Initialize the mean temperature counter

    for month in tqdm(unique_months, desc="Simulating Data by Month"):
        for location in unique_locations:
            location_data = data[(data['지점명'] == location) & (data['일시'].str.split('-', expand=True)[1] == month)]

            # Initialize a list to store residuals for each variable
            residuals_list = []

            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                residuals = decomposition.resid
                residuals = residuals.dropna()
                residuals_list.append(residuals)

            all_residuals = pd.concat(residuals_list, axis=1)

            # Find the column index of '최저기온(°C)'
            column_index = location_data.columns.get_loc('최저기온(°C)')

            # Fit a multivariate normal distribution to the combined residuals
            means = all_residuals.mean()
            covariance_estimator = LedoitWolf()
            covariance_matrix = covariance_estimator.fit(all_residuals.values).covariance_

            # Define the multivariate normal distribution
            mvn = multivariate_normal(mean=means, cov=covariance_matrix)
            simulated_data = mvn.rvs(size=n_samples)

            # Create a DataFrame for the simulated data for this location and month
            simulated_location_month_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['지점명', '일시']))

            # Customize the '최저기온(°C)' variable based on residuals' mean
            original_residual_mean = all_residuals.iloc[:, column_index].mean()
            added_residual_value = 0.2
            for i in range(n_samples):
                if added_residual_value <= 1.0:
                    simulated_location_month_data_df.iloc[i, column_index] += original_residual_mean + added_residual_value
                    added_residual_value += 0.2

                # Add back trend and seasonality to obtain simulated data for each variable
                for column in location_data.columns.drop(['지점명', '일시', '최저기온(°C)']):
                    decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                    simulated_location_month_data_df[column] += decomposition.seasonal.values[:len(simulated_location_month_data_df)] + \
                        decomposition.trend.values[:len(simulated_location_month_data_df)]

                # Add the '지점명' column back to the DataFrame
                simulated_location_month_data_df['지점명'] = location
                simulated_location_month_data_df['month'] = month
                simulated_location_month_data_df = simulated_location_month_data_df.dropna()

                # Store the simulated data in the dictionary with MeanTemp counter
                key = f"{location}_{month}_MeanTemp{mean_temp_counter}"
                simulated_data_dict[key] = simulated_location_month_data_df

                # Increment the MeanTemp counter
                mean_temp_counter += 1

    return simulated_data_dict

# Usage:
n_samples = 31  # Specify the number of samples
simulated_data_dict_monthly = simulate_multivariate_data_custom_month(data_iter_fin, n_samples)


In [44]:
from scipy.stats import multivariate_normal
from sklearn.covariance import LedoitWolf
import statsmodels.api as sm

def simulate_multivariate_data_custom_month(data, n_samples):
    random.seed(2023)

    unique_locations = data['지점명'].unique()
    unique_months = data['일시'].str.split('-', expand=True)[1].unique()
    simulated_data_dict = {}

    for month in tqdm(unique_months, desc="Simulating Data by Month"):
        for location in unique_locations:
            location_data = data[(data['지점명'] == location) & (data['일시'].str.split('-', expand=True)[1] == month)]

            # Initialize a list to store residuals for each variable
            residuals_list = []

            for column in location_data.columns.drop(['지점명', '일시']):
                decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                residuals = decomposition.resid
                residuals = residuals.dropna()
                residuals_list.append(residuals)

            all_residuals = pd.concat(residuals_list, axis=1)

            # Find the column index of '최저기온(°C)'
            column_index = location_data.columns.get_loc('최저기온(°C)')

            # Customize the '최저기온(°C)' variable based on residuals' mean
            original_residual_mean = all_residuals.iloc[:, column_index].mean()
            customized_temp = original_residual_mean

            # Create a dictionary key based on MeanTemp value
            for i in range(n_samples):
                if customized_temp <= original_residual_mean + 1.0:
                    counter = i + 1  # Increment the counter for each temperature
                    key = f"{location}_{month}_MeanTemp{counter}"

                    # Fit a multivariate normal distribution to the combined residuals
                    means = all_residuals.mean()
                    covariance_estimator = LedoitWolf()
                    covariance_matrix = covariance_estimator.fit(all_residuals.values).covariance_

                    # Define the multivariate normal distribution
                    mvn = multivariate_normal(mean=means, cov=covariance_matrix)
                    simulated_data = mvn.rvs(size=(n_samples,))  # Generate 1 sample for each case

                    # Create a DataFrame for the simulated data for this location and month
                    simulated_location_month_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['지점명', '일시']))

                    # Add back trend and seasonality to obtain simulated data for each variable
                    for column in location_data.columns.drop(['지점명', '일시', '최저기온(°C)']):
                        decomposition = sm.tsa.seasonal_decompose(location_data[column], model='additive', period=3)
                        simulated_location_month_data_df[column] += decomposition.seasonal.values[:len(simulated_location_month_data_df)] + \
                            decomposition.trend.values[:len(simulated_location_month_data_df)]

                    # Customize the '최저기온(°C)' variable
                    simulated_location_month_data_df['최저기온(°C)'] += customized_temp

                    # Add the '지점명' column back to the DataFrame
                    simulated_location_month_data_df['지점명'] = location
                    simulated_location_month_data_df['month'] = month
                    simulated_location_month_data_df['counter'] = counter

                    # Store the simulated data in the dictionary
                    simulated_data_dict[key] = simulated_location_month_data_df

                    # Increment the customized temperature by 0.2
                    customized_temp += 0.2

    return simulated_data_dict

# Usage:
n_samples = 33
simulated_data_dict_monthly = simulate_multivariate_data_custom_month(data_iter_fin, n_samples)

Simulating Data by Month: 100%|██████████| 3/3 [05:55<00:00, 118.58s/it]


In [48]:
simulated_data_dict_monthly