In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# 라이브러리
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from tqdm import tqdm
import random

# TS test & plot
import statsmodels.graphics.tsaplots as sgt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm

# 모델 fitting
from scipy.stats import norm, t, f, chi2, cosine, alpha, beta, gamma, dgamma, dweibull, maxwell, pareto, fisk, expon, lognorm, ncx2, cauchy, wishart


# 모델 평가 지표
from sklearn.metrics import f1_score, recall_score, matthews_corrcoef, accuracy_score, precision_score, confusion_matrix

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 폰트 깨짐 방지
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

#!sudo apt-get install -y fonts-nanum
#!sudo fc-cache -fv
#!rm ~/.cache/matplotlib -rf

plt.rc('font', family='NanumBarunGothic')

#### 데이터 불러오기

In [4]:
data_iter = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/기상/기상보간iter.csv', encoding='UTF-8')
data_linear = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/기상/기상보간linear.csv', encoding='UTF-8')

In [5]:
data_iter_index = data_iter.set_index('일시').copy()
data_linear_index = data_linear.set_index('일시').copy()

In [6]:
data_pear = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/개화/배 개화 데이터.csv',encoding = 'cp949')
data_peach = pd.read_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/개화/복숭아 개화 데이터.csv',encoding = 'cp949')

In [None]:
set(data_peach['지점'].unique()) - set(data_pear['지점'].unique()) # 동일한 지역

set()

In [None]:
264460 - 246726

17734

#### 전처리

In [7]:
data_iter_fin = data_iter.copy()

In [8]:
data_iter_fin = data_iter[data_iter['지점명'].isin(data_pear['지점'])]

In [14]:
columns_to_keep = [
    '일 최심적설(cm)', '합계 3시간 신적설(cm)', '일강수량(mm)', '일 최심신적설(cm)', '평균기온(°C)',
    '최저기온(°C)', '최고기온(°C)', '최대 순간 풍속(m/s)', '최대 풍속(m/s)', '평균 풍속(m/s)', '평균 이슬점온도(°C)',
    '최소 상대습도(%)', '평균 상대습도(%)', '평균 증기압(hPa)', '평균 현지기압(hPa)', '합계 일조시간(hr)',
    '평균 전운량(1/10)', '평균 지면온도(°C)', '최저 초상온도(°C)'
]

data_iter_fin = data_iter_fin[columns_to_keep]

# 배

In [None]:
data_iter_fin['지점명'].unique()

In [None]:
place = ['평택', '남양주', '안성' , '영동' , '천안' , '아산' ,  '연기',  '예산', '나주', '영암',
          '김천', '상주', '진주', '울산']

In [None]:
'천안', '상주' ,'진주','울산'

In [None]:
영천시, 충주시, 경산시, 음성군, 청도군, 영동군, 이천시, 김천시, 상주시, 의성군, 옥천군, 세종시, 전주시, 원주시, 영덕군, 남원시, 괴산군, 춘천시, 임실군, 대구 동구

### 데이터 시뮬레이션 (전체 데이터에 대해)
- 분포 fitting
- 춀레스키 분해


In [8]:
def fit_distributions_with_seasonality(data, distributions, period=3):
    """
    Fit probability distributions to each variable in the input DataFrame after removing seasonality.

    Parameters:
        - data (pd.DataFrame): Input DataFrame containing the original data.
        - distributions (dict): Dictionary of distribution names and functions (e.g., {'norm': norm, 't': t}).
        - period (int): Seasonal decomposition period (default is 3).

    Returns:
        - pd.DataFrame: DataFrame containing fitted distribution parameters for each variable.
    """
    fit_results = pd.DataFrame(columns=['Variable', 'Distribution', 'Parameters'])


    for variable in tqdm(data.columns, desc="Fitting Distributions with Seasonality"):
        variable_data = data[variable]

        # Step 1: Seasonal Decomposition
        decomposition = seasonal_decompose(variable_data, model='additive', period=period)
        residual = variable_data - decomposition.trend - decomposition.seasonal
        residual = residual.dropna()  # Remove any NaN values

        # Step 2: Fit Distributions to Residuals
        distribution_fits = {}

        for distribution_name, distribution in distributions.items():
            params = distribution.fit(residual)
            distribution_fits[distribution_name] = {
                'Parameters': params
            }

        best_distribution = min(distribution_fits, key=lambda k: distribution_fits[k]['Parameters'][0])

        # Store the best-fitting distribution and its parameters in the results DataFrame
        fit_results = fit_results.append({
            'Variable': variable,
            'Distribution': best_distribution,
            'Parameters': distribution_fits[best_distribution]['Parameters']
        }, ignore_index=True)

        print(fit_results)

    return fit_results

In [9]:
def simulate_correlated_data_with_seasonality(data, fit_results, correlation_matrix, n_samples=1000, period=3):
    """
    Simulate correlated data based on fitted distributions, correlation matrix, and number of samples, preserving seasonality.

    Parameters:
        - data (pd.DataFrame): Input DataFrame containing the original data.
        - fit_results (pd.DataFrame): DataFrame with fitted distribution parameters for each variable.
        - correlation_matrix (pd.DataFrame): Correlation matrix for the input data.
        - n_samples (int): Number of samples to generate (default is 1000).
        - period (int): Seasonal decomposition period (default is 3).

    Returns:
        - pd.DataFrame: Simulated data with correlations and seasonality similar to the original dataset.
    """
    simulated_data = pd.DataFrame(columns=data.columns)
    cholesky_decomp = np.linalg.cholesky(correlation_matrix)

    for _ in tqdm(range(n_samples), desc="Simulating Data"):
        independent_samples = np.random.randn(len(data.columns))
        correlated_samples = np.dot(cholesky_decomp, independent_samples)
        simulated_row = {}

        for i, col in enumerate(data.columns):
            best_distribution = fit_results[fit_results['Variable'] == col]['Distribution'].values[0]
            params = fit_results[fit_results['Variable'] == col]['Parameters'].values[0]
            simulated_value = best_distribution(*params).rvs(size=1)

            # Adding back the seasonal component
            decomposition = seasonal_decompose(simulated_value, model='additive', period=period)
            simulated_value_with_seasonality = simulated_value + decomposition.seasonal.values[0]

            simulated_row[col] = simulated_value_with_seasonality

        simulated_data = simulated_data.append(simulated_row, ignore_index=True)

    return simulated_data

In [None]:
random.seed(108)

# Step 1: Calculate Correlation Matrix
correlation_matrix = data_iter_fin.corr()

# Step 2: Fit Distributions
distributions = {
    'norm': norm, 't': t, 'f': f, 'chi2': chi2, 'cosine': cosine, 'alpha': alpha, 'beta': beta, 'gamma': gamma, 'dgamma': dgamma,
    'dweibull': dweibull, 'pareto': pareto, 'fisk': fisk, 'expon': expon, 'lognorm': lognorm, 'ncx2': ncx2, 'cauchy': cauchy
}

fit_results = fit_distributions_with_seasonality(data_iter_fin, distributions)

In [25]:
def simulate_correlated_data_by_region(data, fit_results, correlation_matrix, n_samples = 1000, period=3):
    """
    Simulate correlated data by region based on fitted distributions, correlation matrix, and number of samples, preserving seasonality.

    Parameters:
        - data (pd.DataFrame): Input DataFrame containing the original data.
        - fit_results (pd.DataFrame): DataFrame with fitted distribution parameters for each variable.
        - correlation_matrix (pd.DataFrame): Correlation matrix for the input data.
        - n_samples (int): Number of samples to generate (default is 1000).
        - period (int): Seasonal decomposition period (default is 3).

    Returns:
        - dict: A dictionary where keys are region names and values are DataFrames of simulated data.
    """
    simulated_data_by_region = {}

    # Get the unique regions in the data
    unique_regions = data['지역'].unique()

    for region in unique_regions:
        region_data = data[data['지역'] == region]
        simulated_data = pd.DataFrame(columns=region_data.columns)
        cholesky_decomp = np.linalg.cholesky(correlation_matrix)

        for _ in tqdm(range(n_samples), desc=f"Simulating Data for {region}"):
            independent_samples = np.random.randn(len(region_data.columns))
            correlated_samples = np.dot(cholesky_decomp, independent_samples)
            simulated_row = {}

            for i, col in enumerate(region_data.columns):
                best_distribution = fit_results[fit_results['Variable'] == col]['Distribution'].values[0]
                params = fit_results[fit_results['Variable'] == col]['Parameters'].values[0]
                simulated_value = best_distribution(*params).rvs(size=1)

                # Adding back the seasonal component
                decomposition = seasonal_decompose(simulated_value, model='additive', period=period)
                simulated_value_with_seasonality = simulated_value + decomposition.seasonal.values[0]

                simulated_row[col] = simulated_value_with_seasonality

            simulated_data = simulated_data.append(simulated_row, ignore_index=True)

        simulated_data_by_region[region] = simulated_data

    return simulated_data_by_region

# Example usage:
# simulated_data_by_region = simulate_correlated_data_by_region(data_iter_fin, fit_results, correlation_matrix, n_samples)

In [None]:
simulated_data_by_region = simulate_correlated_data_by_region(data_iter_fin, fit_results, correlation_matrix, n_samples)

In [26]:
# Step 3: Simulation with Correlations
n_samples = 1000
simulated_data = simulate_multivariate_normal_data(data_iter_fin, fit_results, correlation_matrix, n_samples) # 여기서 뽑을 때 지역별로 뽑으면 될듯

NameError: ignored

In [11]:
# 다변량 정규분포에서 Simulation  -> 원리는 잘 모르겠으나 엄청 빠름

random.seed(2023)
from sklearn.covariance import LedoitWolf
from scipy.stats import multivariate_normal

unique_locations = data_iter['지점명'].unique()

# Store simulated data for each location in a dictionary
simulated_data_dict = {}

for location in tqdm(unique_locations, desc="Simulating Data by Location"):
    location_data = data_iter[data_iter['지점명'] == location]

    correlation_matrix = location_data.corr()

    # Fit a multivariate normal distribution to the data
    means = location_data.mean()
    covariance_estimator = LedoitWolf()
    covariance_matrix = covariance_estimator.fit(location_data.drop(columns=['일시', '지점명'])).covariance_

    # Define the multivariate normal distribution
    mvn = multivariate_normal(mean=means, cov=covariance_matrix)

    # Simulate data from the multivariate normal distribution
    n_samples = len(location_data)
    simulated_data = mvn.rvs(size=n_samples)

    simulated_data_df = pd.DataFrame(simulated_data, columns=location_data.columns.drop(['일시', '지점명']))
    simulated_data_dict[location] = simulated_data_df


Simulating Data by Location: 100%|██████████| 102/102 [00:12<00:00,  7.99it/s]


In [92]:
import random
import pandas as pd
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.stats import multivariate_normal
from statsmodels.tsa.seasonal import seasonal_decompose

def simulate_multivariate_normal_data(data, means_for_lowest_temp):
    """
    Simulate multivariate normal data based on the input DataFrame, with an option to manually set means for '최저기온(°C)'.

    Parameters:
        - data (pd.DataFrame): Input DataFrame containing the original data.
        - means_for_lowest_temp (pd.Series or None): Manually set means for '최저기온(°C)'. If None, means are computed from data.

    Returns:
        - dict: A dictionary where keys are location names and values are DataFrames of simulated data for each location.
    """
    random.seed(2023)
    unique_locations = data['지점명'].unique()

    # Store simulated data for each location in a dictionary
    simulated_data_dict = {}

    for location in tqdm(unique_locations, desc="Simulating Data by Location"):
        location_data = data[data['지점명'] == location]

        correlation_matrix = location_data.corr()

        if means_for_lowest_temp is None:
            # Compute means from data
            means = location_data.mean()
        else:
            # Use manually input means for '최저기온(°C)'
            means = location_data.mean()
            means['최저기온(°C)'] = means_for_lowest_temp[location]

        # Decompose original data to get seasonal and trend components
        original_time_series = location_data['일시']
        decomposition = seasonal_decompose(original_time_series, model='additive', period=365)  # Adjust period as needed
        seasonal_component = decomposition.seasonal.values
        trend_component = decomposition.trend.values

        # Calculate residuals
        residuals = location_data.drop(columns=['일시', '지점명']) - seasonal_component[:, np.newaxis] - trend_component[:, np.newaxis]

        # Fit a multivariate normal distribution to the residuals
        covariance_estimator = LedoitWolf()
        covariance_matrix = covariance_estimator.fit(residuals).covariance_

        # Define the multivariate normal distribution
        mvn = multivariate_normal(mean=means, cov=covariance_matrix)

        # Simulate data from the multivariate normal distribution
        n_samples = len(location_data)
        simulated_data = mvn.rvs(size=n_samples)

        # Add seasonal and trend components to the simulated data
        simulated_data_with_season_trend = simulated_data + seasonal_component + trend_component

        simulated_data_df = pd.DataFrame(simulated_data_with_season_trend, columns=location_data.columns.drop(['일시', '지점명']))
        simulated_data_dict[location] = simulated_data_df

    return simulated_data_dict


In [59]:
import random
import pandas as pd
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.stats import multivariate_normal
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.impute import SimpleImputer


def simulate_multivariate_normal_data(data, means_for_lowest_temp):
    """
    Simulate multivariate normal data based on the input DataFrame, with an option to manually set means for '최저기온(°C)'.

    Parameters:
        - data (pd.DataFrame): Input DataFrame containing the original data.
        - means_for_lowest_temp (pd.Series or None): Manually set means for '최저기온(°C)'. If None, means are computed from data.

    Returns:
        - dict: A dictionary where keys are location names and values are DataFrames of simulated data for each location.
    """
    random.seed(2023)
    unique_locations = data['지점명'].unique()
    simulated_data_dict = {}

    for location in tqdm(unique_locations, desc="Simulating Data by Location"):

        location_data = data[data['지점명'] == location]
        correlation_matrix = location_data.corr()

        if means_for_lowest_temp is None:
            means = location_data.mean() # Compute means from data
        else:
            # Use manually input means for '최저기온(°C)'
            means = location_data.mean()
            means['최저기온(°C)'] = means_for_lowest_temp[location]

        # Convert '일시' column to datetime format
        location_data['일시'] = pd.to_datetime(location_data['일시'])

        original_time_series = location_data.set_index('일시')['최저기온(°C)']
        decomposition = seasonal_decompose(original_time_series, model='additive', period=3)
        seasonal_component = decomposition.seasonal.values
        trend_component = decomposition.trend.values

        residuals = original_time_series - trend_component[:, np.newaxis] - seasonal_component[:, np.newaxis]
        residuals = residuals.dropna()  # Remove any NaN values

        # Fit a multivariate normal distribution to the residuals
        covariance_estimator = LedoitWolf()
        covariance_matrix = covariance_estimator.fit(residuals).covariance_

        # Define the multivariate normal distribution
        mvn = multivariate_normal(mean=means, cov=covariance_matrix)

        # Simulate data from the multivariate normal distribution
        n_samples = len(location_data)
        simulated_data = mvn.rvs(size=n_samples)

        # Add seasonal and trend components to the simulated data
        simulated_data_with_season_trend = simulated_data + seasonal_component + trend_component

        simulated_data_df = pd.DataFrame(simulated_data_with_season_trend, columns=location_data.columns.drop(['일시', '지점명', '최저기온(°C)']))
        simulated_data_dict[location] = simulated_data_df

    return simulated_data_dict


In [91]:
import random
import pandas as pd
import numpy as np
from sklearn.covariance import LedoitWolf
from scipy.stats import multivariate_normal
from statsmodels.tsa.seasonal import seasonal_decompose
from tqdm import tqdm

def simulate_multivariate_normal_data(data, means_for_lowest_temp=None):
    """
    Simulate multivariate normal data based on the input DataFrame, with an option to manually set means for '최저기온(°C)'.

    Parameters:
        - data (pd.DataFrame): Input DataFrame containing the original data.
        - means_for_lowest_temp (dict or None): Manually set means for '최저기온(°C)' for each location. If None, means are computed from data.

    Returns:
        - dict: A dictionary where keys are location names and values are DataFrames of simulated data for each location.
    """
    random.seed(2023)
    unique_locations = data['지점명'].unique()
    simulated_data_dict = {}

    for location in tqdm(unique_locations, desc="Simulating Data by Location"):

        location_data = data[data['지점명'] == location].drop(columns=['지점명'], axis=1)
        correlation_matrix = location_data.corr()

        if means_for_lowest_temp is None:
            means = location_data.mean() # Compute means from data
        else:
            # Use manually input means for '최저기온(°C)' if available, otherwise compute from data
            if location in means_for_lowest_temp:
                means = location_data.mean()
                means['최저기온(°C)'] = means_for_lowest_temp[location]
            else:
                means = location_data.mean()

        # Calculate residuals for each location
        n_samples = len(location_data)
        decomposition = seasonal_decompose(location_data['최저기온(°C)'], model='additive', period=3)
        seasonal_component = decomposition.seasonal.values.astype(float)[:n_samples]
        trend_component = decomposition.trend.values.astype(float)[:n_samples]
        residuals = location_data.drop(columns=['최저기온(°C)'], axis=1).astype(float) - trend_component[:, np.newaxis] - seasonal_component[:, np.newaxis]

        residuals = residuals.dropna()  # Remove any NaN values

        # Calculate the covariance matrix for the entire dataset
        covariance_estimator = LedoitWolf()
        covariance_matrix = covariance_estimator.fit(location_data).covariance_

        # Define the multivariate normal distribution
        mvn = multivariate_normal(mean=means, cov=covariance_matrix)

        # Simulate data from the multivariate normal distribution
        n_samples = len(location_data)
        simulated_data = mvn.rvs(size=n_samples)

        # Add seasonal and trend components to the simulated data
        simulated_data_with_season_trend = simulated_data + seasonal_component[:n_samples] + trend_component[:n_samples]

        # Create a DataFrame with '일시' and simulated data
        simulated_data_df = pd.DataFrame(simulated_data_with_season_trend, columns=location_data.columns.drop(['최저기온(°C)']))
        simulated_data_df['일시'] = location_data['일시'].values[:n_samples]  # Retain '일시' column

        simulated_data_dict[location] = simulated_data_df

    return simulated_data_dict

# Create a dictionary where keys are location names and values are the mean values for '최저기온(°C)'
means_for_lowest_temp = {
    'Location1': 1.0,
    'Location2': 2.0,
    # Add more locations and mean values as needed
}

# Example usage:
simulated_data_dict = simulate_multivariate_normal_data(data_iter.drop(['해양'], axis=1), means_for_lowest_temp)


Simulating Data by Location:   0%|          | 0/102 [00:00<?, ?it/s]


ValueError: ignored

In [77]:
# Create a dictionary where keys are location names and values are the same mean value
mean_value = 1  # Replace with your desired mean value
means_for_lowest_temp = {location: mean_value for location in data_iter['지점명'].unique()}

# Example usage:
simulated_data_dict = simulate_multivariate_normal_data(data_iter.drop(['일시'], axis = 1), means_for_lowest_temp)

Simulating Data by Location:   0%|          | 0/102 [00:00<?, ?it/s]


ValueError: ignored

In [39]:
data_itera = data_iter.drop(['일시', '해양'], axis=1)

In [None]:
simulated_data_dict