In [12]:
import joblib
import warnings
import numpy as np
import pandas as pd
import utils.plot as custom_plt
from scipy import stats
from matplotlib import pyplot as plt
from sklearn.mixture import GaussianMixture
from IPython.core.interactiveshell import InteractiveShell

In [13]:
plt.rc('font', family='GULIM')
warnings.filterwarnings(action='ignore')
InteractiveShell.ast_node_interactivity = "all"

In [14]:
df = pd.read_csv('Database/rainfall_train.csv', index_col=0)
df_test = pd.read_csv('Database/rainfall_test.csv', index_col=0)

In [15]:
mask = df['rainfall_train.class_interval'] == -999
df = df[~mask]

In [None]:
custom_plt.plot_continuous_variable(df, 'rainfall_train.vv', 'kde', True, True)

In [None]:
for i in range(20):
    aws = df['rainfall_train.stn4contest'].unique().tolist()
    mask = (df['rainfall_train.fc_month'] == 5) & (df['rainfall_train.stn4contest'] == f'{aws[i]}')
    df_boxcox = pd.Series(stats.boxcox(df[mask]['rainfall_train.vv'] + 0.000001)[0])
    df_log = pd.DataFrame(df[mask]['rainfall_train.vv']).applymap(lambda x: np.log(x + 1))
    print(f'{aws[i]}')
    print(df_boxcox.skew())

In [20]:
X_test = pd.read_csv('Database/test/X_self_test_norm.csv', index_col=0)

In [21]:
gmm = GaussianMixture(n_components=15, max_iter=1000, init_params='k-means++', random_state=42)
gmm.fit(X_test)

In [22]:
joblib.dump(gmm, 'Database/gmm_model.pkl')

['Database/gmm_model.pkl']

In [26]:
# BIC 스코어를 저장할 리스트를 초기화합니다.
bic_scores = []

# 클러스터 개수의 범위를 설정합니다.
cluster_range = range(1, 21)

# 각 클러스터 개수에 대해 GMM을 적합시키고 BIC 스코어를 계산합니다.
for n_clusters in cluster_range:
    gmm = GaussianMixture(n_components=n_clusters, init_params='k-means++', random_state=42)
    gmm.fit(X_test)
    bic = gmm.bic(X_test)
    bic_scores.append(bic)
    print(f'Cluster count: {n_clusters}, BIC: {bic}')

# 최적의 클러스터 개수를 선택합니다.
optimal_clusters = cluster_range[np.argmin(bic_scores)]
print(f'Optimal number of clusters: {optimal_clusters}')

# BIC 스코어를 시각화합니다.
plt.figure(figsize=(8, 6))
plt.plot(cluster_range, bic_scores, marker='o')
plt.title('BIC Scores for Different Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC Score')
plt.show()

Cluster count: 1, BIC: -6948157.661413293


Cluster count: 2, BIC: -15693136.880720615


Cluster count: 3, BIC: -32816152.903879322


Cluster count: 4, BIC: -38166143.44337646


Cluster count: 5, BIC: -38592416.364277005


Cluster count: 6, BIC: -41744356.122960724


Cluster count: 7, BIC: -41725500.002784654


Cluster count: 8, BIC: -44312861.70784456


Cluster count: 9, BIC: -46101188.09087579


Cluster count: 10, BIC: -46651826.61049681


Cluster count: 11, BIC: -49124357.87284247


Cluster count: 12, BIC: -50185832.82612471


Cluster count: 13, BIC: -50545050.61573802


Cluster count: 14, BIC: -50576877.759511635



KeyboardInterrupt

