<a href="https://colab.research.google.com/github/hojuna/Black-box_Optimization_dacon/blob/main/KMeans_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import math

def calculate_standard_deviation(data):
    """Return the population standard deviation of ``data``.

    The variance is divided by the number of values (not ``n - 1``), so this
    is the population rather than the sample standard deviation.

    Args:
        data: A sized iterable of numbers (it is iterated twice and ``len``
            is taken).

    Returns:
        The standard deviation as a float, or ``None`` when ``data`` is empty.
    """
    count = len(data)

    # Nothing to measure: signal that with None instead of dividing by zero.
    if count == 0:
        return None

    center = sum(data) / count
    squared_deviations = [(value - center) ** 2 for value in data]

    return math.sqrt(sum(squared_deviations) / count)


In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data, model and submission paths
# used later in this notebook all live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Reduce Memory Usage
# reference : https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN

def reduce_memory_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes, in place, and return it.

    Per column:
      * ``object`` columns are converted to ``category``.
      * Signed and unsigned NumPy integer columns are cast to the smallest
        integer type of the same signedness that holds the column's min and
        max (bounds inclusive, so the cast is lossless).
      * NumPy float columns are cast to float16 (if ``use_float16``) or
        float32 when min and max lie strictly inside that type's finite range.
        This check is about range only: float16 keeps roughly three
        significant decimal digits, so values are rounded.
      * Every other dtype (bool, datetime, timedelta, category, pandas
        extension dtypes, ...) is left untouched.

    A column is never cast to a type that is not strictly smaller than its
    current one.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: When True (default, the historical behaviour) floats may
            be stored as float16. Pass False to stop at float32 and keep more
            precision.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, smallest first. They are built inside the function
    # because the notebook imports numpy in a later cell than this definition.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        dtype = df[col].dtype

        if dtype.name == 'object':
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int / uint / float columns are downcast. Dispatching on
        # the dtype kind keeps bool, unsigned, datetime and timedelta columns out
        # of the float branch, where they used to be turned into floats or
        # compared against float limits.
        if not isinstance(dtype, np.dtype) or dtype.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if dtype.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif dtype.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        else:
            candidates, limits_of = float_types, np.finfo

        for candidate in candidates:
            # Candidates are ordered by size; once they stop being smaller than
            # the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= dtype.itemsize:
                break

            limits = limits_of(candidate)
            if dtype.kind == 'f':
                # Strict bounds, as before. NaN min/max (empty or all-NaN
                # column) compares False and leaves the column unchanged.
                fits = c_min > limits.min and c_max < limits.max
            else:
                fits = limits.min <= c_min and c_max <= limits.max

            if fits:
                df[col] = df[col].astype(candidate)
                break

    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df

In [4]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.model_selection import RandomizedSearchCV


# Load the train and test CSV files from the mounted Google Drive.
# Both paths are absolute Colab paths, so the Drive mount must have succeeded first.
train_df_org = pd.read_csv('/content/drive/MyDrive/dacon/2024_8_data/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/dacon/2024_8_data/test.csv')

In [5]:
# Downcast both frames to smaller dtypes. reduce_memory_usage assigns the
# converted columns back into the frame it receives and returns that same frame,
# so the *_reduced names refer to the same objects as train_df_org / test_df.
train_df_org_reduced = reduce_memory_usage(train_df_org)
test_df_reduced = reduce_memory_usage(test_df)

Memory usage of dataframe is 3.98 MB
Memory usage became:  2.38531494140625  MB
Memory usage of dataframe is 0.46 MB
Memory usage became:  0.27829742431640625  MB


In [28]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Feature columns used for clustering. Naming them explicitly (instead of
# dropping 'ID' / 'y') keeps this cell idempotent: on a re-run the 'cluster'
# column added below is not fed back in as a feature.
feature_cols = [f'x_{i}' for i in range(11)]

# Standardise the features (KMeans is distance based, so scale matters).
# The scaler is fitted on the training data only and then applied to the test data.
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(train_df_org_reduced[feature_cols])
scaled_test_data = scaler.transform(test_df_reduced[feature_cols])

# KMeans clustering with 4 clusters. n_init is set explicitly to the value the
# library was already using by default, which silences the FutureWarning.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(scaled_train_data)

# Cluster label of every training row.
labels = kmeans.labels_

# Attach the cluster labels. The test rows are assigned to the clusters learned
# from the training data with predict(); fit_predict() would re-fit the
# clustering on the test set and produce label ids unrelated to the training
# clusters that the per-cluster models are trained on.
train_df_org_reduced = train_df_org_reduced.assign(cluster=labels)
test_df_reduced = test_df_reduced.assign(cluster=kmeans.predict(scaled_test_data))

# Fixed column order: ID, features, cluster, y. Later cells slice by position
# (iloc[:, 1:-2]) and depend on this layout.
train_df_org_reduced = train_df_org_reduced[['ID', *feature_cols, 'cluster', 'y']]



  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# NOTE(review): this whole cell is disabled. It refers to `df`, which is not
# defined in the visible cells, and the output stored under this cell shows
# columns this code would not produce from the current frames.

# # Print the centre value (centroid) of each cluster
# centroids = kmeans.cluster_centers_
# centroids_df = pd.DataFrame(scaler.inverse_transform(centroids), columns=df.columns[1:-1])
# print("Cluster Centroids:")
# print(centroids_df)

# # Print summary statistics for each cluster
# # cluster_summary = df.groupby('cluster').mean()
# # print("\nCluster Summary:")
# # print(cluster_summary)


Cluster Centroids:
        x_0       x_1       x_2       x_3       x_4       x_5       x_6  \
0  1.064014 -2.342666  1.199317  0.852455 -0.265161 -1.791455  0.489350   
1  0.989746 -1.709217  1.210965  0.927032 -0.322668 -1.720050  0.484691   

        x_7       x_8       x_9      x_10          y  correlation_std_dev  
0 -0.183727  0.669396  0.270829  0.088290  82.704386              0.49471  
1 -0.112604  0.595645  0.352332  0.044149  86.992931              0.49471  


In [9]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce the scaled training features to two principal components for plotting.
# random_state pins the projection for the case where scikit-learn picks its
# randomized SVD solver, so the figure is reproducible between runs.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_train_data)

# Put the projection and the KMeans labels side by side (same row order).
df_pca = pd.DataFrame(pca_result, columns=['pca1', 'pca2'])
df_pca['cluster'] = labels

# One scatter series per cluster. The labels are sorted so that the legend order
# and colour of each cluster do not depend on the row order.
fig, ax = plt.subplots(figsize=(8, 6))
for cluster in sorted(df_pca['cluster'].unique()):
    cluster_data = df_pca[df_pca['cluster'] == cluster]
    ax.scatter(cluster_data['pca1'], cluster_data['pca2'], label=f'Cluster {cluster}')
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_title('PCA of Clustering Results')
ax.legend()
plt.show()


  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor


ValueError: need at least one array to concatenate

<Figure size 800x600 with 1 Axes>

In [10]:
def train_split(df, train_frac=0.8, shuffle=False, random_state=None):
  """Split ``df`` row-wise into a training part and a validation part.

  By default the rows are NOT shuffled: the first ``train_frac`` of the rows
  (in their current order) become the training part and the rest the validation
  part. Pass ``shuffle=True`` for a random split.

  Args:
    df: DataFrame to split.
    train_frac: Fraction of rows for the training part, between 0 and 1.
      The row count is truncated with ``int``.
    shuffle: When True, the rows are randomly permuted before splitting.
    random_state: Seed passed to ``DataFrame.sample`` when shuffling.

  Returns:
    A ``(train_df, valiation_df)`` tuple of DataFrames.

  Raises:
    ValueError: If ``train_frac`` is outside ``[0, 1]``.
  """
  if not 0 <= train_frac <= 1:
    raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

  df_ordered = df.sample(frac=1, random_state=random_state) if shuffle else df

  # Positional split, so it works whatever the index labels are.
  split_index = int(len(df_ordered) * train_frac)
  train_df = df_ordered.iloc[:split_index]
  valiation_df = df_ordered.iloc[split_index:]
  return train_df, valiation_df

In [14]:

def val_score(model, valiation_df, top_true_frac=0.05, top_pred_frac=0.10):
  """Share of the truly best rows that the model also ranks near the top.

  Takes the ``top_true_frac`` of rows with the highest actual ``y`` and the
  ``top_pred_frac`` of rows with the highest predicted ``y``, and returns the
  fraction of the former whose ID also appears in the latter.

  Args:
    model: Fitted estimator exposing ``predict(X)``.
    valiation_df: Validation frame with an ``'ID'`` column and a ``'y'``
      column. The features are taken by position as ``iloc[:, 1:-2]``, i.e.
      everything between the first column and the last two columns.
    top_true_frac: Fraction of rows treated as the true top group.
    top_pred_frac: Fraction of rows treated as the predicted top group.

  Returns:
    A float in ``[0, 1]``, or NaN when ``valiation_df`` is empty.
  """
  n_rows = len(valiation_df)
  if n_rows == 0:
    return float('nan')

  # Keep at least one row in each group. With a plain int(frac * n) a small
  # frame gives 0, and the slice [-0:] then selects every row instead of none.
  n_true = max(1, int(top_true_frac * n_rows))
  n_pred = max(1, int(top_pred_frac * n_rows))

  # Features: drop the ID (first column) and the last two columns.
  X = valiation_df.iloc[:, 1:-2]
  y_pred = np.ravel(model.predict(X))

  # IDs as whitespace-stripped strings. A fresh Series is built instead of
  # assigning into a slice of the caller's frame.
  ids = valiation_df['ID'].astype(str).str.strip()

  # Positions of the rows with the largest actual / predicted targets.
  true_order = np.argsort(valiation_df['y'].to_numpy(), kind='stable')
  pred_order = np.argsort(y_pred, kind='stable')
  true_top_ids = ids.iloc[true_order[-n_true:]]
  pred_top_ids = ids.iloc[pred_order[-n_pred:]]

  return true_top_ids.isin(pred_top_ids).sum() / len(true_top_ids)

In [10]:
# Preview every column except the first (ID) and the last (y): the x_* features
# plus the cluster label.
train_df_org_reduced.iloc[:, 1:-1]

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,cluster
0,1.005859,-1.962891,1.247070,0.926270,-0.265869,-1.789062,0.469971,-0.139526,0.624023,0.320312,0.078613,2
1,1.024414,-2.472656,1.144531,0.846680,-0.287354,-1.756836,0.503906,-0.219604,0.697754,0.238281,0.081787,0
2,1.062500,-2.451172,1.186523,0.873535,-0.257812,-1.802734,0.498047,-0.194214,0.684082,0.259277,0.095032,0
3,1.088867,-2.458984,1.184570,0.811035,-0.276611,-1.788086,0.503418,-0.201904,0.686523,0.245728,0.091736,0
4,1.023438,-2.132812,1.242188,0.939941,-0.264404,-1.791992,0.470459,-0.142944,0.623535,0.314697,0.078979,2
...,...,...,...,...,...,...,...,...,...,...,...,...
40113,1.071289,-2.162109,1.243164,0.923828,-0.259277,-1.794922,0.474365,-0.157349,0.629883,0.308350,0.083313,2
40114,1.104492,-2.281250,1.226562,0.902344,-0.250732,-1.816406,0.479004,-0.174683,0.663574,0.281250,0.094238,2
40115,1.084961,-2.142578,1.244141,0.891602,-0.248779,-1.807617,0.478271,-0.168213,0.650391,0.293213,0.089905,2
40116,0.994141,-1.562500,1.227539,0.908203,-0.313965,-1.717773,0.477295,-0.109253,0.589355,0.354004,0.042603,1


In [12]:
# Hyperparameter search space that the randomized search later in this notebook
# samples from when tuning the per-cluster TheilSenRegressor models.
theilsen_params = dict(
    fit_intercept=[True, False],
    max_subpopulation=[5000, 10000, 20000],
    max_iter=[100, 300, 500, 1000],
    tol=[1e-5, 1e-4, 1e-3],
)

In [29]:
import joblib
from sklearn.linear_model import TheilSenRegressor



# Best fitted estimator for each cluster, keyed by cluster label.
models = {}

# Number of distinct clusters present in the training data.
n_clusters = len(train_df_org_reduced['cluster'].unique())

# Directory on the mounted Drive that receives the fitted models; created up
# front so joblib.dump does not fail on a fresh Drive.
model_dir = "/content/drive/MyDrive/dacon/model"
os.makedirs(model_dir, exist_ok=True)

# The loop assumes the cluster labels are exactly 0 .. n_clusters - 1.
for cluster in range(n_clusters):
    # Rows belonging to the current cluster.
    cluster_data = train_df_org_reduced[train_df_org_reduced['cluster'] == cluster]

    train_df, valiation_df = train_split(cluster_data)

    # Features are everything between the first column and the last two columns
    # (position based, so it depends on the frame's column order); the target is 'y'.
    X = train_df.iloc[:, 1:-2]
    y = train_df['y']

    # Seed the estimator as well: the random_state given to RandomizedSearchCV
    # below only controls which parameter combinations are sampled, not the
    # estimator's own random subsampling.
    model = TheilSenRegressor(random_state=42)

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=theilsen_params,
        n_iter=30,  # number of parameter combinations to try
        cv=3,  # 3-fold cross-validation
        scoring='neg_mean_squared_error',  # selection metric
        random_state=42,  # reproducible parameter sampling
        n_jobs=-1,  # use all cores
        verbose=2  # print progress
    )

    random_search.fit(X, y)

    # Hold-out score of the tuned model on this cluster's validation rows.
    print(val_score(random_search.best_estimator_, valiation_df))

    # Keep the tuned model in memory and persist it to Drive.
    models[cluster] = random_search.best_estimator_
    joblib.dump(random_search.best_estimator_, model_dir + "/KMeans_model" + str(cluster) + "theilsen_model" + ".pkl")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
0.20567375886524822
Fitting 3 folds for each of 30 candidates, totalling 90 fits
0.3225806451612903
Fitting 3 folds for each of 30 candidates, totalling 90 fits
0.3884297520661157
Fitting 3 folds for each of 30 candidates, totalling 90 fits
0.18867924528301888


In [45]:
# Show the test frame's columns, non-null counts, dtypes and memory footprint.
test_df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   ID       4986 non-null   category
 1   x_0      4986 non-null   float16 
 2   x_1      4986 non-null   float16 
 3   x_2      4986 non-null   float16 
 4   x_3      4986 non-null   float16 
 5   x_4      4986 non-null   float16 
 6   x_5      4986 non-null   float16 
 7   x_6      4986 non-null   float16 
 8   x_7      4986 non-null   float16 
 9   x_8      4986 non-null   float16 
 10  x_9      4986 non-null   float16 
 11  x_10     4986 non-null   float16 
 12  cluster  4986 non-null   int32   
dtypes: category(1), float16(11), int32(1)
memory usage: 304.5 KB


In [30]:
import numpy as np

def row_predict(row):
    """Predict the target for one test row with the model of its cluster.

    Args:
        row: A Series holding ``'ID'``, ``'cluster'`` and the feature values
            (for example one row produced by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The prediction as a plain float. Previously a one-element array was
        returned, which made every cell of the resulting column an array.

    Raises:
        KeyError: If no model is stored in ``models`` for the row's cluster.
    """
    model = models[int(row['cluster'])]

    # One-row frame that keeps the feature names. Presumably this also avoids
    # scikit-learn's per-call "feature names" warning, since the models were
    # fitted on a DataFrame - TODO confirm.
    features = row.drop(['ID', 'cluster']).astype(float).to_frame().T

    return float(np.ravel(model.predict(features))[0])

# Predict the test set with one batched predict() call per cluster instead of
# one call per row. The result is a plain float column aligned with
# test_df_reduced's index (a row-wise apply stored a one-element array per row).
y_pred = pd.Series(np.nan, index=test_df_reduced.index, dtype='float64')
for cluster_label, cluster_rows in test_df_reduced.groupby('cluster'):
    cluster_features = cluster_rows.drop(columns=['ID', 'cluster'])
    y_pred.loc[cluster_rows.index] = np.ravel(models[cluster_label].predict(cluster_features))

id_y = pd.DataFrame({'y_pred': y_pred})



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m


In [31]:


# Create the submission file from the sample submission template.
submission_df = pd.read_csv('/content/drive/MyDrive/dacon/2024_8_data/submission/sample_submission.csv')

# Flatten the predictions to plain floats. Each entry may be a scalar or a
# one-element array; array-valued cells would otherwise be written to the CSV
# as bracketed strings.
y_values = np.array([np.ravel(value)[0] for value in id_y['y_pred']], dtype=float)

# Fail loudly on a size mismatch instead of silently writing NaN rows.
# NOTE(review): assumes the sample submission lists the rows in the same order
# as test.csv - confirm.
if len(y_values) != len(submission_df):
    raise ValueError(
        f"Got {len(y_values)} predictions for {len(submission_df)} submission rows"
    )

submission_df['y'] = y_values
submission_df.to_csv('/content/drive/MyDrive/dacon/2024_8_data/submission/kMeans_4_and_tuning_test_submission.csv', index=False)