In [45]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
import pandas as pd
import numpy as np
import random

from datetime import datetime
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso

In [46]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [47]:
df_train = pd.read_csv('./bigdata/train.csv')
df_test = pd.read_csv('./bigdata/test.csv')

In [48]:
df_train.head()

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,
2,SESSION_000002,USER_000002,1.0,Samsung Internet,Android,mobile,1,1.0,0.0,1,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),,
3,SESSION_000003,USER_000003,1.0,Chrome,Macintosh,desktop,1,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,,
4,SESSION_000004,USER_000004,1.0,Chrome,iOS,mobile,0,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,,Category6_Path_0000


In [49]:
df_test.head()

Unnamed: 0,sessionID,userID,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,SESSION_252289,USER_206024,Chrome,Linux,desktop,1,75.0,698.0,0,0.0,0.0,Americas,Northern America,United States,(direct),(none),,Category1
1,SESSION_252290,USER_206025,Safari,iOS,tablet,0,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
2,SESSION_252291,USER_206026,Chrome,Windows,desktop,1,1.0,33.0,0,0.0,0.0,Asia,Southern Asia,India,youtube.com,referral,,Category2_Path_0082
3,SESSION_252292,USER_206027,Chrome,Android,mobile,1,1.0,76.0,0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
4,SESSION_252293,USER_206028,Internet Explorer,Windows Phone,mobile,1,2.0,372.0,0,0.0,0.0,Asia,Southern Asia,India,youtube.com,referral,,Category11_Path_0088


In [50]:
# 필요없는 칼럼 이름을 리스트 형태로 지정
df_train['subcontinent'] = df_train.apply(lambda row: row['continent'] if pd.isna(row['subcontinent']) else row['subcontinent'], axis=1)
df_test['subcontinent'] = df_test.apply(lambda row: row['continent'] if pd.isna(row['subcontinent']) else row['subcontinent'], axis=1)

# df_train['country'] = df_train.apply(lambda row: row['subcontinent'] if pd.isna(row['country']) else row['country'], axis=1)
# df_test['country'] = df_test.apply(lambda row: row['subcontinent'] if pd.isna(row['country']) else row['country'], axis=1)

columns_to_drop = ['sessionID', 'userID','continent','country']

# 학습 데이터에서 필요없는 칼럼 제거
df_train = df_train.drop(columns=columns_to_drop, axis=1)

# 테스트 데이터에서 필요없는 칼럼 제거
df_test = df_test.drop(columns=columns_to_drop, axis=1)

In [51]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252289 entries, 0 to 252288
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   TARGET               252289 non-null  float64
 1   browser              252289 non-null  object 
 2   OS                   252289 non-null  object 
 3   device               252289 non-null  object 
 4   new                  252289 non-null  int64  
 5   quality              252289 non-null  float64
 6   duration             252289 non-null  float64
 7   bounced              252289 non-null  int64  
 8   transaction          252289 non-null  float64
 9   transaction_revenue  252289 non-null  float64
 10  subcontinent         252289 non-null  object 
 11  traffic_source       252289 non-null  object 
 12  traffic_medium       252289 non-null  object 
 13  keyword              114614 non-null  object 
 14  referral_path        91182 non-null   object 
dtypes: float64(5), in

In [52]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79786 entries, 0 to 79785
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   browser              79786 non-null  object 
 1   OS                   79786 non-null  object 
 2   device               79786 non-null  object 
 3   new                  79786 non-null  int64  
 4   quality              79786 non-null  float64
 5   duration             79786 non-null  float64
 6   bounced              79786 non-null  int64  
 7   transaction          79786 non-null  float64
 8   transaction_revenue  79786 non-null  float64
 9   subcontinent         79786 non-null  object 
 10  traffic_source       79786 non-null  object 
 11  traffic_medium       79786 non-null  object 
 12  keyword              36716 non-null  object 
 13  referral_path        25895 non-null  object 
dtypes: float64(4), int64(2), object(8)
memory usage: 8.5+ MB


In [53]:
label_columns = [
    "browser",
    "OS",
    "device",
    "subcontinent",
    "traffic_source",
    "traffic_medium",
    "keyword",
    "referral_path",
]

In [54]:
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

In [55]:
with open('combined_browser_mappings.json', 'r', encoding='utf-8') as file:
    browser_mappings = json.load(file)
    # browser 칼럼 값을 업데이트합니다. 해당되는 키 값이 없거나 원래 값이 nan이면 변경하지 않습니다.
    def map_browser(value):
        if pd.isna(value):  # 값이 NaN인 경우
            return value  # 원래의 NaN 값을 반환
        return browser_mappings.get(value, value)  # 해당되는 키 값이 없으면 원래 값을 반환

    df_all['browser'] = df_all['browser'].apply(map_browser)

In [56]:
# 'browser' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
value_counts = df_all['browser'].value_counts()  # 값의 출현 빈도를 계산

# 출현 빈도에 따라 정렬된 결과를 출력
for index, count in value_counts.items():
    print(f"{index}: {count}", end=" | ")

Chrome: 231898 | Safari: 49573 | Android Webview: 16077 | Firefox: 11070 | Internet Explorer: 5933 | Edge: 4855 | Opera Mini: 4131 | Samsung Internet: 3959 | Opera: 1768 | UC Browser: 1155 | YaBrowser: 491 | Amazon Silk: 337 | Coc Coc: 273 | other: 182 | Mozilla Compatible Agent: 173 | Android Browser: 135 | Puffin: 65 | 

In [57]:
with open('combined_OS_mappings.json', 'r', encoding='utf-8') as file:
    OS_mappings = json.load(file)
    # OS 칼럼 값을 업데이트합니다. 해당되는 키 값이 없거나 원래 값이 nan이면 변경하지 않습니다.
    def map_OS(value):
        if pd.isna(value):  # 값이 NaN인 경우
            return value  # 원래의 NaN 값을 반환
        return OS_mappings.get(value, value)  # 해당되는 키 값이 없으면 원래 값을 반환

    df_all['OS'] = df_all['OS'].apply(map_OS)

In [58]:
# 'OS' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
value_counts = df_all['OS'].value_counts()  # 값의 출현 빈도를 계산

# 출현 빈도에 따라 정렬된 결과를 출력
for index, count in value_counts.items():
    print(f"{index}: {count}", end=" | ")

Windows: 116502 | Macintosh: 80654 | Android: 66208 | iOS: 39279 | Linux: 12509 | Chrome OS: 12322 | (not set): 3531 | Tizen: 409 | Samsung: 346 | Windows Phone: 168 | game: 57 | OS/2: 45 | BlackBerry: 34 | other: 11 | 

In [59]:
with open('combined_traffic_source_mappings.json', 'r', encoding='utf-8') as file:
    traffic_source_mappings = json.load(file)
    # traffic_source 칼럼 값을 업데이트합니다. 해당되는 키 값이 없거나 원래 값이 nan이면 변경하지 않습니다.
    def map_traffic_source(value):
        if pd.isna(value):  # 값이 NaN인 경우
            return value  # 원래의 NaN 값을 반환
        return traffic_source_mappings.get(value, value)  # 해당되는 키 값이 없으면 원래 값을 반환

    df_all['traffic_source'] = df_all['traffic_source'].apply(map_traffic_source)

In [60]:
# 'traffic_source' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
value_counts = df_all['traffic_source'].value_counts()  # 값의 출현 빈도를 계산

# 출현 빈도에 따라 정렬된 결과를 출력
for index, count in value_counts.items():
    print(f"{index}: {count}", end=" | ")

Search Engines: 157307 | Direct: 78395 | Social Media: 70627 | Google Services: 16133 | Others: 9129 | Blogs: 282 | Educational: 93 | Tech News: 41 | Email Services: 39 | Portals: 16 | Tech Platforms: 13 | 

In [61]:
df_all['traffic_medium'] = df_all['traffic_medium'].replace('(not set)', '(none)')

In [62]:
# 'traffic_medium' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
value_counts = df_all['traffic_medium'].value_counts()  # 값의 출현 빈도를 계산

# 출현 빈도에 따라 정렬된 결과를 출력
for index, count in value_counts.items():
    print(f"{index}: {count}", end=" | ")

organic: 143264 | referral: 89861 | (none): 78401 | cpc: 11776 | affiliate: 8166 | cpm: 607 | 

In [63]:
df_all['keyword'] = df_all['keyword'].str.extract('(Category\d+)')

In [64]:
# 'keyword' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
value_counts = df_all['keyword'].value_counts()  # 값의 출현 빈도를 계산

# 출현 빈도에 따라 정렬된 결과를 출력
for index, count in value_counts.items():
    print(f"{index}: {count}", end=" | ")

Category8: 138182 | Category9: 3817 | Category11: 2456 | Category10: 1487 | Category1: 1446 | Category6: 1139 | Category2: 898 | Category12: 718 | Category4: 485 | Category3: 422 | Category5: 162 | Category7: 118 | 

In [65]:
# 'keyword' 칼럼의 값들의 출현 빈도를 계산
value_counts = df_all['keyword'].value_counts()

# 출현 빈도가 30보다 작은 값들을 필터링
to_replace = value_counts[value_counts < 30].index

# 해당하는 값들을 'other'로 변경
df_all['keyword'] = df_all['keyword'].apply(lambda x: 'other' if x in to_replace else x)

# 변경 후의 'keyword' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
new_value_counts = df_all['keyword'].value_counts()
for index, count in new_value_counts.items():
    print(f"{index}: {count}", end=" | ")

Category8: 138182 | Category9: 3817 | Category11: 2456 | Category10: 1487 | Category1: 1446 | Category6: 1139 | Category2: 898 | Category12: 718 | Category4: 485 | Category3: 422 | Category5: 162 | Category7: 118 | 

In [66]:
df_all['referral_path'] = df_all['referral_path'].str.extract('(Category\d+)')

In [67]:
# 'keyword' 칼럼의 값들의 출현 빈도를 계산
value_counts = df_all['referral_path'].value_counts()

# 출현 빈도가 30보다 작은 값들을 필터링
to_replace = value_counts[value_counts < 30].index

# 해당하는 값들을 'other'로 변경
df_all['referral_path'] = df_all['referral_path'].apply(lambda x: 'other' if x in to_replace else x)

# 변경 후의 'keyword' 칼럼의 고유한 값들과 그 값들의 개수를 출현 빈도에 따라 정렬해서 출력
new_value_counts = df_all['referral_path'].value_counts()
for index, count in new_value_counts.items():
    print(f"{index}: {count}", end=" | ")

Category1: 24737 | Category2: 16015 | Category11: 15325 | Category13: 12618 | Category3: 10695 | Category6: 8561 | Category5: 7044 | Category8: 5987 | Category4: 4797 | Category12: 4671 | Category7: 3783 | Category9: 1798 | Category10: 1046 | 

In [68]:
def one_hot_encoding(df, columns):
    """DataFrame과 칼럼 리스트를 받아 해당 칼럼들에 대해 원-핫 인코딩을 수행하고, 원본 DataFrame에 새로운 칼럼들을 추가합니다."""
    for col in columns:
        # pd.get_dummies를 이용해 원-핫 인코딩 수행
        dummies = pd.get_dummies(df[col], prefix=col)
        # 원본 DataFrame에 인코딩된 DataFrame을 합침
        df = pd.concat([df, dummies], axis=1)
        # 원본 칼럼 삭제
        df.drop(col, axis=1, inplace=True)
    return df

In [69]:
df_all = one_hot_encoding(df_all, label_columns)

In [70]:
# columns_to_drop = ['traffic_source_Others', 'OS_other','browser_other','traffic_medium_(none)','OS_(not set)','subcontinent_(not set)']

# # 학습 데이터에서 필요없는 칼럼 제거
# df_all = df_all.drop(columns=columns_to_drop, axis=1)


In [71]:
df_train_dropped = df_train.drop(columns=label_columns)
df_test_dropped = df_test.drop(columns=label_columns)

df_train_encoded = df_all.iloc[:len(df_train)].copy()
df_test_encoded = df_all.iloc[len(df_train):].copy()


df_train = pd.concat([df_train_dropped, df_train_encoded], axis=1)
df_test = pd.concat([df_test_dropped, df_test_encoded], axis=1)

In [72]:
# train 데이터셋에서 결측값 확인
train_missing_values = df_train.isnull().sum()
print("Train 데이터셋의 결측값:")
print(train_missing_values[train_missing_values > 0])  # 결측값이 있는 열만 출력

Train 데이터셋의 결측값:
Series([], dtype: int64)


In [73]:
# train['keyword'].fillna(train['keyword'].mode()[0], inplace=True)
# train['referral_path'].fillna(train['referral_path'].mode()[0], inplace=True)

In [74]:
# test['keyword'].fillna(train['keyword'].mode()[0], inplace=True) #train의 최빈값 사용
# test['referral_path'].fillna(train['referral_path'].mode()[0], inplace=True)

In [75]:
# categorical_features = list(train.dtypes[train.dtypes == "object"].index)

# for i in categorical_features:
#     count = train[i].nunique()  # 고유값의 개수를 계산
#     print(f"{i}: {count}")

In [76]:
# train = train.drop(columns=['sessionID','userID'],axis=1)
# test = test.drop(columns=['sessionID','userID'],axis=1)

In [77]:
# encoding_target = list(train.dtypes[train.dtypes == "object"].index)

# for i in encoding_target:
#     le = LabelEncoder()
#     le.fit(train[i])
#     train[i] = le.transform(train[i])
    
#     # test 데이터의 새로운 카테고리에 대해 le.classes_ 배열에 추가
#     # test 데이터에 대해서 직접적으로 fit을 수행할 경우 Data Leakage
#     for case in np.unique(test[i]):
#         if case not in le.classes_: 
#             le.classes_ = np.append(le.classes_, case) 
    
#     test[i] = le.transform(test[i])

In [78]:
x_train = df_train.drop(columns=['TARGET'])
y_train = df_train['TARGET']

x_test = df_test

In [79]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout
from tensorflow_addons.optimizers import AdamW
import tensorflow as tf

# 스케일러 정의
scaler = StandardScaler()

# 학습 데이터에 대한 스케일러 fitting 및 변환
x_train = scaler.fit_transform(x_train)

model = Sequential([
    Dense(256, input_shape=(x_train.shape[1],)),
    BatchNormalization(),
    Activation(tf.nn.gelu),
    Dropout(0.3),
    
    Dense(128),
    BatchNormalization(),
    Activation(tf.nn.gelu),
    Dropout(0.3),
    
    Dense(128), # 추가된 층
    BatchNormalization(),
    Activation(tf.nn.gelu),
    Dropout(0.3),
    
    Dense(64),
    BatchNormalization(),
    Activation(tf.nn.gelu),
    Dropout(0.3),
    
    Dense(64), # 추가된 층
    BatchNormalization(),
    Activation(tf.nn.gelu),
    Dropout(0.3),
    
    Dense(32),
    BatchNormalization(),
    Activation(tf.nn.gelu),
    Dropout(0.3),
    
    # 출력층 변경: 회귀 문제에 적합하게 활성화 함수를 linear로 설정 (혹은 생략 가능)
    Dense(1, activation='linear')
])

# 모델 컴파일 부분 변경: 손실 함수를 'mean_squared_error'로 변경
adamw_optimizer = AdamW(learning_rate=0.001, weight_decay=1e-4)
model.compile(optimizer=adamw_optimizer, loss='mean_squared_error', metrics=['mae'])  # 회귀 문제에서는 정확도 대신 MAE 등을 사용할 수 있음


In [80]:
# # 모델 훈련
# model.fit(x_train, y_train, epochs=10, batch_size=10)

# # 모델 저장
# model.save('./bigdata/model/model_noother_path.h5')

In [81]:
from tensorflow.keras.models import load_model
model1 = load_model('./bigdata/model/model_noother_path.h5')

In [82]:
x_test = scaler.transform(x_test)

In [83]:
preds1 = model1.predict(x_test)



In [84]:
submission = pd.read_csv('./bigdata/sample_submission.csv')
submission

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,0
1,SESSION_252290,0
2,SESSION_252291,0
3,SESSION_252292,0
4,SESSION_252293,0
...,...,...
79781,SESSION_332070,0
79782,SESSION_332071,0
79783,SESSION_332072,0
79784,SESSION_332073,0


In [85]:
submission['TARGET'] = preds1
submission

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,15.222219
1,SESSION_252290,1.024654
2,SESSION_252291,2.800792
3,SESSION_252292,4.158432
4,SESSION_252293,14.618855
...,...,...
79781,SESSION_332070,2.161209
79782,SESSION_332071,1.309649
79783,SESSION_332072,3.182707
79784,SESSION_332073,3.838030


In [86]:
submission.to_csv('./deeplearning_model_baseline_submission.csv', index=False)

In [87]:
#######


y_preds_sum = np.zeros(len(x_test))


In [44]:
from sklearn.ensemble import RandomForestRegressor

# 데이터 준비
# X, y를 준비해야 합니다.
# 예를 들어, X는 특성 행렬이고, y는 타겟 벡터입니다.

# 모델 생성 및 학습
random_forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_regressor.fit(x_train, y_train)

# 예측 및 평가
y_pred1= random_forest_regressor.predict(x_test)
submission['TARGET'] = y_pred1
submission
submission.to_csv('./result/submission_RandomForestRegressor.csv', index=False)

y_preds_sum += y_pred1

In [88]:
from catboost import CatBoostRegressor, Pool

# 데이터 준비
# x_train, y_train, x_test는 이미 준비되어 있다고 가정합니다.
# submission은 제출 파일을 준비하기 위한 빈 DataFrame입니다.

# CatBoostRegressor 모델 정의
catboost_regressor = CatBoostRegressor(
     iterations=10000, 
     learning_rate=0.01, 
     depth=10,
     loss_function='RMSE',
     random_seed=42,
     verbose=1000,  # 학습 과정에서 100번째 마다의 진행 상황을 출력합니다. 
)

# 모델 학습
catboost_regressor.fit(x_train, y_train)

# 테스트 데이터에 대한 예측
y_pred_catboost = catboost_regressor.predict(x_test)

# 예측 결과를 submission DataFrame에 추가
submission['TARGET'] = y_pred_catboost

# 결과를 CSV 파일로 저장
submission.to_csv('./result/submission_CatBoostRegressor.csv', index=False)


0:	learn: 5.4715805	total: 742ms	remaining: 2h 3m 37s
1000:	learn: 2.1970604	total: 10m 9s	remaining: 1h 31m 15s
2000:	learn: 2.0094824	total: 20m 18s	remaining: 1h 21m 11s
3000:	learn: 1.8785699	total: 30m 35s	remaining: 1h 11m 20s
4000:	learn: 1.7739323	total: 40m 48s	remaining: 1h 1m 11s
5000:	learn: 1.6940202	total: 50m 48s	remaining: 50m 47s
6000:	learn: 1.6278634	total: 1h 1m 9s	remaining: 40m 45s
7000:	learn: 1.5752925	total: 1h 11m 30s	remaining: 30m 37s
8000:	learn: 1.5292994	total: 1h 21m 46s	remaining: 20m 25s
9000:	learn: 1.4868566	total: 1h 32m 5s	remaining: 10m 13s
9999:	learn: 1.4472453	total: 1h 42m 38s	remaining: 0us


In [45]:
# from sklearn.linear_model import LinearRegression

# linear_regressor = LinearRegression()
# linear_regressor.fit(x_train, y_train)

# y_pred2 = linear_regressor.predict(x_test)
# submission['TARGET'] = y_pred2
# submission.to_csv('./result/submission_LinearRegression.csv', index=False)

# y_preds_sum += y_pred2

In [46]:
# from sklearn.linear_model import Ridge

# ridge_regressor = Ridge(alpha=1.0)
# ridge_regressor.fit(x_train, y_train)

# y_pred3 = ridge_regressor.predict(x_test)
# submission['TARGET'] = y_pred3
# submission.to_csv('./result/submission_Ridge.csv', index=False)

# y_preds_sum += y_pred3

In [47]:
# from sklearn.linear_model import Lasso

# lasso_regressor = Lasso(alpha=1.0)
# lasso_regressor.fit(x_train, y_train)

# y_pred4 = lasso_regressor.predict(x_test)
# submission['TARGET'] = y_pred4
# submission.to_csv('./result/submission_Lasso.csv', index=False)

# y_preds_sum += y_pred4

In [48]:
# from sklearn.linear_model import ElasticNet

# elastic_net_regressor = ElasticNet(alpha=1.0, l1_ratio=0.5)
# elastic_net_regressor.fit(x_train, y_train)

# y_pred5 = elastic_net_regressor.predict(x_test)
# submission['TARGET'] = y_pred5
# submission.to_csv('./result/submission_ElasticNet.csv', index=False)

# y_preds_sum += y_pred5

In [49]:
# from sklearn.svm import SVR

# svr_regressor = SVR(kernel='rbf')
# svr_regressor.fit(x_train, y_train)

# y_pred6 = svr_regressor.predict(x_test)
# submission['TARGET'] = y_pred6
# submission.to_csv('./result/submission_SVR.csv', index=False)

# y_preds_sum += y_pred6

In [50]:
# from sklearn.tree import DecisionTreeRegressor

# decision_tree_regressor = DecisionTreeRegressor(random_state=42)
# decision_tree_regressor.fit(x_train, y_train)

# y_pred7 = decision_tree_regressor.predict(x_test)
# submission['TARGET'] = y_pred7
# submission.to_csv('./result/submission_DecisionTreeRegressor.csv', index=False)

# y_preds_sum += y_pred7

In [51]:
# from sklearn.ensemble import GradientBoostingRegressor

# gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
# gradient_boosting_regressor.fit(x_train, y_train)

# y_pred8 = gradient_boosting_regressor.predict(x_test)
# submission['TARGET'] = y_pred8

# submission.to_csv('./result/submission_GradientBoostingRegressor.csv', index=False)

# y_preds_sum += y_pred8

In [52]:
# from sklearn.ensemble import AdaBoostRegressor

# ada_boost_regressor = AdaBoostRegressor(n_estimators=100, random_state=42)
# ada_boost_regressor.fit(x_train, y_train)

# y_pred9 = ada_boost_regressor.predict(x_test)
# submission['TARGET'] = y_pred9

# submission.to_csv('./result/submission_AdaBoostRegressor.csv', index=False)

# y_preds_sum += y_pred9

# this result is shit

In [53]:
y_pred_final = y_preds_sum / 8
y_pred_final

array([16.61997366,  0.90968313,  2.44194314, ...,  2.59995153,
        3.41762088,  1.08857093])

In [54]:
submission['TARGET'] = y_pred_final
submission.to_csv('./result/submission_ensemble.csv', index=False)

In [55]:
for i in df_train.columns:
    print(i,end=",")

TARGET,new,quality,duration,bounced,transaction,transaction_revenue,browser_Amazon Silk,browser_Android Browser,browser_Android Webview,browser_Chrome,browser_Coc Coc,browser_Edge,browser_Firefox,browser_Internet Explorer,browser_Mozilla Compatible Agent,browser_Opera,browser_Opera Mini,browser_Puffin,browser_Safari,browser_Samsung Internet,browser_UC Browser,browser_YaBrowser,browser_other,OS_(not set),OS_Android,OS_BlackBerry,OS_Chrome OS,OS_Linux,OS_Macintosh,OS_OS/2,OS_Samsung,OS_Tizen,OS_Windows,OS_Windows Phone,OS_game,OS_iOS,OS_other,device_desktop,device_mobile,device_tablet,subcontinent_(not set),subcontinent_Australasia,subcontinent_Caribbean,subcontinent_Central America,subcontinent_Central Asia,subcontinent_Eastern Africa,subcontinent_Eastern Asia,subcontinent_Eastern Europe,subcontinent_Melanesia,subcontinent_Micronesian Region,subcontinent_Middle Africa,subcontinent_Northern Africa,subcontinent_Northern America,subcontinent_Northern Europe,subcontinent_Polynesia,subcontin

In [56]:
from sklearn.metrics import mean_absolute_error

# 예측값 배열의 리스트
pred_arrays = [
    y_pred2,0
    y_pred3,1
    y_pred4,2
    y_pred5,3
    y_pred7,4
    y_pred8,5
    y_pred9,6
    # 여기에 다른 배열들 추가...
]

# 실제 값
y_test = y_pred1  # 이 예제에서 y_test로 사용할 실제값 배열

# 각 예측값 배열에 대해 MAE를 계산하고 저장
mae_scores = []

for i, preds in enumerate(pred_arrays):
    mae = mean_absolute_error(y_test, preds)
    mae_scores.append((i, mae))

# MAE 점수가 높은 순서대로 정렬
mae_scores_sorted = sorted(mae_scores, key=lambda x: x[1], reverse=True)

# 정렬된 결과를 출력
print("성능이 안 좋은 순서대로 배열의 인덱스와 MAE:")
for index, mae in mae_scores_sorted:
    print(f"인덱스: {index}, MAE: {mae}")


SyntaxError: invalid syntax (669105451.py, line 6)