In [71]:
from sklearn.linear_model import LogisticRegression,RidgeClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance

import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import plot_importance

import catboost
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error  
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [165]:
train = pd.read_csv('train_preprocessed.csv')
test = pd.read_csv('test_preprocessed.csv')

In [166]:
train.shape, test.shape

((59299, 40), (5271, 40))

# 결측치 채우기 (ver_win_rate_x, ver_win_ratio_per_bu, com_reg_ver_win_rate)

In [8]:
def encode_categorical_variables(train, test):                                        # 인코더 정의
    categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
    
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        
        train_unique_labels = train[col].astype(str).unique()
        le.fit(train_unique_labels)
        
        test_unique_labels = test[col].astype(str).unique()
        unseen_labels = set(test_unique_labels) - set(train_unique_labels)
        
        if unseen_labels:
            le_classes = le.classes_.tolist()  
            le_classes.append('other') 
            le.classes_ = np.array(le_classes)  
            test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
        else:

            test[col] = test[col].astype(str)
        
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col])
        
        label_encoders[col] = le
    
    return train, test

## 1. train data의 ver_win_rate_x 분류

In [11]:
#예측에 사용할 변수 선택
train_rate = train.drop(columns=['ver_win_ratio_per_bu', 'com_reg_ver_win_rate'])

In [12]:
#결측을 기준으로 train test 분리
test = train_rate[train_rate['ver_win_rate_x'].isnull()]
train = train_rate[~train_rate['ver_win_rate_x'].isnull()]

In [13]:
train.shape, test.shape

((18417, 38), (40882, 38))

In [14]:
train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #분류일때 실행
test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #분류일때 실행
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)


In [15]:
#인코딩 
train, test = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be s

In [16]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_rate_x', axis=1)
train_y = train['ver_win_rate_x']

test_x = test.drop(columns='ver_win_rate_x', axis=1)   

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [18]:
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_valid)
f1_score(y_valid, pred, average='macro')

1.0

In [19]:
test_pred = model.predict(test_x)
train_rate.loc[test.index, 'ver_win_rate_x'] = test_pred        #예측벡터 입력

In [20]:
train_rate.to_csv('train_rate_x.csv', index=False) 

In [138]:
# train = pd.read_csv('train_rate_x.csv')
# train["ver_win_rate_x"].isnull().sum()

0

## 2. test data의 ver_win_rate_x 분류

In [21]:
train = pd.read_csv('train_rate_x.csv')
test = pd.read_csv('test_preprocessed.csv')

#예측에 사용할 변수 선택
train_rate = train.drop(columns=["is_converted"])
test_org = test.drop(columns=['ver_win_ratio_per_bu', 'com_reg_ver_win_rate',"is_converted"])

In [22]:
train_rate.shape, test_org.shape

((59299, 37), (5271, 37))

In [23]:
#결측을 기준으로 train test 분리
train = train_rate
test = test_org[test_org['ver_win_rate_x'].isnull()]       #주의!

In [24]:
train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #분류일때 실행
test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)


In [25]:
#인코딩 
train, test = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [26]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_rate_x', axis=1)
train_y = train['ver_win_rate_x']

test_x = test.drop(columns='ver_win_rate_x', axis=1)   

X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [27]:
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_valid)
f1_score(y_valid, pred, average='macro')

1.0

In [28]:
test_pred = model.predict(test_x)
test_org.loc[test.index, 'ver_win_rate_x'] = test_pred        #예측벡터 입력

In [29]:
test_org.to_csv('test_rate_x.csv', index=False) 

## 1. train data의 ver_win_ratio_per_bu 분류

In [30]:
train_bu = pd.read_csv('train_rate_x.csv')   
a = pd.read_csv('train.csv')
train_bu["ver_win_ratio_per_bu"]=a["ver_win_ratio_per_bu"]    #ver_win_ratio_per_bu복구하기

In [31]:
#결측을 기준으로 train test 분리
test = train_bu[train_bu['ver_win_ratio_per_bu'].isnull()]
train = train_bu[~train_bu['ver_win_ratio_per_bu'].isnull()]
train.shape, test.shape

((15304, 39), (43995, 39))

In [32]:
train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  #분류일때 실행
test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  #분류일때 실행
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)


In [33]:
#인코딩 
train, test = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be s

In [34]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_ratio_per_bu', axis=1)
train_y = train['ver_win_ratio_per_bu']

test_x = test.drop(columns='ver_win_ratio_per_bu', axis=1)  

X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [35]:
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_valid)
f1_score(y_valid, pred, average='macro')

1.0

In [36]:
test_pred = model.predict(test_x)
train_bu.loc[test.index, 'ver_win_ratio_per_bu'] = test_pred        #예측벡터 입력

In [37]:
train_bu.to_csv('train_bu.csv', index=False) 

## 2. test data의 ver_win_ratio_per_bu 분류

In [118]:
train_bu = pd.read_csv('train_bu.csv')
test_bu = pd.read_csv('test_rate_x.csv')  

In [119]:
b = pd.read_csv('submission.csv')
test_bu["ver_win_ratio_per_bu"]=b["ver_win_ratio_per_bu"]    #ver_win_ratio_per_bu복구 

#예측에 사용할 변수 선택
train_bu = train_bu.drop(columns=["is_converted"])
test_bu = test_bu

In [120]:
#결측을 기준으로 train test 분리
train = train_bu
test = test_bu[test_bu['ver_win_ratio_per_bu'].isnull()]       #주의!

In [121]:
train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  #분류일때 실행
test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)


In [122]:
#인코딩 
train, test = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [123]:

def oversample_train_data(train):                                                    #오버샘플링 (4가지 중 한개) 비추천!
    X = train.drop('ver_win_ratio_per_bu', axis=1)
    y = train['ver_win_ratio_per_bu']

    # 오버샘플링 방법 선택
    # smote = SMOTE()
    # X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # adasyn = ADASYN()
    # X_resampled, y_resampled = adasyn.fit_resample(X, y)

    # borderline_smote = BorderlineSMOTE()
    # X_resampled, y_resampled = borderline_smote.fit_resample(X, y)
    
    ros = RandomOverSampler()
    X_resampled, y_resampled = ros.fit_resample(X, y)

    train_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    train_resampled['ver_win_ratio_per_bu'] = y_resampled

    return train_resampled


train = oversample_train_data(train)

In [124]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_ratio_per_bu', axis=1)
train_y = train['ver_win_ratio_per_bu']

test_x = test.drop(columns='ver_win_ratio_per_bu', axis=1)   

train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=24, shuffle=True
                                                      #stratify=train_y
                                                     )  

In [107]:
lgbm_clf = LGBMClassifier(n_estimators=500, objective='multiclass',n_jobs= -1)               # macro f1 

# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
lgbm_clf.fit(train_x, train_y, early_stopping_rounds=50, eval_metric="multi_logloss", eval_set=eval_set)

# F1 Score 계산
f1 = f1_score(valid_y, lgbm_clf.predict(valid_x), average='macro')
print('Macro F1 Score: {0:.4f}'.format(f1))



[1]	training's multi_logloss: 0.858877	valid_1's multi_logloss: 0.858808
[2]	training's multi_logloss: 0.681283	valid_1's multi_logloss: 0.681224
[3]	training's multi_logloss: 0.548615	valid_1's multi_logloss: 0.548572
[4]	training's multi_logloss: 0.444913	valid_1's multi_logloss: 0.444879
[5]	training's multi_logloss: 0.362465	valid_1's multi_logloss: 0.362437
[6]	training's multi_logloss: 0.296211	valid_1's multi_logloss: 0.296189
[7]	training's multi_logloss: 0.242586	valid_1's multi_logloss: 0.242568
[8]	training's multi_logloss: 0.198971	valid_1's multi_logloss: 0.198956
[9]	training's multi_logloss: 0.163377	valid_1's multi_logloss: 0.163365
[10]	training's multi_logloss: 0.134261	valid_1's multi_logloss: 0.134251
[11]	training's multi_logloss: 0.110401	valid_1's multi_logloss: 0.110392
[12]	training's multi_logloss: 0.0908227	valid_1's multi_logloss: 0.0908158
[13]	training's multi_logloss: 0.0747417	valid_1's multi_logloss: 0.074736
[14]	training's multi_logloss: 0.0615247	val

In [126]:
test_pred = lgbm_clf.predict(test_x)
test_bu.loc[test.index, 'ver_win_ratio_per_bu'] = test_pred        #예측벡터 입력

In [127]:
test_bu.to_csv('test_bu.csv', index=False) 

## 1. train data의 com_reg_ver_win_rate 회귀

In [145]:
train_rate2 = pd.read_csv('train_bu.csv')
c = pd.read_csv('train.csv')
train_rate2["com_reg_ver_win_rate"]=c["com_reg_ver_win_rate"]    #com_reg_ver_win_rate복구하기

In [146]:
#결측을 기준으로 train test 분리
test = train_rate2[train_rate2['com_reg_ver_win_rate'].isnull()]
train = train_rate2[~train_rate2['com_reg_ver_win_rate'].isnull()]
train.shape, test.shape

((14568, 40), (44731, 40))

In [147]:
#인코딩 
train, test = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be s

In [148]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='com_reg_ver_win_rate', axis=1)
train_y = train['com_reg_ver_win_rate']

test_x = test.drop(columns='com_reg_ver_win_rate', axis=1)  

train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [149]:
lgbm_regressor = LGBMRegressor(n_estimators=500, objective='regression', n_jobs=-1)  # 회귀 모델 설정

# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
lgbm_regressor.fit(train_x, train_y, early_stopping_rounds=50, eval_metric="rmse", eval_set=eval_set, verbose=True)

# RMSE 계산
preds = lgbm_regressor.predict(valid_x)
mse = mean_squared_error(valid_y, preds)
rmse = np.sqrt(mse)  
print('RMSE: {0:.4f}'.format(rmse))



[1]	training's rmse: 0.136069	training's l2: 0.0185148	valid_1's rmse: 0.138476	valid_1's l2: 0.0191755
[2]	training's rmse: 0.12314	training's l2: 0.0151635	valid_1's rmse: 0.125512	valid_1's l2: 0.0157532
[3]	training's rmse: 0.111545	training's l2: 0.0124423	valid_1's rmse: 0.113927	valid_1's l2: 0.0129794
[4]	training's rmse: 0.101193	training's l2: 0.01024	valid_1's rmse: 0.10361	valid_1's l2: 0.010735
[5]	training's rmse: 0.0918661	training's l2: 0.00843939	valid_1's rmse: 0.094297	valid_1's l2: 0.00889192
[6]	training's rmse: 0.0835959	training's l2: 0.00698827	valid_1's rmse: 0.0861156	valid_1's l2: 0.0074159
[7]	training's rmse: 0.0761529	training's l2: 0.00579927	valid_1's rmse: 0.0787667	valid_1's l2: 0.00620419
[8]	training's rmse: 0.0695575	training's l2: 0.00483825	valid_1's rmse: 0.0722258	valid_1's l2: 0.00521657
[9]	training's rmse: 0.0636456	training's l2: 0.00405076	valid_1's rmse: 0.0664934	valid_1's l2: 0.00442137
[10]	training's rmse: 0.0584172	training's l2: 0.00

In [150]:
test_pred = lgbm_regressor.predict(test_x)
train_rate2.loc[test.index, 'com_reg_ver_win_rate'] = test_pred        #예측벡터 입력

In [151]:
train_rate2.to_csv('train_rate2.csv', index=False) 

## 2. test data의 com_reg_ver_win_rate 회귀

In [152]:
train_rate2 = pd.read_csv('train_rate2.csv')
test_rate2 = pd.read_csv('test_bu.csv')  


In [156]:
d = pd.read_csv('submission.csv')
test_rate2["com_reg_ver_win_rate"]=d["com_reg_ver_win_rate"]    #com_reg_ver_win_rate 복구 

#예측에 사용할 변수 선택
train_rate2 = train_rate2.drop(columns=["is_converted"])
test_rate2 = test_rate2

In [158]:
#결측을 기준으로 train test 분리
train = train_rate2
test = test_rate2[test_rate2['com_reg_ver_win_rate'].isnull()]       #주의!

In [160]:
#인코딩 
train, test = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [161]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='com_reg_ver_win_rate', axis=1)
train_y = train['com_reg_ver_win_rate']

test_x = test.drop(columns='com_reg_ver_win_rate', axis=1)  

train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [162]:
lgbm_regressor = LGBMRegressor(n_estimators=500, objective='regression', n_jobs=-1)  # 회귀 모델 설정

# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
lgbm_regressor.fit(train_x, train_y, early_stopping_rounds=50, eval_metric="rmse", eval_set=eval_set, verbose=True)

# RMSE 계산
preds = lgbm_regressor.predict(valid_x)
mse = mean_squared_error(valid_y, preds)
rmse = np.sqrt(mse)  
print('RMSE: {0:.4f}'.format(rmse))



[1]	training's rmse: 0.0771999	training's l2: 0.00595983	valid_1's rmse: 0.079903	valid_1's l2: 0.00638449
[2]	training's rmse: 0.0705632	training's l2: 0.00497916	valid_1's rmse: 0.0730119	valid_1's l2: 0.00533074
[3]	training's rmse: 0.064657	training's l2: 0.00418052	valid_1's rmse: 0.0669079	valid_1's l2: 0.00447666
[4]	training's rmse: 0.0594231	training's l2: 0.00353111	valid_1's rmse: 0.0614657	valid_1's l2: 0.00377803
[5]	training's rmse: 0.054738	training's l2: 0.00299625	valid_1's rmse: 0.0565908	valid_1's l2: 0.00320252
[6]	training's rmse: 0.0506008	training's l2: 0.00256044	valid_1's rmse: 0.05225	valid_1's l2: 0.00273006
[7]	training's rmse: 0.0467936	training's l2: 0.00218964	valid_1's rmse: 0.0481528	valid_1's l2: 0.00231869
[8]	training's rmse: 0.0434534	training's l2: 0.0018882	valid_1's rmse: 0.0446012	valid_1's l2: 0.00198927
[9]	training's rmse: 0.0404799	training's l2: 0.00163863	valid_1's rmse: 0.0414159	valid_1's l2: 0.00171528
[10]	training's rmse: 0.0376991	tr

In [163]:
test_pred = lgbm_regressor.predict(test_x)
test_rate2.loc[test.index, 'com_reg_ver_win_rate'] = test_pred        #예측벡터 입력

In [164]:
test_rate2.to_csv('test_rate2.csv', index=False) 

# 결측치 채우기 완료

In [175]:
train = pd.read_csv('train_rate2.csv')
test = pd.read_csv('test_rate2.csv')

e = pd.read_csv('submission.csv')
test["is_converted"]=e["is_converted"]           #drop 했던 is_converted 복구

train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #이 두 변수는 범주형으로 취금하기위해 object로 변환
test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)

train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  
test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)

In [176]:
train.shape, test.shape

((59299, 40), (5271, 40))

# 특정 범주 파생변수 생성

In [179]:
base= ['inquiry_type','inquiry_type','inquiry_type','inquiry_type',
       'bant_submit',   
       'product_category', 'product_category', 'product_category', 'product_category',
       'product_category','product_category','product_category',
       'expected_timeline',     
       'customer_continent',    
       'customer_country', 'customer_country',
       'customer_idx', 'customer_idx', 'customer_idx', 'customer_idx',
       'customer_idx', 'customer_idx', 'customer_idx', 'customer_idx',
       'customer_idx', 'customer_idx', 'customer_idx',
       'customer_position', 'customer_position', 'customer_position',
       'lead_owner', 'lead_owner', 'lead_owner', 'lead_owner',
       'lead_owner', 'lead_owner', 'lead_owner', 'lead_owner',
       'lead_owner', 'lead_owner', 'lead_owner', 'lead_owner',
       'lead_owner', 'lead_owner'
      ]

new= [ 'is_usage','is_product_info','is_sercvices','is_demo',
       'is_bant_0.5',   
       'ver_video', 'ver_signage', 'ver_led','ver_hotel_tv',
       'ver_inter_board','ver_one_quick','ver_sinage',
       'is_time_year',     
       'is_Asia',  
       'is_hongkong', 'is_us',
       'is_47466', 'is_37680', 'is_18030', 'is_21321',
       'is_33350', 'is_25309', 'is_32240', 'is_31864',
       'is_19804', 'is_40344', 'is_7195',
       'is_intern', 'is_entry_level', 'is_trainee',
       'isow_97', 'isow_437', 'isow_487', 'isow_831',
       'isow_480', 'isow_375', 'isow_589', 'isow_166',
       'isow_279', 'isow_833', 'isow_4', 'isow_570',
       'isow_147', 'isow_148'
      ]

category= ['usage or technical consultation','Product Information','Services','Request a Demo',
        0.5,   
       'video wall signage', 'interactive signage', 'led signage','hotel tv',
       'interactive digital board', 'one:quick series','sinage', 
       'more than a year',
       'Asia',
       'Hong Kong', 'United States',
        47466, 37680, 18030, 21321,
        33350, 25309, 32240, 31864,
        19804, 40344, 7195,
       'intern', 'entry level',"trainee",
        97, 437, 487, 831,
        480, 375, 589, 166,
        279, 833, 4, 570,
        147, 148
      ]

def add_multiple_category_flags(train, test, base_variables, new_variables, category_names):
    
    if not (len(base_variables) == len(new_variables) == len(category_names)):
        raise ValueError("The length of base_variables, new_variables, and category_names must be the same.")
    for base_var, new_var, cat_name in zip(base_variables, new_variables, category_names):
        train[new_var] = (train[base_var] == cat_name).astype('int8')
        test[new_var] = (test[base_var] == cat_name).astype('int8')
    return train, test

train, test = add_multiple_category_flags(train, test, base, new ,category)
# sums = train[new].sum()    #파생변수가 유의미한지 확인
# sums

In [180]:
train.shape, test.shape

((59299, 84), (5271, 84))

In [182]:
# train.to_csv('train_preprocessed_2.csv', index=False) 
# test.to_csv('test_preprocessed_2.csv', index=False) 