In [42]:
!pip install catboost



In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [45]:
from google.colab import drive
drive.mount('/content/drive')
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
drop_col = ['customer_country.1', 'customer_country', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
df_train.drop(drop_col, axis=1, inplace=True)
df_test.drop(drop_col, axis=1, inplace=True)

In [47]:
'''
# 수치형 변수 전체 Standardizaiton, Normalization 실행
'''
# Initialize scalers
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
columns_to_ST = ["com_reg_ver_win_rate", "historical_existing_cnt","lead_desc_length"]
columns_to_NM = ["ver_win_rate_x", "ver_win_ratio_per_bu"]

# Apply Standardization to each column
for column in columns_to_ST:
    # Standardization
    df_train[column + '_standardized'] = scaler_standard.fit_transform(df_train[[column]])
    df_test[column + '_standardized'] = scaler_standard.fit_transform(df_test[[column]])
    # Drop the original column
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

# Apply Standardization to each column
for column in columns_to_NM:
    # Normalization
    df_train[column + '_normalized'] = scaler_minmax.fit_transform(df_train[[column]])
    df_test[column + '_normalized'] = scaler_minmax.fit_transform(df_test[[column]])
    # Drop the original column
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

In [48]:
'''
# business_area 날리고 2개 피처 추가
'''
def is_hospital(value):
    if value == "hospital & health care":
        return 1
    else:
        return 0

def is_power(value):
    if value == "power plant / renewable energy":
        return 1
    else:
        return 0

# "business_area" 열에서 함수를 적용하여 새로운 열 생성
df_train['is_hospital'] = df_train['business_area'].apply(is_hospital)
df_train['is_power'] = df_train['business_area'].apply(is_hospital)
df_test['is_hospital'] = df_test['business_area'].apply(is_hospital)
df_test['is_power'] = df_test['business_area'].apply(is_hospital)

#df_train.drop(columns=["business_area"], inplace=True)
#df_test.drop(columns=["business_area"], inplace=True)

In [49]:
'''
# expected_timeline 에서 특정 단어 필터링
'''
# 새로운 열을 추가하기 위한 함수 정의
def contains_budget(value):
    if 'budget' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_budget'] = df_train['expected_timeline'].apply(contains_budget)
df_test['contains_budget'] = df_test['expected_timeline'].apply(contains_budget)

# 새로운 열을 추가하기 위한 함수 정의
def contains_etc(value):
    if 'etc' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_etc'] = df_train['expected_timeline'].apply(contains_etc)
df_test['contains_etc'] = df_test['expected_timeline'].apply(contains_etc)

# 새로운 열을 추가하기 위한 함수 정의
def contains_hence(value):
    if 'hence' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_hence'] = df_train['expected_timeline'].apply(contains_hence)
df_test['contains_hence'] = df_test['expected_timeline'].apply(contains_hence)

# 새로운 열을 추가하기 위한 함수 정의
def contains_although(value):
    if 'although' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_although'] = df_train['expected_timeline'].apply(contains_although)
df_test['contains_although'] = df_test['expected_timeline'].apply(contains_although)

# 새로운 열을 추가하기 위한 함수 정의
def contains_more(value):
    if 'more' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_more'] = df_train['expected_timeline'].apply(contains_more)
df_test['contains_more'] = df_test['expected_timeline'].apply(contains_more)

# 새로운 열을 추가하기 위한 함수 정의
def contains_year(value):
    if 'year' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_year'] = df_train['expected_timeline'].apply(contains_year)
df_test['contains_year'] = df_test['expected_timeline'].apply(contains_year)

In [63]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

# 수치형 변수 선택
numerical_columns = ['com_reg_ver_win_rate_standardized', 'historical_existing_cnt_standardized',
                     'lead_desc_length_standardized', 'ver_win_rate_x_normalized', 'ver_win_ratio_per_bu_normalized']

# 결측치가 없는 열들로만 이루어진 데이터프레임 생성
df_non_missing = df_train[numerical_columns].dropna()

# 수치형 변수 결측치 채우기
for numerical_column in numerical_columns:
    df_with_missing_numerical = df_train[df_train[numerical_column].isnull()]

    # 결측치인 행 제거
    df_with_missing_numerical = df_with_missing_numerical.dropna(subset=numerical_columns)

    if not df_with_missing_numerical.empty:
        X_train_numerical, X_test_numerical, y_train_numerical, y_test_numerical = train_test_split(
            df_non_missing[numerical_columns],
            df_non_missing[numerical_column],
            test_size=0.2, random_state=42
        )
        model = HistGradientBoostingRegressor()
        model.fit(X_train_numerical, y_train_numerical)
        predicted_values_numerical = model.predict(df_with_missing_numerical[numerical_columns])
        df_train.loc[df_train[numerical_column].isnull(), numerical_column] = predicted_values_numerical

# df_test에 대한 수치형 변수 결측치 채우기
for numerical_column in numerical_columns:
    df_with_missing_numerical_test = df_test[df_test[numerical_column].isnull()]

    # 결측치인 행 제거
    df_with_missing_numerical_test = df_with_missing_numerical_test.dropna(subset=numerical_columns)

    if not df_with_missing_numerical_test.empty:
        predicted_values_numerical_test = model.predict(df_with_missing_numerical_test[numerical_columns])
        df_test.loc[df_test[numerical_column].isnull(), numerical_column] = predicted_values_numerical_test

# 결과 출력
print(df_train)
print(df_test)


       bant_submit business_unit  customer_idx          customer_type  \
0             1.00            AS         32160           End-Customer   
1             1.00            AS         23122           End-Customer   
2             1.00            AS          1755           End-Customer   
3             1.00            AS          4919           End-Customer   
4             1.00            AS         17126  Specifier/ Influencer   
...            ...           ...           ...                    ...   
59294         1.00            AS         33747           End Customer   
59295         0.75            AS         35420  Specifier/ Influencer   
59296         0.75            AS         19249  Specifier/ Influencer   
59297         1.00            AS         40327                    NaN   
59298         0.75      Solution         30268                    NaN   

       enterprise             customer_job  \
0      Enterprise               purchasing   
1      Enterprise  media and co

In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer

# 범주형 변수 선택
categorical_column = 'business_subarea'

# 결측치가 없는 열들로만 이루어진 데이터프레임 생성
df_non_missing = df_train[[categorical_column]].dropna()

# CatBoost 모델 학습을 위한 데이터 준비
X_train_categorical, X_test_categorical, y_train_categorical, y_test_categorical = train_test_split(
    df_non_missing,
    df_non_missing[categorical_column],
    test_size=0.2, random_state=42
)

# CatBoost 모델 정의
catboost_model = CatBoostClassifier(
    verbose=0,
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    one_hot_max_size=5,
    cat_features=[0]  # 이 부분이 추가되었습니다. 범주형 변수의 인덱스를 지정합니다.
)

# CatBoost 모델 학습
catboost_model.fit(X_train_categorical, y_train_categorical)

# 결측치가 있는 행 선택
df_with_missing_categorical = df_train[df_train[categorical_column].isnull()]

# CatBoost 모델을 사용하여 결측치 채우기
predicted_values_categorical = catboost_model.predict(df_with_missing_categorical[[categorical_column]])
df_train.loc[df_train[categorical_column].isnull(), categorical_column] = predicted_values_categorical

# 결과 출력
print(df_train)


CatBoostError: /src/catboost/catboost/private/libs/target/target_converter.cpp:379: Target with classes must contain only 2 unique values for binary classification