In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [2]:
!pip install catboost sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [32]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
import pandas as pd
import numpy as np

In [33]:
# Mount Google Drive if using Google Colab
from google.colab import drive
drive.mount('/content/drive')

# Load datasets
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Drop unnecessary columns
drop_col = ['customer_country.1', 'customer_country', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
df_train.drop(drop_col, axis=1, inplace=True)
df_test.drop(drop_col, axis=1, inplace=True)

In [35]:
# 'customer_type' 열에서 특정 값 변경
df_train['customer_type'] = df_train['customer_type'].replace({'End Customer': 'End-Customer', 'Specifier/ Influencer': 'Specifier / Influencer'})
df_test['customer_type'] = df_test['customer_type'].replace({'End Customer': 'End-Customer', 'Specifier/ Influencer': 'Specifier / Influencer'})

In [36]:
# Initialize scalers
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
columns_to_ST = ["com_reg_ver_win_rate", "historical_existing_cnt", "lead_desc_length"]
columns_to_NM = ["ver_win_rate_x", "ver_win_ratio_per_bu"]

In [37]:
# Apply Standardization to each column
for column in columns_to_ST:
    df_train[column + '_standardized'] = scaler_standard.fit_transform(df_train[[column]])
    df_test[column + '_standardized'] = scaler_standard.fit_transform(df_test[[column]])
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

# Apply Normalization to each column
for column in columns_to_NM:
    df_train[column + '_normalized'] = scaler_minmax.fit_transform(df_train[[column]])
    df_test[column + '_normalized'] = scaler_minmax.fit_transform(df_test[[column]])
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

In [38]:
# Create additional features based on 'business_area'
def is_hospital(value):
    if value == "hospital & health care":
        return 1
    else:
        return 0

def is_power(value):
    if value == "power plant / renewable energy":
        return 1
    else:
        return 0

df_train['is_hospital'] = df_train['business_area'].apply(is_hospital)
df_train['is_power'] = df_train['business_area'].apply(is_hospital)
df_test['is_hospital'] = df_test['business_area'].apply(is_hospital)
df_test['is_power'] = df_test['business_area'].apply(is_hospital)


In [39]:
# Define functions for filtering specific words in 'expected_timeline'
def contains_budget(value):
    if 'budget' in str(value).lower():
        return 1
    else:
        return 0

def contains_etc(value):
    if 'etc' in str(value).lower():
        return 1
    else:
        return 0

def contains_hence(value):
    if 'hence' in str(value).lower():
        return 1
    else:
        return 0

def contains_although(value):
    if 'although' in str(value).lower():
        return 1
    else:
        return 0

def contains_more(value):
    if 'more' in str(value).lower():
        return 1
    else:
        return 0

def contains_year(value):
    if 'year' in str(value).lower():
        return 1
    else:
        return 0

df_train['contains_budget'] = df_train['expected_timeline'].apply(contains_budget)
df_test['contains_budget'] = df_test['expected_timeline'].apply(contains_budget)

df_train['contains_etc'] = df_train['expected_timeline'].apply(contains_etc)
df_test['contains_etc'] = df_test['expected_timeline'].apply(contains_etc)

df_train['contains_hence'] = df_train['expected_timeline'].apply(contains_hence)
df_test['contains_hence'] = df_test['expected_timeline'].apply(contains_hence)

df_train['contains_although'] = df_train['expected_timeline'].apply(contains_although)
df_test['contains_although'] = df_test['expected_timeline'].apply(contains_although)

df_train['contains_more'] = df_train['expected_timeline'].apply(contains_more)
df_test['contains_more'] = df_test['expected_timeline'].apply(contains_more)

df_train['contains_year'] = df_train['expected_timeline'].apply(contains_year)
df_test['contains_year'] = df_test['expected_timeline'].apply(contains_year)


In [41]:
# Handling missing values in text columns
text_columns = ["customer_type", "customer_job", "inquiry_type", "product_category",
                "product_subcategory", "product_modelname", "customer_position",
                "response_corporate", "expected_timeline", "business_area"]

df_train[text_columns] = df_train[text_columns].fillna('')
df_test[text_columns] = df_test[text_columns].fillna('')

# Integrate natural language data
df_train['text_data'] = df_train[text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)
df_test['text_data'] = df_test[text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)

In [42]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [43]:
# 텍스트 데이터에 대한 토큰화 및 임베딩 생성
def create_embeddings(sentences):
    return model.encode(sentences, convert_to_tensor=True)

# 결측치를 유사도를 사용하여 채우는 함수
def fill_missing_with_similarity(df, text_column):
    # 결측치와 결측치가 아닌 값으로 행 분리
    missing_rows = df[df[text_column].isnull()]
    non_missing_rows = df.dropna(subset=[text_column])

    # 결측치가 아닌 값에 대한 임베딩 생성
    embeddings = create_embeddings(non_missing_rows[text_column].tolist())

    # 결측치가 있는 행을 반복하고 가장 유사한 결측치가 없는 값을 사용하여 채우기
    for index, missing_row in missing_rows.iterrows():
        missing_embedding = create_embeddings([missing_row[text_column]])[0]
        similarities = cosine_similarity([missing_embedding], embeddings)[0]
        most_similar_index = similarities.argmax()

        # 결측치를 가장 유사한 결측치가 없는 값으로 채우기
        df.at[index, text_column] = non_missing_rows.iloc[most_similar_index][text_column]

# 'text_data' 열에서 결측치를 채우기 위해 함수 적용
fill_missing_with_similarity(df_train, 'text_data')
fill_missing_with_similarity(df_test, 'text_data')

In [44]:
# TF-IDF 변환
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(df_train['text_data'])
tfidf_test = tfidf_vectorizer.transform(df_test['text_data'])

# TruncatedSVD를 사용하여 차원 축소
svd = TruncatedSVD(n_components=50, random_state=42)
svd_train = svd.fit_transform(tfidf_train)
svd_test = svd.transform(tfidf_test)

# 변환된 데이터를 데이터프레임에 추가
df_svd_train = pd.DataFrame(svd_train, columns=[f'svd_{i}' for i in range(svd_train.shape[1])])
df_svd_test = pd.DataFrame(svd_test, columns=[f'svd_{i}' for i in range(svd_test.shape[1])])

df_train = pd.concat([df_train, df_svd_train], axis=1)
df_test = pd.concat([df_test, df_svd_test], axis=1)

# 원래의 텍스트 열 및 중간 열들 삭제
df_train.drop(text_columns + ['text_data'], axis=1, inplace=True)
df_test.drop(text_columns + ['text_data'], axis=1, inplace=True)

In [45]:

# 훈련 데이터와 검증 데이터로 분할
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

cat_features_col = list(x_train.columns)
cat_features_col.remove("bant_submit")
cat_features_col.remove("com_reg_ver_win_rate_standardized")
cat_features_col.remove("historical_existing_cnt_standardized")
cat_features_col.remove("lead_desc_length_standardized")
cat_features_col.remove("ver_cus")
cat_features_col.remove("ver_pro")
cat_features_col.remove("ver_win_rate_x_normalized")
cat_features_col.remove("ver_win_ratio_per_bu_normalized")

for i in range(50):
  cat_features_col.remove("svd_"+str(i))

In [60]:
# Calculate the most frequent value (mode) for each column
modes = df_train.mode().iloc[0]

# Fill missing (NA) values with the mode
df_train.fillna(modes, inplace=True)

# Calculate the most frequent value (mode) for each column
modes = df_test.mode().iloc[0]

# Fill missing (NA) values with the mode
df_test.fillna(modes, inplace=True)

In [70]:
'''
df_train과 df_test에서 숫자 형식인거 다 str로 바꾸기 (일부제외)
'''

# 변환할 라벨들의 리스트
labels_to_convert = [
    "bant_submit",
    "com_reg_ver_win_rate_standardized",
    "historical_existing_cnt_standardized",
    # "id_strategic_ver",
    # "it_strategic_ver",
    # "idit_strategic_ver",
    "lead_desc_length_standardized",
    "ver_cus",
    "ver_pro",
    "ver_win_rate_x_normalized",
    "ver_win_ratio_per_bu_normalized"
]

# df_test와 df_train에서 라벨들의 데이터 타입을 str로 변환
for label in labels_to_convert:
    if label in df_test.columns:
        df_test[label] = df_test[label].astype(str)
    if label in df_train.columns:
        df_train[label] = df_train[label].astype(str)

df_train['is_converted'] = df_train['is_converted'].astype(int)
print(df_test['is_converted'].isnull())
df_test['is_converted'] = df_test['is_converted'].astype(int)
print(df_train['is_converted'].dtype)
print(df_test['is_converted'].dtype)

0       False
1       False
2       False
3       False
4       False
        ...  
5266    False
5267    False
5268    False
5269    False
5270    False
Name: is_converted, Length: 5271, dtype: bool


ValueError: invalid literal for int() with base 10: 'nan'

In [58]:
catboost_model = CatBoostClassifier(
    verbose=0,
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    one_hot_max_size=5,
    cat_features=cat_features_col
)

In [59]:
catboost_model.fit(x_train, y_train)

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=6]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [23]:
# 검증 데이터에 대한 예측 및 성능 평가
y_val_pred = catboost_model.predict(x_val)
y_val_pred = np.array([True if pred == 'True' else False for pred in y_val_pred])

print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred, labels=[True, False]))
print("\nAccuracy: {:.4f}".format(accuracy_score(y_val, y_val_pred)))
print("Precision: {:.4f}".format(precision_score(y_val, y_val_pred, labels=[True, False])))
print("Recall: {:.4f}".format(recall_score(y_val, y_val_pred)))
print("F1 Score: {:.4f}".format(f1_score(y_val, y_val_pred)))

Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.99      0.99     10913
        True       0.92      0.79      0.85       947

    accuracy                           0.98     11860
   macro avg       0.95      0.89      0.92     11860
weighted avg       0.98      0.98      0.98     11860

Confusion Matrix:
 [[  745   202]
 [   66 10847]]

Accuracy: 0.9774
Precision: 0.9186
Recall: 0.7867
F1 Score: 0.8476


In [31]:
x_test = df_test.drop(["is_converted", "id"], axis=1)

test_pred = catboost_model.predict(x_test)

df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub['is_converted'] = df_sub['is_converted'].astype(bool)
print(df_sub['is_converted'])

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

['True' 'True' 'False' ... 'False' 'False' 'True']
