In [94]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [77]:
# Read the training data, the test data and the sample submission file from
# the current working directory, in that order.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [78]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1


In [79]:
# Summary statistics for every column (include='all' also covers the non-numeric ones).
train.describe(include='all')

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
count,2478,2478,2478,2478,2478.0
unique,2478,2110,1974,2478,
top,TRAIN_0000,United States,United States,"On June 27, 1962, Phil St. Amant, a candidate ...",
freq,1,154,240,1,
mean,,,,,0.665456
std,,,,,0.471926
min,,,,,0.0
25%,,,,,0.0
50%,,,,,1.0
75%,,,,,1.0


In [80]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB


In [81]:
# Remove the 'ID' column from both frames.
# A non-mutating drop with errors='ignore' keeps this cell re-runnable: the
# original in-place drop raised KeyError when the cell was executed a second
# time, because 'ID' was already gone.
train = train.drop(columns='ID', errors='ignore')
test = test.drop(columns='ID', errors='ignore')

In [82]:
# Text columns whose surrounding whitespace is stripped in train and test.
# They are named explicitly (instead of slicing `train.columns[:-1]`) so the
# cell still works when re-run after the flag columns below were appended.
# NOTE(review): assumes test has no other text column that needs stripping.
TEXT_COLUMNS = ['first_party', 'second_party', 'facts']


def strip_text_columns(df):
    """Return a copy of ``df`` with leading/trailing whitespace removed from its text columns.

    Only the columns listed in ``TEXT_COLUMNS`` that are present in ``df`` are touched.
    """
    present = [col for col in TEXT_COLUMNS if col in df.columns]
    return df.assign(**{col: df[col].str.strip() for col in present})


def add_party_flags(df):
    """Return a copy of ``df`` with 0/1 marker flags derived from the party names.

    For each side (``first`` / ``second``) two integer columns are added, in
    the order first_many, first_company, second_many, second_company:

    * ``<side>_many``    -- 1 when the party name contains "et al."
    * ``<side>_company`` -- 1 when the party name contains "Inc."

    Existing columns with these names are overwritten, so the function can be
    applied repeatedly without changing the result.
    """
    flags = {}
    for side in ('first', 'second'):
        party = df[f'{side}_party']
        # regex=False: plain substring test, same as the `"..." in name` check
        # it replaces. na=False: a missing name counts as "marker absent".
        flags[f'{side}_many'] = party.str.contains('et al.', regex=False, na=False).astype('int64')
        flags[f'{side}_company'] = party.str.contains('Inc.', regex=False, na=False).astype('int64')
    return df.assign(**flags)


# 1) Flag parties that look like groups ("et al.") or companies ("Inc.").
train = add_party_flags(strip_text_columns(train))
test = add_party_flags(strip_text_columns(test))

In [83]:
# TF-IDF vectorizer with default settings; the same instance is passed to
# get_vector for both the train and the test frame.
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    """Encode the three text columns of ``df`` into one dense feature array.

    Parameters
    ----------
    vectorizer : object
        Anything exposing ``fit_transform(texts)`` and ``transform(texts)``
        that return sparse matrices with a ``toarray()`` method.
    df : pandas.DataFrame
        Must contain the columns 'facts', 'first_party' and 'second_party'.
    train_mode : bool
        If true, the vectorizer is fitted on ``df['facts']`` before encoding;
        otherwise the vectorizer is used as-is and must already be fitted.

    Returns
    -------
    numpy.ndarray
        One row per row of ``df``; the columns are the encoded first party,
        second party and facts blocks, concatenated in that order.
    """
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    # The party names are only transformed, never fitted on: they are encoded
    # with whatever the vectorizer learned from the facts text.
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # .toarray() yields plain ndarrays. The previous .todense() produced
    # np.matrix objects, which NumPy discourages and which keep their 2-D
    # matrix semantics in every downstream operation.
    return np.hstack([X_party1.toarray(), X_party2.toarray(), X_facts.toarray()])

In [84]:
# Target column of the training frame.
Y_train = train["first_party_winner"]

# Fit the vectorizer while encoding train, then reuse the fitted vectorizer for test.
X_train = get_vector(vectorizer, train, train_mode=True)
X_test = get_vector(vectorizer, test, train_mode=False)

In [85]:
# Append the four party flag columns to the right of the vectorized text features.
# Note: X_train / X_test are overwritten, so re-running this cell without
# rebuilding them first would append the flag columns a second time.
flag_columns = ['first_many', 'first_company', 'second_many', 'second_company']
X_train = pd.concat(
    [pd.DataFrame(X_train), train[flag_columns]],
    axis=1,
)
X_test = pd.concat(
    [pd.DataFrame(X_test), test[flag_columns]],
    axis=1,
)

In [86]:
# Stratified 80/20 hold-out split: both parts keep the class ratio of Y_train.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=.2,
    stratify=Y_train,
    random_state=42,
)

# Cast every feature frame to float.
X_tr, X_val, X_test = (frame.astype(float) for frame in (X_tr, X_val, X_test))

# Print the number of rows in the train part, the validation part and the test set.
for frame in (X_tr, X_val, X_test):
    print(len(frame))

# Still undecided about how to balance the class ratio...

1982
496
1240


In [87]:
# Give the four flag columns integer labels, like the default integer labels
# of the vectorized text columns (presumably so that all column names share
# one type for the estimators -- confirm).
flag_labels = {
    'first_many': 99001,
    'first_company': 99002,
    'second_many': 99003,
    'second_company': 99004,
}
X_tr = X_tr.rename(columns=flag_labels)
X_val = X_val.rename(columns=flag_labels)
X_test = X_test.rename(columns=flag_labels)

In [88]:
# Baseline: logistic regression fitted on the training part and scored on the
# validation part.
model1 = LogisticRegression(random_state=42).fit(X_tr, Y_tr)
val_score = model1.score(X_val, Y_val)
print(val_score)

0.6491935483870968


In [90]:
# Class counts of the target in the training part of the split.
Y_tr.value_counts()

first_party_winner
1    1319
0     663
Name: count, dtype: int64

In [91]:
# Class counts of the target in the validation part of the split.
Y_val.value_counts()

first_party_winner
1    330
0    166
Name: count, dtype: int64

In [92]:
# Weight applied to the positive class: (#negatives / #positives) in the
# training part. It is derived from Y_tr instead of hard-coding
# 0.5026535253980288 (= 663 / 1319, the counts of the current split), so it
# stays correct if the data, the split ratio or the random seed change.
# float(...) turns the NumPy scalar into a plain Python float.
pos_weight = float((Y_tr == 0).sum() / (Y_tr == 1).sum())

model2 = LGBMClassifier(scale_pos_weight=pos_weight, random_state=42)
model3 = XGBClassifier(scale_pos_weight=pos_weight, random_state=42)

In [93]:
# Fit each boosted model on the training part and print its score on the
# validation part (model2 first, then model3).
for booster in (model2, model3):
    booster.fit(X_tr, Y_tr)
    print(booster.score(X_val, Y_val))

0.5987903225806451
0.6169354838709677


In [97]:
# Weight applied to the positive class: (#negatives / #positives) in the
# training part, computed from Y_tr rather than hard-coded as
# 0.5026535253980288 (= 663 / 1319 for the current split).
catboost_pos_weight = float((Y_tr == 0).sum() / (Y_tr == 1).sum())

# NOTE(review): X_val / Y_val serve as eval_set for use_best_model here and
# are also the data this model is scored on afterwards, so that score is
# measured on data that already influenced model selection.
model4 = CatBoostClassifier(scale_pos_weight=catboost_pos_weight, random_state=42, use_best_model=True)
# verbose=100: log the learn/test metric every 100 iterations.
model4.fit(X_tr, Y_tr, eval_set=(X_val, Y_val), verbose=100)

Learning rate set to 0.037523
0:	learn: 0.6913894	test: 0.6926936	best: 0.6926936 (0)	total: 49.6ms	remaining: 49.6s
100:	learn: 0.5859336	test: 0.6991115	best: 0.6880875 (23)	total: 4.9s	remaining: 43.6s
200:	learn: 0.5132920	test: 0.7058491	best: 0.6880875 (23)	total: 9.58s	remaining: 38.1s
300:	learn: 0.4205290	test: 0.7162075	best: 0.6880875 (23)	total: 14.2s	remaining: 33.1s
400:	learn: 0.3427533	test: 0.7320026	best: 0.6880875 (23)	total: 18.9s	remaining: 28.2s
500:	learn: 0.2843163	test: 0.7447566	best: 0.6880875 (23)	total: 23.5s	remaining: 23.4s
600:	learn: 0.2410538	test: 0.7599966	best: 0.6880875 (23)	total: 28.2s	remaining: 18.7s
700:	learn: 0.2052426	test: 0.7733845	best: 0.6880875 (23)	total: 32.8s	remaining: 14s
800:	learn: 0.1763995	test: 0.7889394	best: 0.6880875 (23)	total: 37.4s	remaining: 9.3s
900:	learn: 0.1528489	test: 0.8007475	best: 0.6880875 (23)	total: 42.1s	remaining: 4.62s
999:	learn: 0.1325559	test: 0.8167452	best: 0.6880875 (23)	total: 46.7s	remaining: 0us

<catboost.core.CatBoostClassifier at 0x21cce126f80>

In [98]:
# Print model4's score on the validation part of the split.
print(model4.score(X_val, Y_val))

0.5544354838709677


In [104]:
# Fill the submission frame with model1's predictions for the test set and
# write it to disk without the index column.
submission = submission.assign(first_party_winner=model1.predict(X_test))
submission.to_csv('0612_logistic.csv', index=False)

In [105]:
# Fill the submission frame with model3's predictions for the test set and
# write it to disk without the index column.
submission = submission.assign(first_party_winner=model3.predict(X_test))
submission.to_csv('0612_XGB.csv', index=False)

In [106]:
# Fill the submission frame with model2's predictions for the test set and
# write it to disk without the index column.
submission = submission.assign(first_party_winner=model2.predict(X_test))
submission.to_csv('0612_lgbm.csv', index=False)