In [2]:
import os
import random
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.ensemble import BaggingClassifier, StackingClassifier

# from sklego.linear_model import DemographicParityClassifier, EqualOpportunityClassifier

from utils_two import *

# nltk.download()

In [3]:
# Fix Seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(42)

In [4]:
# Load Data
train = pd.read_csv('./data/train.csv').drop('ID', axis=1)
test = pd.read_csv('./data/test.csv').drop('ID', axis=1)
submission = pd.read_csv('./data/sample_submission.csv')

In [5]:
# 문자열 전처리
cols = ['first_party', 'second_party', 'facts']
shortword = re.compile(r'\W*\b\w{1}\b')
tokenizer = TreebankWordTokenizer()
stopword = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

first_train, second_train, facts_train = preprocessing(train, cols, shortword, tokenizer, stopword, lemmatizer)
first_test, second_test, facts_test = preprocessing(test, cols, shortword, tokenizer, stopword, lemmatizer)

vec = CountVectorizer(ngram_range=(1,2))
vec_facts = TfidfVectorizer(ngram_range=(1,2))

X_train = preprocessing_2(first_train, second_train, facts_train, vec, vec_facts)
y_train = train['first_party_winner']
X_test = preprocessing_2(first_test, second_test, facts_test, vec, vec_facts, train=False)

In [6]:
print('<train 데이터>')
print(X_train.shape, y_train.shape)
print()
print('<test 데이터>')
print(X_test.shape)

<train 데이터>
(2478, 211292) (2478,)

<test 데이터>
(1240, 211292)


In [7]:
# 데이터 불균형 문제 전처리(언더샘플링)
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=3).fit_resample(X_train, y_train)
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(y_nc.value_counts())

Train Data Shape after UnderSampling
(1643, 211292) (1643,)
Train target after UnderSampling
first_party_winner
0    829
1    814
Name: count, dtype: int64


In [8]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.25, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

Train Data Shape
(1232, 211292) (1232,)
--------------------
Train target
first_party_winner
0    622
1    610
Name: count, dtype: int64
Validation Data Shape
(411, 211292) (411,)
--------------------
Validation target
first_party_winner
0    207
1    204
Name: count, dtype: int64


In [26]:
Logistic = LogisticRegression(max_iter=500, random_state=42, C=10)
Logistic.fit(Train_X, Train_y)
# print(classification_report(Val_y, Logistic.predict(Val_X)))
cv_score = cross_val_score(Logistic, Val_X, Val_y)
print(cv_score.mean())
print(cv_score)

0.6300617102556568
[0.6746988  0.54878049 0.64634146 0.67073171 0.6097561 ]


In [9]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
# print(classification_report(Val_y, Logistic.predict(Val_X)))
cv_score = cross_val_score(Logistic, Val_X, Val_y)

In [25]:
print(cv_score.mean())
print(cv_score)

0.6349985307081988
[0.65060241 0.54878049 0.65853659 0.67073171 0.64634146]


In [44]:
from sklearn.metrics import accuracy_score as acc
tr_df = pd.DataFrame(X_nc)
tr_df['target'] = y_nc
tr_df

oof_scores = []
models = []
pred_lst = []

kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
n_iter = 0
for train_idx, val_idx in kfold.split(tr_df, tr_df['target']):
    tr_x = tr_df.drop('target', axis=1)
    tr_y = tr_df['target']

    n_iter+=1
    model = LogisticRegression(max_iter=500, random_state=42)
    models.append(model)
    model.fit(tr_x.iloc[train_idx], tr_y.iloc[train_idx])
    pred = model.predict(X_test)
    pred_lst.append(pred)
    score = acc(tr_y.iloc[val_idx], model.predict(tr_x.iloc[val_idx]))
    oof_scores.append(score)
    

In [53]:
pr_res = pred_lst[0]+pred_lst[1]+pred_lst[2]+pred_lst[3]+pred_lst[4]

res = np.where(pr_res>=3,1,0)
submission['first_party_winner'] = res
submission.to_csv('oof_pred.csv', index=False)

In [63]:
temd = PassiveAggressiveClassifier(random_state=42)
temd.fit(Train_X, Train_y)
print(classification_report(Val_y, temd.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.67      0.70      0.68       207
           1       0.68      0.65      0.66       204

    accuracy                           0.67       411
   macro avg       0.67      0.67      0.67       411
weighted avg       0.67      0.67      0.67       411



In [64]:
temd = SGDClassifier(random_state=42)
temd.fit(Train_X, Train_y)
print(classification_report(Val_y, temd.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.69      0.55      0.61       207
           1       0.62      0.75      0.68       204

    accuracy                           0.65       411
   macro avg       0.66      0.65      0.65       411
weighted avg       0.66      0.65      0.65       411



In [65]:
temd = RidgeClassifier(random_state=42)
temd.fit(Train_X, Train_y)
print(classification_report(Val_y, temd.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.64      0.69      0.67       207
           1       0.66      0.61      0.64       204

    accuracy                           0.65       411
   macro avg       0.65      0.65      0.65       411
weighted avg       0.65      0.65      0.65       411



In [69]:
estimators = [
    ('Ridge', RidgeClassifier(random_state=42)),
    ('SGD', SGDClassifier(random_state=42)),
    ('PA', PassiveAggressiveClassifier(random_state=42))
]
Stack = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(max_iter=500, random_state=42),
    cv=4
    )
Stack.fit(X_nc, y_nc)
Stack_res = Stack.predict(X_test)

In [71]:
submission['first_party_winner']=Stack_res
submission.to_csv('stacking.csv',index=False)

In [9]:
# submission['first_party_winner'] = Logistic.predict(X_test)
# submission.to_csv('logi_linear.csv', index=False)

In [10]:
submission['first_party_winner'] = Logistic.predict(X_test)
submission.to_csv('logi___2_test.csv', index=False)