# Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, BaggingClassifier

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import tensorflow as tf
from tensorflow import keras

from utils_two import *

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# Fix Seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

seed_everything(42)

# Load Data

In [3]:
# Load Data
train = pd.read_csv('./data/train.csv').drop('ID', axis=1)
test = pd.read_csv('./data/test.csv').drop('ID', axis=1)
submission = pd.read_csv('./data/sample_submission.csv')

# Preprocessing

In [4]:
# 문자열 전처리
cols = ['first_party', 'second_party', 'facts']
shortword = re.compile(r'\W*\b\w{1}\b')
tokenizer = TreebankWordTokenizer()
stopword = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

first_train, second_train, facts_train = preprocessing(train, cols, shortword, tokenizer, stopword, lemmatizer)
first_test, second_test, facts_test = preprocessing(test, cols, shortword, tokenizer, stopword, lemmatizer)

vec = CountVectorizer(ngram_range=(1,2))
vec_facts = TfidfVectorizer(ngram_range=(1,2))

X_train = preprocessing_2(first_train, second_train, facts_train, vec, vec_facts)
y_train = train['first_party_winner']
X_test = preprocessing_2(first_test, second_test, facts_test, vec, vec_facts, train=False)

# Modeling

In [5]:
print('Train Data Shape')
print(X_train.shape, y_train.shape)
print('='*20)
print('Train target')
print(y_train.value_counts())
print('='*20)
print('Test Data Shape')
print(X_test.shape)

Train Data Shape
(2478, 211292) (2478,)
Train target
first_party_winner
1    1649
0     829
Name: count, dtype: int64
Test Data Shape
(1240, 211292)


In [6]:
# 데이터 불균형 문제 전처리(언더샘플링)
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=3).fit_resample(X_train, y_train)
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(y_nc.value_counts())

Train Data Shape after UnderSampling
(1643, 211292) (1643,)
Train target after UnderSampling
first_party_winner
0    829
1    814
Name: count, dtype: int64


In [7]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.25, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

Train Data Shape
(1232, 211292) (1232,)
--------------------
Train target
first_party_winner
0    622
1    610
Name: count, dtype: int64
Validation Data Shape
(411, 211292) (411,)
--------------------
Validation target
first_party_winner
0    207
1    204
Name: count, dtype: int64


In [8]:
Linear = LinearRegression()
Linear.fit(Train_X, Train_y)
print(classification_report(Val_y, np.where(Linear.predict(Val_X)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       0.65      0.69      0.67       207
           1       0.66      0.62      0.64       204

    accuracy                           0.65       411
   macro avg       0.66      0.65      0.65       411
weighted avg       0.66      0.65      0.65       411



In [9]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.66      0.69      0.68       207
           1       0.67      0.64      0.66       204

    accuracy                           0.67       411
   macro avg       0.67      0.67      0.67       411
weighted avg       0.67      0.67      0.67       411



In [10]:
Tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
Tree.fit(Train_X, Train_y)
print(classification_report(Val_y, Tree.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.60      0.53      0.56       207
           1       0.57      0.64      0.61       204

    accuracy                           0.59       411
   macro avg       0.59      0.59      0.59       411
weighted avg       0.59      0.59      0.59       411



In [11]:
LGBM =LGBMClassifier(objective='binary', n_estimators=1000, learning_rate=.01, metric='binary_error', random_state=42)
LGBM.fit(Train_X, Train_y, eval_metric='binary_error', eval_set=[(Train_X, Train_y), (Val_X, Val_y)], verbose=False, early_stopping_rounds=100)
print(classification_report(Val_y, LGBM.predict(Val_X)))



              precision    recall  f1-score   support

           0       0.64      0.65      0.64       207
           1       0.64      0.62      0.63       204

    accuracy                           0.64       411
   macro avg       0.64      0.64      0.64       411
weighted avg       0.64      0.64      0.64       411



In [12]:
CAT = CatBoostClassifier(iterations=500, objective='Logloss', eval_metric='Logloss', random_state=42, use_best_model=True)
CAT.fit(Train_X, Train_y, eval_set=(Val_X, Val_y), early_stopping_rounds=100, verbose=False)
print(classification_report(Val_y, CAT.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.59      0.60      0.60       207
           1       0.59      0.57      0.58       204

    accuracy                           0.59       411
   macro avg       0.59      0.59      0.59       411
weighted avg       0.59      0.59      0.59       411



In [53]:
Bag = BaggingClassifier(estimator=LogisticRegression(max_iter=500, random_state=42), random_state=42)
Bag.fit(Train_X, Train_y)
print(classification_report(Val_y, Bag.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.66      0.71      0.68       207
           1       0.68      0.63      0.65       204

    accuracy                           0.67       411
   macro avg       0.67      0.67      0.67       411
weighted avg       0.67      0.67      0.67       411



In [54]:
Vot = VotingClassifier(estimators=[
    ('Logistic', LogisticRegression(max_iter=500, random_state=42)),
    ('LGBM', LGBMClassifier(objective='binary', n_estimators=23, learning_rate=.01, metric='binary_error', random_state=42))
    # ('Tree', DecisionTreeClassifier(criterion='entropy', random_state=42)),
    ])
Vot.fit(Train_X, Train_y)
print(classification_report(Val_y, Vot.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.62      0.83      0.71       207
           1       0.73      0.48      0.58       204

    accuracy                           0.65       411
   macro avg       0.68      0.65      0.64       411
weighted avg       0.68      0.65      0.64       411



In [52]:
Vot = VotingClassifier(estimators=[
    ('Logistic', LogisticRegression(max_iter=500, random_state=42)),
    ('LGBM', LGBMClassifier(objective='binary', n_estimators=23, learning_rate=.01, metric='binary_error', random_state=42))
    # ('Tree', DecisionTreeClassifier(criterion='entropy', random_state=42)),
    ])
Vot.fit(Train_X, Train_y)
print(classification_report(Val_y, Vot.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.66      0.65      0.66       207
           1       0.65      0.66      0.66       204

    accuracy                           0.66       411
   macro avg       0.66      0.66      0.66       411
weighted avg       0.66      0.66      0.66       411



In [23]:
batch_size = 64
input_dim = Train_X.shape[1]
units = 512
epoch = 50

model = keras.models.Sequential([
        keras.layers.Dense(units, input_shape=(input_dim,), activation='relu'),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dropout(.5),
        keras.layers.Dense(1, activation='sigmoid')
])

es = keras.callbacks.EarlyStopping(monitor='val_acc', patience=10, verbose=1, mode='max', restore_best_weights=True)
mc = keras.callbacks.ModelCheckpoint(f"./model/LSTM.h5", save_best_only=True)
rlr = keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=.9, patience=5, verbose=1, mode='max', min_lr=.0005)
csv = keras.callbacks.CSVLogger("./model/LSTM.log")

loss = keras.losses.BinaryCrossentropy()
optimizer = keras.optimizers.Adam(learning_rate=.0001)

model.summary()
model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 512)               108182016 
                                                                 
 dense_4 (Dense)             (None, 256)               131328    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 257       
                                                                 
Total params: 108,313,601
Trainable params: 108,313,601
Non-trainable params: 0
_________________________________________________________________


In [24]:
hist = model.fit(Train_X, Train_y, validation_data=(Val_X, Val_y), batch_size=batch_size, epochs=epoch, callbacks=[es, mc, rlr, csv])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 17: early stopping


In [25]:
print(classification_report(Val_y, np.where(model.predict(Val_X)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       0.71      0.57      0.63       207
           1       0.64      0.76      0.69       204

    accuracy                           0.67       411
   macro avg       0.67      0.67      0.66       411
weighted avg       0.67      0.67      0.66       411



In [15]:
print(classification_report(Val_y, np.where(model.predict(Val_X)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       0.66      0.69      0.67       207
           1       0.67      0.64      0.66       204

    accuracy                           0.66       411
   macro avg       0.66      0.66      0.66       411
weighted avg       0.66      0.66      0.66       411



In [22]:
res = model.predict(X_test)



In [30]:
res2 = model.predict(X_test)



In [31]:
model.evaluate(Val_X, Val_y)



[0.6568349003791809, 0.6666666865348816]

In [47]:
batch_size = 32
input_dim = Train_X.shape[1]
epoch = 30
embedding_dim= 4

model2 = keras.models.Sequential([
        keras.layers.Embedding(input_dim, embedding_dim),
        keras.layers.LSTM(128, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
])

es = keras.callbacks.EarlyStopping(monitor='val_acc', patience=10, verbose=1, mode='max', restore_best_weights=True)
mc = keras.callbacks.ModelCheckpoint(f"./model/LSTM.h5", save_best_only=True)
rlr = keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=.9, patience=5, verbose=1, mode='max', min_lr=.0005)
csv = keras.callbacks.CSVLogger("./model/LSTM.log")

loss = keras.losses.BinaryCrossentropy()
optimizer = keras.optimizers.Adam(learning_rate=.0001)

model2.summary()
model2.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 4)           845168    
                                                                 
 lstm_9 (LSTM)               (None, 128)               68096     
                                                                 
 dense_15 (Dense)            (None, 1)                 129       
                                                                 
Total params: 913,393
Trainable params: 913,393
Non-trainable params: 0
_________________________________________________________________


In [45]:
hist = model2.fit(Train_X, Train_y, validation_data=(Val_X, Val_y), batch_size=batch_size, epochs=epoch, callbacks=[es, mc, rlr, csv])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 19: early stopping


In [None]:
print(classification_report(Val_y, np.where(model.predict(Val_X)>.5, 1, 0)))

In [48]:
model2.evaluate(Val_X, Val_y)



[0.6931524276733398, 0.49635037779808044]

In [43]:
model.evaluate(Val_X, Val_y)



[0.8538404107093811, 0.6326034069061279]

In [37]:
model.evaluate(Val_X, Val_y)



[0.8349686861038208, 0.6301703453063965]

In [27]:
submission['first_party_winner'] = Logistic.predict(X_test)

In [29]:
submission.to_csv('logi.csv', index=False)