In [6]:
import os
import random
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from imblearn.under_sampling import NeighbourhoodCleaningRule

import tensorflow as tf

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [7]:
# Fix Seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

seed_everything(42)

In [8]:
# Load Data
train = pd.read_csv('./data/train.csv').drop('ID', axis=1)
test = pd.read_csv('./data/test.csv').drop('ID', axis=1)
submission = pd.read_csv('./data/sample_submission.csv')

In [9]:
# Preprocessing(1)
cols = ['first_party', 'second_party', 'facts']

for col in cols:
    # 문자열 데이터 좌우 공백 제거
    train[col] = train[col].str.strip()
    test[col] = test[col].str.strip()

    # 두 칸 이상의 공백 한 칸으로 변경
    train[col] = train[col].str.replace('  ', ' ')
    test[col] = train[col].str.replace('  ', ' ')

    # 소문자로 변경
    train[col] = train[col].str.lower()
    test[col] = test[col].str.lower()

    # ",", "." 제거
    train[col] = train[col].str.replace(',','')
    train[col] = train[col].str.replace('.','')
    test[col] = test[col].str.replace(',','')
    test[col] = test[col].str.replace('.','')

In [10]:
# preprocessing(2)
first_party_list = []
first_party_list_test = []
second_party_list = []
second_party_list_test = []
facts_list = []
facts_list_test = []

shortword = re.compile(r'\W*\b\w{1}\b')
tokenizer = TreebankWordTokenizer()
stopword_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


# first_party

## train
for first in train['first_party']:
    # 1글자 단어 제거
    first = shortword.sub('', first)
    # 특수문자 제거
    first = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", first)
    # TreebankWordTokenizer를 이용한 단어 토큰화
    token = tokenizer.tokenize(first)
    # 불용어 제거
    new_token = []
    for tok in token:
        if tok not in stopword_list:
            # 표제어 추출
            new_token.append(lemmatizer.lemmatize(tok, 'n'))

    first_party_list.append(new_token)

# sklearn.feature_extraction 변환을 위해 단어들을 하나로 결합
for i in range(len(first_party_list)):
    first_party_list[i] = ' '.join(first_party_list[i])

## test
for first in test['first_party']:
    # 1글자 단어 제거
    first = shortword.sub('', first)
    # 특수문자 제거
    first = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", first)
    # TreebankWordTokenizer를 이용한 단어 토큰화
    token = tokenizer.tokenize(first)
    # 불용어 제거
    new_token = []
    for tok in token:
        if tok not in stopword_list:
            # 표제어 추출
            new_token.append(lemmatizer.lemmatize(tok, 'n'))

    first_party_list_test.append(new_token)

# sklearn.feature_extraction 변환을 위해 단어들을 하나로 결합
for i in range(len(first_party_list_test)):
    first_party_list_test[i] = ' '.join(first_party_list_test[i])


# second_party

## train
for second in train['second_party']:
    # 1글자 단어 제거
    second = shortword.sub('', second)
    # 특수문자 제거
    second = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", second)
    # TreebankWordTokenizer를 이용한 단어 토큰화
    token = tokenizer.tokenize(second)
    # 불용어 제거
    new_token = []
    for tok in token:
        if tok not in stopword_list:
            # 표제어 추출
            new_token.append(lemmatizer.lemmatize(tok, 'n'))

    second_party_list.append(new_token)

# sklearn.feature_extraction 변환을 위해 단어들을 하나로 결합
for i in range(len(second_party_list)):
    second_party_list[i] = ' '.join(second_party_list[i])

## test
for second in test['second_party']:
    # 1글자 단어 제거
    second = shortword.sub('', second)
    # 특수문자 제거
    second = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", second)
    # TreebankWordTokenizer를 이용한 단어 토큰화
    token = tokenizer.tokenize(second)
    # 불용어 제거
    new_token = []
    for tok in token:
        if tok not in stopword_list:
            # 표제어 추출
            new_token.append(lemmatizer.lemmatize(tok, 'n'))

    second_party_list_test.append(new_token)

# sklearn.feature_extraction 변환을 위해 단어들을 하나로 결합
for i in range(len(second_party_list_test)):
    second_party_list_test[i] = ' '.join(second_party_list_test[i])


# facts

## train
for fact in train['facts']:
    # 1글자 단어 제거
    fact = shortword.sub('', fact)
    # 특수문자 제거
    fact = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", fact)
    # TreebankWordTokenizer를 이용한 단어 토큰화
    token = tokenizer.tokenize(fact)
    # 불용어 제거
    new_token = []
    for tok in token:
        if tok not in stopword_list:
            new_token.append(tok)

    facts_list.append(new_token)

# sklearn.feature_extraction 변환을 위해 단어들을 하나로 결합
for i in range(len(facts_list)):
    facts_list[i] = ' '.join(facts_list[i])

## test
for fact in test['facts']:
    # 1글자 단어 제거
    fact = shortword.sub('', fact)
    # 특수문자 제거
    fact = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", fact)
    # TreebankWordTokenizer를 이용한 단어 토큰화
    token = tokenizer.tokenize(fact)
    # 불용어 제거
    new_token = []
    for tok in token:
        if tok not in stopword_list:
            new_token.append(tok)

    facts_list_test.append(new_token)

# sklearn.feature_extraction 변환을 위해 단어들을 하나로 결합
for i in range(len(facts_list_test)):
    facts_list_test[i] = ' '.join(facts_list_test[i])

In [11]:
# preprocessing(3)
# first, second_party -> count벡터화
# facts -> tfidf벡터화
vectorizer = CountVectorizer(ngram_range=(1,2))
vectorizer_fact = TfidfVectorizer(ngram_range=(1,2))

vectorizer.fit(first_party_list + second_party_list)
vectorizer_fact.fit(facts_list)

X1 = vectorizer.transform(first_party_list).toarray()
X2 = vectorizer.transform(second_party_list).toarray()
X3 = vectorizer.transform(facts_list).toarray()

X_train = np.concatenate([X1, X2, X3], axis=1)

X1 = vectorizer.transform(first_party_list_test).toarray()
X2 = vectorizer.transform(second_party_list_test).toarray()
X3 = vectorizer.transform(facts_list_test).toarray()

X_test = np.concatenate([X1, X2, X3], axis=1)

y_train = train['first_party_winner']

In [12]:
print('<train 데이터>')
print(X_train.shape, y_train.shape)
print()
print('<test 데이터>')
print(X_test.shape)

<train 데이터>
(2478, 33984) (2478,)

<test 데이터>
(1240, 33984)


In [20]:
# 언더샘플링
X_NC, Y_NC = NeighbourhoodCleaningRule(n_neighbors=5).fit_resample(X_train, y_train)
train_x, val_x, train_y, val_y = train_test_split(X_NC, Y_NC, test_size=.2, stratify=Y_NC, random_state=42)

In [25]:
print('Original')
display(y_train.value_counts())
print('='*30)
print('NCRule DownSampling')
display(Y_NC.value_counts())
print('='*30)
print('Train')
display(train_y.value_counts())
print('='*30)
print('Validation')
display(val_y.value_counts())

Original


first_party_winner
1    1649
0     829
Name: count, dtype: int64

NCRule DownSampling


first_party_winner
1    1007
0     829
Name: count, dtype: int64

Train


first_party_winner
1    805
0    663
Name: count, dtype: int64

Validation


first_party_winner
1    202
0    166
Name: count, dtype: int64

In [None]:
rom sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [27]:
Linear = LinearRegression()
Linear.fit(train_x, train_y)
print(classification_report(val_y, np.where(Linear.predict(val_x)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       0.50      0.59      0.54       166
           1       0.60      0.51      0.56       202

    accuracy                           0.55       368
   macro avg       0.55      0.55      0.55       368
weighted avg       0.56      0.55      0.55       368



In [28]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(train_x, train_y)
print(classification_report(val_y, Logistic.predict(val_x)))

              precision    recall  f1-score   support

           0       0.52      0.52      0.52       166
           1       0.61      0.61      0.61       202

    accuracy                           0.57       368
   macro avg       0.57      0.57      0.57       368
weighted avg       0.57      0.57      0.57       368



In [32]:
# Lasso
Lasso = ElasticNet(alpha=0, random_state=42)
Lasso.fit(train_x, train_y)
print(classification_report(val_y, np.where(Lasso.predict(val_x)>.5, 1, 0)))

  Lasso.fit(train_x, train_y)
  model = cd_fast.enet_coordinate_descent(


              precision    recall  f1-score   support

           0       0.46      0.42      0.44       166
           1       0.56      0.59      0.57       202

    accuracy                           0.52       368
   macro avg       0.51      0.51      0.51       368
weighted avg       0.51      0.52      0.51       368



In [33]:
# Ridge
Ridge = ElasticNet(l1_ratio=0, random_state=42)
Ridge.fit(train_x, train_y)
print(classification_report(val_y, np.where(Ridge.predict(val_x)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       0.53      0.23      0.33       166
           1       0.57      0.83      0.68       202

    accuracy                           0.56       368
   macro avg       0.55      0.53      0.50       368
weighted avg       0.55      0.56      0.52       368



  model = cd_fast.enet_coordinate_descent(


In [35]:
Elastic = ElasticNet(random_state=42)
Elastic.fit(train_x, train_y)
print(classification_report(val_y, np.where(Elastic.predict(val_x)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       166
           1       0.55      1.00      0.71       202

    accuracy                           0.55       368
   macro avg       0.27      0.50      0.35       368
weighted avg       0.30      0.55      0.39       368



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
Tree = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=42)
Tree.fit(train_x, train_y)
print(classification_report(val_y, Tree.predict(val_x)))

              precision    recall  f1-score   support

           0       0.72      0.20      0.31       166
           1       0.59      0.94      0.72       202

    accuracy                           0.60       368
   macro avg       0.65      0.57      0.52       368
weighted avg       0.65      0.60      0.54       368

