# Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
import re

from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Fix Seed

In [2]:
# Seed the random number generators used by this notebook
def seed_everything(seed):
    """Seed Python's `random` module, NumPy's global RNG, and PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to `random.seed` and `np.random.seed`, and written
        (as a string) to the ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    The environment variable is only read by Python at interpreter start-up,
    so setting it here influences child processes rather than the hash seed
    of the already-running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

seed_everything(42)

# Load Data

In [3]:
# Read the train/test CSVs (dropping their ID column) and the sample submission
train = pd.read_csv('./data/train.csv').drop(columns='ID')
test = pd.read_csv('./data/test.csv').drop(columns='ID')
submission = pd.read_csv('./data/sample_submission.csv')

# Preprocessing
- 문자열 전처리, 벡터화

In [4]:
# String preprocessing: shared resources that are passed to preprocessing()
# Names of the text columns to clean
cols = ['first_party', 'second_party', 'facts']
# Matches a one-character word together with any non-word characters directly before it
shortword = re.compile(r'\W*\b\w{1}\b')
# NLTK Treebank tokenizer used to split each cleaned string into tokens
tokenizer = TreebankWordTokenizer()
# NLTK English stop-word list (needs the NLTK 'stopwords' corpus to be available locally)
stopword = stopwords.words('english')
# WordNet lemmatizer (needs the NLTK 'wordnet' corpus to be available locally)
lemmatizer = WordNetLemmatizer()

# Preprocessing function 1: clean, tokenize and re-join the text columns
def _clean_text_column(series):
    """Return a normalized copy of a text Series; the input Series is not modified.

    Steps, in order: replace missing values with '', strip surrounding
    whitespace, collapse runs of two or more spaces into one, lower-case,
    and delete every ',' and '.' character.
    """
    return (
        series
        .fillna('')
        .str.strip()
        # A real collapse: a plain '  ' -> ' ' replacement leaves runs of 3+ spaces behind
        .str.replace(r' {2,}', ' ', regex=True)
        .str.lower()
        # regex=False is explicit because the default differs between pandas versions
        # and '.' must be treated as a literal dot, never as "any character"
        .str.replace(',', '', regex=False)
        .str.replace('.', '', regex=False)
    )


def _normalize_sample(sample, shortword, tokenizer, stopword, lemmatizer=None):
    """Turn one cleaned string into a space-joined string of kept tokens.

    Removes matches of `shortword`, strips every character that is not a
    Hangul syllable, digit, ASCII letter or whitespace, tokenizes, drops
    stop words, and lemmatizes each remaining token as a noun when a
    `lemmatizer` is given.
    """
    sample = shortword.sub('', sample)
    sample = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", sample)
    tokens = [tok for tok in tokenizer.tokenize(sample) if tok not in stopword]
    if lemmatizer is not None:
        tokens = [lemmatizer.lemmatize(tok, 'n') for tok in tokens]
    # Joined back into one string so sklearn's text vectorizers can consume it
    return ' '.join(tokens)


def preprocessing(df, cols, shortword, tokenizer, stopword, lemmatizer):
    """Clean and tokenize the 'first_party', 'second_party' and 'facts' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns. It is read only; the cleaned text is
        returned and `df` itself is left untouched.
    cols : iterable of str
        Column names to process. Names other than 'first_party',
        'second_party' and 'facts' are reported with a printed message
        and skipped.
    shortword : compiled regular expression
        Every match is deleted from each sample before tokenizing.
    tokenizer : object with ``tokenize(str) -> list[str]``
    stopword : iterable of str
        Tokens to discard.
    lemmatizer : object with ``lemmatize(token, pos) -> str``
        Applied (with pos 'n') to the two party columns only; 'facts'
        tokens are kept as they are.

    Returns
    -------
    tuple of (list[str], list[str], list[str])
        Processed documents for 'first_party', 'second_party' and 'facts',
        one string per row. A column that was not listed in `cols` yields
        an empty list. Missing values become empty strings.
    """
    # Set membership is O(1); the stop-word list would be scanned for every token
    stopword_set = set(stopword)
    processed = {'first_party': [], 'second_party': [], 'facts': []}

    for col in cols:
        if col not in processed:
            print('컬럼이름을 변경하지 말아주세요!')
            continue

        # Only the party names are lemmatized; the facts text is not
        col_lemmatizer = None if col == 'facts' else lemmatizer
        processed[col] = [
            _normalize_sample(sample, shortword, tokenizer, stopword_set, col_lemmatizer)
            for sample in _clean_text_column(df[col])
        ]

    return processed['first_party'], processed['second_party'], processed['facts']

# Preprocessing function 2 (vectorization)
def preprocessing_2(first, second, facts, vec, vec_facts, train=True):
    """Vectorize the three document lists and stack them column-wise.

    Parameters
    ----------
    first, second : list of str
        Processed party-name documents; both are transformed with `vec`.
    facts : list of str
        Processed facts documents; transformed with `vec_facts`.
    vec, vec_facts : vectorizers exposing ``fit`` and ``transform``
        When `train` is truthy, `vec` is fitted on `first + second` and
        `vec_facts` on `facts` before transforming; otherwise both are
        used as they are.
    train : bool, default True
        Whether to fit the vectorizers first.

    Returns
    -------
    numpy.ndarray
        Dense matrix whose columns are the `first`, `second` and `facts`
        features, in that order. Every block is densified with
        ``toarray()`` before concatenation.
    """
    if train:
        vec.fit(first + second)
        vec_facts.fit(facts)

    pairs = ((vec, first), (vec, second), (vec_facts, facts))
    dense_blocks = [vectorizer.transform(docs).toarray() for vectorizer, docs in pairs]
    return np.concatenate(dense_blocks, axis=1)

In [5]:
# String preprocessing 1: clean and tokenize the text columns of both frames.
# `cols`, `shortword`, `tokenizer`, `stopword` and `lemmatizer` are the objects
# defined alongside preprocessing(); they are reused here instead of being
# re-created so that the configuration lives in a single place.
first_train, second_train, facts_train = preprocessing(train, cols, shortword, tokenizer, stopword, lemmatizer)
first_test, second_test, facts_test = preprocessing(test, cols, shortword, tokenizer, stopword, lemmatizer)

# String preprocessing 2 (vectorization): unigram + bigram counts for the party
# names, unigram + bigram TF-IDF for the facts text
vec = CountVectorizer(ngram_range=(1,2))
vec_facts = TfidfVectorizer(ngram_range=(1,2))

# The vectorizers are fitted on the training documents only (train=True by default)
# and then reused unchanged for the test documents
X_train = preprocessing_2(first_train, second_train, facts_train, vec, vec_facts)
y_train = train['first_party_winner']
X_test = preprocessing_2(first_test, second_test, facts_test, vec, vec_facts, train=False)

In [6]:
# Report the shapes of the vectorized train and test matrices
print(
    '<train 데이터>',
    f'{X_train.shape} {y_train.shape}',
    '',
    '<test 데이터>',
    X_test.shape,
    sep='\n',
)

<train 데이터>
(2478, 211292) (2478,)

<test 데이터>
(1240, 211292)


# Preprocessing2
- 불균형 데이터 전처리(다운샘플링)

In [7]:
# Handle class imbalance by undersampling with the Neighbourhood Cleaning Rule
# NOTE(review): the resampling runs on the full training matrix before the
# train/validation split below, so the validation rows are drawn from the
# cleaned data as well — validation scores may not reflect the raw distribution.
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=3).fit_resample(X_train, y_train)
print(
    'Train Data Shape after UnderSampling',
    f'{X_nc.shape} {y_nc.shape}',
    '='*20,
    'Train target after UnderSampling',
    y_nc.value_counts(),
    sep='\n',
)

Train Data Shape after UnderSampling
(1643, 211292) (1643,)
====================
Train target after UnderSampling
first_party_winner
0    829
1    814
Name: count, dtype: int64


# Train, Validation Split

In [8]:
# Stratified 75/25 split of the resampled data into train and validation sets
Train_X, Val_X, Train_y, Val_y = train_test_split(
    X_nc,
    y_nc,
    test_size=.25,
    random_state=42,
    stratify=y_nc,
)
# Print shapes and class counts for both partitions
print(
    'Train Data Shape',
    f'{Train_X.shape} {Train_y.shape}',
    '-'*20,
    'Train target',
    Train_y.value_counts(),
    '='*20,
    'Validation Data Shape',
    f'{Val_X.shape} {Val_y.shape}',
    '-'*20,
    'Validation target',
    Val_y.value_counts(),
    sep='\n',
)

Train Data Shape
(1232, 211292) (1232,)
--------------------
Train target
first_party_winner
0    622
1    610
Name: count, dtype: int64
====================
Validation Data Shape
(411, 211292) (411,)
--------------------
Validation target
first_party_winner
0    207
1    204
Name: count, dtype: int64


# Modeling

In [9]:
# Fit a logistic regression on the training split (fit() returns the estimator itself)
Logistic = LogisticRegression(random_state=42, max_iter=500).fit(Train_X, Train_y)
# Per-class precision / recall / F1 on the validation split
print(classification_report(y_true=Val_y, y_pred=Logistic.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.66      0.69      0.68       207
           1       0.67      0.64      0.66       204

    accuracy                           0.67       411
   macro avg       0.67      0.67      0.67       411
weighted avg       0.67      0.67      0.67       411



# Submit

In [10]:
# Predict the test matrix with the fitted model and store the labels in the submission frame
submission['first_party_winner'] = Logistic.predict(X_test)
# Save the submission file without the DataFrame index
submission.to_csv('logi___2.csv', index=False)