In [51]:
import os
import random
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, BaggingClassifier, StackingClassifier

from utils_two import *

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [52]:
# Fix Seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(42)

In [53]:
# Load Data
train = pd.read_csv('./data/train.csv').drop('ID', axis=1)
test = pd.read_csv('./data/test.csv').drop('ID', axis=1)
submission = pd.read_csv('./data/sample_submission.csv')

In [54]:
# 문자열 전처리
cols = ['first_party', 'second_party', 'facts']
shortword = re.compile(r'\W*\b\w{1}\b')
tokenizer = TreebankWordTokenizer()
stopword = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

first_train, second_train, facts_train = preprocessing(train, cols, shortword, tokenizer, stopword, lemmatizer)
first_test, second_test, facts_test = preprocessing(test, cols, shortword, tokenizer, stopword, lemmatizer)

vec = CountVectorizer(ngram_range=(1,2))
vec_facts = TfidfVectorizer(ngram_range=(1,2))

X_train = preprocessing_2(first_train, second_train, facts_train, vec, vec_facts)
y_train = train['first_party_winner']
X_test = preprocessing_2(first_test, second_test, facts_test, vec, vec_facts, train=False)

In [55]:
print('Train Data Shape')
print(X_train.shape, y_train.shape)
print('='*20)
print('Train target')
print(y_train.value_counts())
print('='*20)
print('Test Data Shape')
print(X_test.shape)

Train Data Shape
(2478, 211292) (2478,)
Train target
first_party_winner
1    1649
0     829
Name: count, dtype: int64
Test Data Shape
(1240, 211292)


In [56]:
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=5).fit_resample(X_train, y_train)
print('Train Data Shape after OverSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after OverSampling')
print(y_nc.value_counts())

Train Data Shape after OverSampling
(1601, 211292) (1601,)
Train target after OverSampling
first_party_winner
0    829
1    772
Name: count, dtype: int64


In [57]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.2, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

Train Data Shape
(1280, 211292) (1280,)
--------------------
Train target
first_party_winner
0    663
1    617
Name: count, dtype: int64
Validation Data Shape
(321, 211292) (321,)
--------------------
Validation target
first_party_winner
0    166
1    155
Name: count, dtype: int64


In [60]:
Model = LogisticRegression(C=10, max_iter=500, random_state=42)
Model.fit(Train_X, Train_y)
print(classification_report(Val_y, Model.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.68      0.75      0.71       166
           1       0.70      0.62      0.66       155

    accuracy                           0.69       321
   macro avg       0.69      0.68      0.68       321
weighted avg       0.69      0.69      0.68       321



In [61]:
preds = Model.predict(X_test)
submission['first_party_winner'] = preds
submission.to_csv('Logi_new.csv',index=False)

In [21]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80       424
           1       0.81      0.77      0.79       413

    accuracy                           0.80       837
   macro avg       0.80      0.80      0.80       837
weighted avg       0.80      0.80      0.80       837



In [24]:
preds = Logistic.predict(X_test)

In [27]:
submission['first_party_winner'] = preds
submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1
...,...,...
1235,TEST_1235,0
1236,TEST_1236,1
1237,TEST_1237,1
1238,TEST_1238,1


In [28]:
submission.to_csv('Over_Logi.csv', index=False)

In [30]:
from sklearn.linear_model import LinearRegression
ela = LinearRegression()
ela.fit(X_train, y_train)
print(classification_report(Val_y, np.where(ela.predict(Val_X)>.5, 1, 0)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       424
           1       1.00      1.00      1.00       413

    accuracy                           1.00       837
   macro avg       1.00      1.00      1.00       837
weighted avg       1.00      1.00      1.00       837



In [34]:
preds = np.where(ela.predict(X_test)>.5,1,0)

In [36]:
submission['first_party_winner'] = preds
submission.to_csv('linear_over.csv',index=False)