In [1]:
import pandas as pd
import numpy as np
import random
import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier, PassiveAggressiveClassifier, Perceptron, LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.utils import check_random_state
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from nltk.tokenize import word_tokenize, sent_tokenize
import text_hammer as th

from scipy import sparse

from tqdm import tqdm

In [2]:
# Load the competition train/test tables from the local Data/ directory
# and report how many training rows were read.
train, test = pd.read_csv('Data/train.csv'), pd.read_csv('Data/test.csv')
print("Number of rows in train dataset : ", len(train))

Number of rows in train dataset :  2478


In [3]:
# Notebook-wide settings. Only the random seed is active: the EPOCHS,
# LEARNING_RATE, BATCH_SIZE and MAX_LEN entries were already commented out
# and nothing in this notebook reads them.
CFG = dict(SEED=42)

In [4]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        None.

    Notes:
        * ``PYTHONHASHSEED`` is read when the interpreter starts, so setting
          it here only affects child processes, not the running kernel.
        * The earlier ``check_random_state(seed)`` call was dropped: it only
          builds and returns a ``RandomState`` object, which was discarded,
          so it never seeded anything. Estimators that need a fixed seed
          should receive ``random_state=seed`` explicitly.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # scikit-learn estimators left at random_state=None draw from this
    # global NumPy generator.
    np.random.seed(seed)

seed_everything(CFG['SEED']) # fix the seed

In [5]:
# Helper that counts the sentences in one `facts` entry of the train DataFrame.
def count_sentences(row):
    """Return how many sentences ``sent_tokenize`` finds in ``row``.

    Args:
        row: Text of a single ``facts`` cell.

    Returns:
        int: Length of the sentence list produced by ``sent_tokenize``.
    """
    return len(sent_tokenize(row))

# Count the sentences of every `facts` entry and store the result in a new
# 'num_sentences' column.
train['num_sentences'] = train['facts'].apply(count_sentences)

# Select only the rows whose facts consist of exactly one sentence.
single_sentence_mask = train['num_sentences'].eq(1)
rows_with_single_sentence = train[single_sentence_mask]

# Display the selected rows as a DataFrame.
rows_with_single_sentence

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,num_sentences
248,TRAIN_0248,New York,Ferber,A New York child pornography law prohibited pe...,1,1
554,TRAIN_0554,Riverside County,McLaughlin,McLaughlin was arrested without a warrant and ...,1,1
630,TRAIN_0630,Earl R. Deen,"Gulf, Colorado & Santa Fe Railway Company",Not Available.\n,1,1
764,TRAIN_0764,Nollan,California Coastal Commission,The California Coastal Commission required own...,1,1
858,TRAIN_0858,Morrison,Olson,The Ethics in Government Act of 1978 created a...,1,1
1104,TRAIN_1104,Eastern Enterprises,Apfel,Currently unknown.\n,1,1
1285,TRAIN_1285,Smith,Daily Mail Publishing Company,A West Virginia statute made it a crime for a ...,0,1
1319,TRAIN_1319,National Association for the Advancement of Co...,Button,The NAACP was prosecuted for violating a Virgi...,1,1
1329,TRAIN_1329,United States,Paradise,In response to a series of NAACP-initiated law...,0,1
1461,TRAIN_1461,Jones,Alfred H. Mayer Company,"Jones, a black man, charged that a real estate...",1,1


In [6]:
# Remove specific rows whose `facts` text is only a placeholder
# ("Not available", "Currently available", "Currently unknown").
#
# The rows are matched on their ID instead of being dropped by index label.
# Dropping by label and then resetting the index is not idempotent: running
# the cell a second time would silently delete three *different* rows.
# Filtering on ID makes a re-run a no-op.
#
# NOTE(review): the IDs are derived from the original index labels on the
# assumption that row i of train.csv carries ID TRAIN_{i:04d}. Every row
# shown in the notebook outputs follows that pattern; confirm for 1595.
indexes_to_remove = [630, 1104, 1595]
ids_to_remove = [f'TRAIN_{i:04d}' for i in indexes_to_remove]
train = train[~train['ID'].isin(ids_to_remove)].reset_index(drop=True)

In [7]:
# Check whether any remaining `facts` entry still contains one of the
# placeholder phrases 'Not available', 'Currently available' or
# 'Currently unknown' (case-insensitive).
facts_text = train['facts'].str
is_not_available = facts_text.contains('Not available', case=False)
is_currently_available = facts_text.contains('Currently available', case=False)
is_currently_unknown = facts_text.contains('Currently unknown', case=False)

# Show the matches -> these rows contain other sentences besides the phrase,
# so they are kept.
placeholder_mask = is_not_available | is_currently_available | is_currently_unknown
train[placeholder_mask]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,num_sentences
86,TRAIN_0086,Brown,Louisiana,The Audubon Regional library operated three br...,1,7
574,TRAIN_0574,Gil Garcetti et al.,Richard Ceballos,"Richard Ceballos, an employee of the Los Angel...",1,6
648,TRAIN_0649,"Friends of the Earth, Inc.","Laidlaw Environmental Services (TOC), Inc.","After Laidlaw Environmental Services, Inc. bou...",1,13
1061,TRAIN_1062,"Kenneth F. Fare, Acting Chief Probation Officer",Michael C.,"Police arrested Michael C., a 16 year old, on ...",1,9
1610,TRAIN_1613,"Atlantic Sounding Co., Inc., et al.",Edgar L. Townsend,"In July 2005, Edgar Townsend was allegedly inj...",0,13
2230,TRAIN_2233,"Verizon Communications, Inc.",Federal Communications Commission,The Telecommunications Act of 1996 entitles ne...,0,5


In [8]:
def get_clean(x):
    """Normalise one text value for vectorisation.

    The value is converted to ``str``, lower-cased, stripped of backslashes,
    and has underscores replaced by spaces. It is then passed through a fixed
    sequence of ``text_hammer`` helpers (see ``cleaning_steps``), and finally
    any character repeated three or more times in a row is collapsed to a
    single occurrence.

    Args:
        x: Value to clean; anything accepted by ``str()``.

    Returns:
        str: The cleaned text.
    """
    text = str(x).lower().replace('\\', '').replace('_', ' ')

    # Applied strictly in this order; each helper takes and returns a string.
    cleaning_steps = (
        th.cont_exp,
        th.remove_emails,
        th.remove_urls,
        th.remove_html_tags,
        th.remove_rt,
        th.remove_accented_chars,
        th.remove_special_chars,
    )
    for step in cleaning_steps:
        text = step(text)

    # NOTE(review): this also collapses repeated digits, e.g. "1000" -> "10".
    return re.sub("(.)\\1{2,}", "\\1", text)

In [9]:
# Function that flips the outcome label of a row.
def swap_labels(row):
    """Invert ``row['first_party_winner']`` in place and return the row.

    A value equal to 1 becomes 0; any other value becomes 1.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that has a
            'first_party_winner' entry.

    Returns:
        The same ``row`` object, with the label inverted.
    """
    row['first_party_winner'] = 0 if row['first_party_winner'] == 1 else 1
    return row

In [10]:
# Keep only the rows whose 'num_sentences' value is at least 3.
# The index is not reset here, so labels of the removed rows leave gaps.
train = train[train['num_sentences'] >= 3]

In [11]:
# Build a mirrored copy of the training data: first_party and second_party
# trade places and first_party_winner is inverted. The copy is then appended
# to the original rows.
train_swapped = train.copy()
# Kept as a plain `[]` assignment, which swaps the two columns' values.
train_swapped[['first_party', 'second_party']] = train_swapped[['second_party', 'first_party']]
train_swapped = train_swapped.apply(swap_labels, axis=1)
train_extended = pd.concat([train, train_swapped])

# Apply the same text cleaning to every text column of the train and test sets.
text_columns = ['facts', 'first_party', 'second_party']
for column in text_columns:
    train_extended[column] = train_extended[column].apply(get_clean)
train_extended = train_extended.reset_index(drop=True)

for column in text_columns:
    test[column] = test[column].apply(get_clean)

In [12]:
# Display the augmented, cleaned training frame.
train_extended

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,num_sentences
0,TRAIN_0000,phil a st amant,herman a thompson,on june 27 1962 phil st amant a candidate for ...,1,7
1,TRAIN_0001,stephen duncan,lawrence owens,ramon nelson was riding his bike when he suffe...,0,7
2,TRAIN_0002,billy joe magwood,tony patterson warden et al,an alabama state court convicted billy joe mag...,1,8
3,TRAIN_0003,linkletter,walker,victor linkletter was convicted in state court...,0,3
4,TRAIN_0004,william earl fikes,alabama,on april 24 1953 in selma alabama an intruder ...,1,9
...,...,...,...,...,...,...
4777,TRAIN_2473,renewable fuels association et al,hollyfrontier cheyenne refining llc et al,congress amended the clean air act through the...,0,5
4778,TRAIN_2474,alliance bond fund inc,grupo mexicano de desarrollo s a,alliance bond fund inc an investment fund purc...,0,7
4779,TRAIN_2475,united states,peguero,in 1992 the district court sentenced manuel d ...,1,6
4780,TRAIN_2476,st cyr,immigration and naturalization service,on march 8 1996 enrico st cyr a lawful permane...,1,8


In [13]:
# Shared TF-IDF vectorizer: English stop words, 1- to 3-grams, min_df=3.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=3)
def get_vector(vectorizer, df, train_mode):
    """Vectorise the text columns of ``df`` and stack them side by side.

    The ``facts`` column is encoded first: with ``fit_transform`` when
    ``train_mode`` is truthy (which fits the vectorizer), otherwise with
    ``transform``. Both party columns are always encoded with ``transform``,
    i.e. with whatever the vectorizer has already been fitted on.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
        df: Table with 'facts', 'first_party' and 'second_party' columns.
        train_mode: Whether to fit the vectorizer on ``df['facts']``.

    Returns:
        Sparse matrix from ``sparse.hstack`` with blocks ordered
        [first_party, second_party, facts].
    """
    encode_facts = vectorizer.fit_transform if train_mode else vectorizer.transform
    facts_matrix = encode_facts(df['facts'])

    party_matrices = [
        vectorizer.transform(df[column])
        for column in ('first_party', 'second_party')
    ]
    return sparse.hstack([*party_matrices, facts_matrix])

In [14]:
# Fit the vectorizer on the augmented training facts and encode both splits.
# The test split reuses the fitted vectorizer (train_mode=False).
Y_train = train_extended["first_party_winner"]
X_train = get_vector(vectorizer, train_extended, train_mode=True)
X_test = get_vector(vectorizer, test, train_mode=False)

In [16]:
# Convert the stacked sparse feature matrices to dense NumPy arrays.
# The issparse guard keeps this cell re-runnable: once the variables hold
# ndarrays, an unguarded second `.toarray()` call would raise AttributeError.
# NOTE: the dense copies hold every zero entry explicitly and can be much
# larger in memory than the sparse originals.
if sparse.issparse(X_train):
    X_train = X_train.toarray()
if sparse.issparse(X_test):
    X_test = X_test.toarray()

In [None]:
# Hard-voting ensemble over nine classifiers.
#
# Each base learner receives the notebook seed explicitly, so the ensemble no
# longer depends on how much of the global NumPy random state earlier cells
# consumed. Predictions can therefore differ from a run of the unseeded
# version.
ensemble_seed = CFG['SEED']

ridge_model = RidgeClassifier(random_state=ensemble_seed)
ETC_model = ExtraTreesClassifier(random_state=ensemble_seed)
logistic_model = LogisticRegression(random_state=ensemble_seed)
RFC_model = RandomForestClassifier(random_state=ensemble_seed)
ABS_model = AdaBoostClassifier(random_state=ensemble_seed)
SGD_model = SGDClassifier(random_state=ensemble_seed)
xgb_model = XGBClassifier(random_state=ensemble_seed)
lgbm_model = LGBMClassifier(random_state=ensemble_seed)
cat_model = CatBoostClassifier(random_state=ensemble_seed)

# The base learners are NOT fitted individually any more. VotingClassifier.fit
# clones every estimator and fits the clones, so the earlier per-model fit
# calls trained each model a second time without affecting the ensemble.
# The objects above stay unfitted templates; the fitted clones are available
# through voting_model.named_estimators_ after fitting.

# Create the VotingClassifier.
voting_model = VotingClassifier(
    estimators=[
        ('ridge', ridge_model),
        ('ETC', ETC_model),
        ('logistic', logistic_model),
        ('RFC', RFC_model),
        ('ABS', ABS_model),
        ('SGD', SGD_model),
        ('xgb', xgb_model),
        ('lgbm', lgbm_model),
        ('cat', cat_model),
    ],
    voting='hard'
)

voting_model.fit(X_train, Y_train)

In [18]:
# Load the sample submission file; its 'first_party_winner' column is filled in below.
submit = pd.read_csv('Data/sample_submission.csv')

In [19]:
# Predict labels for the test features with the fitted voting ensemble.
pred = voting_model.predict(X_test)

In [22]:
# Store the predictions in the submission frame and write it to CSV
# without the index column.
submit['first_party_winner'] = pred
submit.to_csv('./submit_voting_ensemble_5.csv', index=False)
print('Done')

Done
