In [1]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import pandas as pd
import re

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

In [2]:
# Add the relative 'nltk_data' directory to NLTK's resource search path.
# NOTE(review): presumably this is where the tokenizer data needed by
# sent_tokenize is stored -- confirm the directory exists next to the notebook.
nltk.data.path.append('nltk_data')

def text_summarize(text, top_n=1):
    """Return an extractive summary made of the highest-scoring sentences.

    The text is split into sentences, a TF-IDF vectorizer is fitted on those
    sentences only, and each sentence is scored by the sum of its TF-IDF
    weights. The ``top_n`` best sentences are returned in their original
    order, separated by a single space.

    Args:
        text: The document to summarise (a string).
        top_n: Number of sentences to keep. Defaults to 1.

    Returns:
        The selected sentence(s) as one string. When the text has ``top_n``
        sentences or fewer, all of them are returned without any ranking
        (an empty string for text with no sentences).

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    sentences = sent_tokenize(text)
    # Nothing to rank; this also avoids fitting a vectorizer on an empty list.
    if len(sentences) <= top_n:
        return ' '.join(sentences)

    sentence_vectors = TfidfVectorizer().fit_transform(sentences)

    # The row sums of a sparse matrix come back as an (n, 1) matrix. Flatten
    # them to 1-D first: argsort on the (n, 1) shape sorts along the length-1
    # axis and yields only zeros, which would always pick the first sentence.
    scores = np.asarray(sentence_vectors.sum(axis=1)).ravel()

    # Indices of the top_n highest scores, restored to document order.
    best = sorted(np.argsort(scores, kind='stable')[-top_n:].tolist())

    return ' '.join(sentences[i] for i in best)

# Read the three competition files from the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
submit = pd.read_csv('./sample_submission.csv')

# Replace every 'facts' entry with its text_summarize() output.
train['facts'] = train['facts'].apply(text_summarize)
test['facts'] = test['facts'].apply(text_summarize)

def alpha_num(text):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Whitespace is kept on purpose: the later cleaning steps split the text on
    whitespace, so deleting it would fuse the whole text into a single token.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with punctuation and other symbols deleted; word boundaries
        (spaces, tabs, newlines) are left untouched.
    """
    return re.sub(r"[^A-Za-z0-9\s]", "", text)

# Stop-word list used for stop-word removal (one row per initial letter).
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
]

def remove_stopwords(text):
    """Drop every whitespace-separated token found in the ``stopwords`` list.

    The comparison is done on the lower-cased token, while the tokens that are
    kept retain their original casing. The survivors are joined with single
    spaces.
    """
    kept = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in stopwords
    ]
    return " ".join(kept)

# Remove duplicate words
def delete_duplication(text):
    """Remove repeated words, keeping the first occurrence of each.

    The text is split on whitespace; the comparison is exact (case-sensitive).

    Args:
        text: The string to de-duplicate.

    Returns:
        The unique words in first-seen order, separated by single spaces so
        they remain distinct tokens for later whitespace-based processing.
    """
    # dict.fromkeys keeps insertion order and de-duplicates in linear time.
    return ' '.join(dict.fromkeys(text.split()))


# Clean the 'facts' column of both frames with the same pipeline:
# lower-case, then alpha_num, remove_stopwords and delete_duplication in turn.
train['facts'] = (
    train['facts']
    .str.lower()
    .apply(alpha_num)
    .apply(remove_stopwords)
    .apply(delete_duplication)
)
test['facts'] = (
    test['facts']
    .str.lower()
    .apply(alpha_num)
    .apply(remove_stopwords)
    .apply(delete_duplication)
)

In [4]:
# TF-IDF vectorizer instance; the same object is handed to get_vector for the
# training frame (fit + transform) and for the test frame (transform only).
vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    """Build a dense feature matrix from the text columns of ``df``.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform``; the
            results must provide ``toarray()``.
        df: Frame with 'facts', 'first_party' and 'second_party' columns.
        train_mode: When truthy the vectorizer is fitted on 'facts';
            otherwise 'facts' is only transformed.

    Returns:
        A 2-D array whose columns are the first-party features, then the
        second-party features, then the facts features.
    """
    # Facts are encoded first so the vectorizer is fitted (in train mode)
    # before it is used to transform the party columns.
    encode_facts = vectorizer.fit_transform if train_mode else vectorizer.transform
    facts_matrix = encode_facts(df['facts'])

    party_matrices = [
        vectorizer.transform(df[column])
        for column in ('first_party', 'second_party')
    ]

    dense_parts = [matrix.toarray() for matrix in (*party_matrices, facts_matrix)]
    return np.concatenate(dense_parts, axis=1)

# Fit the vectorizer on the training frame and build the feature matrix.
X = get_vector(vectorizer, train, True)
y = train['first_party_winner']

# Hold out 20% of the rows, stratified on the label, for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Multi-layer perceptron with one hidden layer of 300 units. random_state is
# fixed so that weight initialisation, and therefore the submission, is
# reproducible across Restart & Run All.
model = MLPRegressor(hidden_layer_sizes=300, random_state=42)
model.fit(X_train, y_train)

# Score the held-out split with the same 0.5 cut-off used for the submission.
val_pred = (model.predict(X_val) >= 0.5).astype(int)
print(f"validation accuracy: {(val_pred == y_val.to_numpy()).mean():.4f}")

# Predict on the test frame; the vectorizer is only applied, not re-fitted.
X_test = get_vector(vectorizer, test, False)
pred = model.predict(X_test)

# Regression outputs of 0.5 or more are labelled 1, everything else 0.
submit['first_party_winner'] = (pred >= 0.5).astype(int)

submit.to_csv('./summarize_mlp_r_submit.csv', index=False)