In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

### Data Preprocessing

#### Data load

In [2]:
file_path = 'C:/Users/SJH/OneDrive - korea.ac.kr/문서/datasets/'

In [3]:
train_data = pd.read_csv(file_path + 'i1e0_train.csv', encoding='UTF-8')
val_data = pd.read_csv(file_path + 'i1e0_validation.csv', encoding='UTF-8')
test_data = pd.read_csv(file_path + 'i1e0_test.csv', encoding='UTF-8')

In [4]:
# train_data = train_data.groupby('type').sample(frac=0.05)
train_data.reset_index(drop = True, inplace = True)

# test_data = test_data.groupby('type').sample(frac=0.05)
test_data.reset_index(drop = True, inplace = True)

# val_data = val_data.groupby('type').sample(frac=0.05)
val_data.reset_index(drop = True, inplace = True)

In [5]:
train_data.shape[0]

94108

### Preprocessing

#### Train, Validation, Test Split

In [6]:
X_train, X_test, X_val = train_data['posts'], test_data['posts'], val_data['posts']
y_train, y_test, y_val = train_data['i1e0'], test_data['i1e0'], val_data['i1e0']

In [7]:
X_train.shape, X_test.shape

((94108,), (31370,))

In [8]:
# TF-IDF Vectorizer 생성

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(data, tfidf_vec_fit):
    X_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

In [9]:
tmp_data = pd.concat([X_train, X_test, X_val])
 
tfidf_vect = TfidfVectorizer(analyzer = 'word', max_features=10000)
tfidf_vect_fit = tfidf_vect.fit(tmp_data)

X_train = vectorize(X_train,tfidf_vect_fit)
X_test = vectorize(X_test,tfidf_vect_fit)
X_val= vectorize(X_val,tfidf_vect_fit)



### Modeling

#### Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [11]:
nb_model = GaussianNB()

In [12]:
nb_model.fit(X_train, y_train)

In [13]:
pred = nb_model.predict(X_test)

In [14]:
f1 = f1_score(y_test, pred)
print('f1 score :', f1)

f1 score : 0.8688524590163933


In [15]:
# Gaussian NB의 경우 별도의 하이퍼라미터가 존재하지 않음
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     15234
           1       0.77      0.99      0.87     16136

    accuracy                           0.85     31370
   macro avg       0.88      0.84      0.84     31370
weighted avg       0.88      0.85      0.84     31370

