In [22]:
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive/ so the CSV files under it can be read and written.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [23]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import re
from konlpy.tag import Okt,Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score

In [27]:
# Every input file lives in the same Drive folder; keep that location in one
# constant so it only has to be edited once if the data moves.
DATA_DIR = "/content/drive/MyDrive"

# Labelled data: the later cells read its 'title' and 'topic_idx' columns.
train = pd.read_csv(f"{DATA_DIR}/train_data.csv")
# Unlabelled data: the later cells read its 'title' column.
test = pd.read_csv(f"{DATA_DIR}/test_data.csv")
# Submission template whose 'topic_idx' column is filled with the predictions.
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
# Loaded for reference; not used by the visible cells.
topic_dict = pd.read_csv(f"{DATA_DIR}/topic_dict.csv")

In [28]:
# Load the morphological analyser (Okt); one shared instance is reused by func() below.
okt=Okt()
# Remove particles, endings and punctuation from a title.
def func(text):
    """Return `text` as a single space-separated string of Okt tokens.

    The text is tagged with ``okt.pos(text, stem=True)`` (stemming enabled) and
    every token tagged 'Josa', 'Eomi' or 'Punctuation' is left out of the result.
    """
    dropped_tags = ('Josa', 'Eomi', 'Punctuation')
    tagged = okt.pos(text, stem=True)  # (token, tag) pairs, stemmed
    return " ".join(token for token, tag in tagged if tag not in dropped_tags)

# Clean every training title. NOTE: this overwrites the raw 'title' column,
# so re-running the cell feeds already-cleaned text through func() again.
train['title'] = train['title'].apply(func)

In [29]:
# TF-IDF vectorisation. The cleaned titles are already space-separated tokens,
# so the tokenizer handed to the vectorizer only has to split on whitespace.
def split(text):
    """Split `text` on whitespace and return the resulting list of tokens."""
    return text.split()

# token_pattern=None: the regex pattern is ignored once a custom tokenizer is
# supplied, and passing None avoids scikit-learn's "token_pattern will not be used" warning.
tfidf_vect = TfidfVectorizer(tokenizer=split, token_pattern=None)
# Learn the vocabulary / IDF weights and build the document-term matrix in one pass
# (equivalent to fit() followed by transform() on the same data).
# NOTE(review): the vectorizer is fitted on all training titles before the
# train/validation split, so its IDF statistics also cover the validation rows.
tfidf_matrix_train = tfidf_vect.fit_transform(train['title'])



In [30]:
# Split the data into train / validation sets.
def split_dataset(tfidf, df, test_size=0.3, random_state=42):
    """Make a stratified hold-out split of a feature matrix and its labels.

    Parameters
    ----------
    tfidf : feature matrix passed to ``train_test_split`` as X.
    df : DataFrame whose 'topic_idx' column supplies the labels.
    test_size : fraction of the rows held out for evaluation. The default 0.3
        keeps 70% of the rows for fitting.
    random_state : seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    y_data = df['topic_idx']

    # stratify=y_data makes the split stratified on the labels.
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf,
        y_data,
        test_size=test_size,
        random_state=random_state,
        stratify=y_data,
    )
    return (X_train, X_test, y_train, y_test)

# X_test / y_test are the held-out part of the labelled `train` frame (used for validation),
# not the separate `test` frame loaded from test_data.csv.
X_train, X_test, y_train, y_test = split_dataset(tfidf_matrix_train,train)

In [32]:
# Train the classifier: multinomial naive Bayes with smoothing alpha=0.5.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = MultinomialNB(alpha=0.5).fit(X_train, y_train)

# Predict the held-out split.
y_pred = model.predict(X_test)

# Accuracy = share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

정확도: 0.8375556691246259


## Predict

In [33]:
# Clean the test titles with the same func() used for the training titles, then
# vectorise them with the already-fitted vectorizer (transform only, no re-fit).
test['title'] = test['title'].apply(func)
tfidf_matrix_test = tfidf_vect.transform(test['title'])

In [34]:
# Predict a topic index for every row of the test matrix.
tmp_pred = model.predict(tfidf_matrix_test)
# Bare expression on the last line so the notebook displays the prediction array.
tmp_pred

array([2, 3, 2, ..., 2, 2, 2])

In [35]:
# Write the predictions into the submission frame. Item assignment always targets
# a column; attribute assignment (submission.topic_idx = ...) would only set a
# plain Python attribute if the 'topic_idx' column did not already exist.
submission['topic_idx'] = tmp_pred
submission.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [36]:
# Save the filled-in submission to Drive; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('/content/drive/MyDrive/submission Okt MultinomialNB.csv', index = False)