In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = pd.read_csv("../preprocessing/preprocessed_title.csv", index_col = 0)
df.head()

Unnamed: 0,title,label,processed_title
0,재학생 영어진단평가 신청 및 일정 안내,0,재학생 영어 진단 평가 신청 일정 안내
1,울산대학교 년도 학기 국내교환대학 학점교류 수학 안내 만료,0,울산대학교 국내 교환 대학 학점 교류 수학 안내
2,학년도 여름계절제 학부 강의정보 공유를 위한 설문 실시 안내 만료,0,여름 계절제 학부 강의 정보 공유 설문 실시 안내
3,울산과학기술원 학년도 학기 국내대학 학점교류 수학 안내 만료,0,울산과학기술원 국내 대학 학점 교류 수학 안내
4,한국과학기술원 학년도 학기 국내교환 학점교류 수학 안내 만료,0,한국 과학기술원 국내 교환 학점 교류 수학 안내


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9423 entries, 0 to 9781
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            9423 non-null   object
 1   label            9423 non-null   int64 
 2   processed_title  9423 non-null   object
dtypes: int64(1), object(2)
memory usage: 294.5+ KB


전처리 과정에서 title 과 processed_title에 nan값이 생긴 것으로 보임.

In [4]:
# 결측치 제거
df = df.dropna(subset=['processed_title'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9423 entries, 0 to 9781
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            9423 non-null   object
 1   label            9423 non-null   int64 
 2   processed_title  9423 non-null   object
dtypes: int64(1), object(2)
memory usage: 294.5+ KB


In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X_ngram = cv.fit_transform(df['processed_title']).toarray()
print(X_ngram.shape)
print(cv.get_feature_names_out())

(9423, 5883)
['abc' 'abeek' 'aberration' ... '히어로즈' '힐링' '힐스테이트']


In [8]:
# huggingface korean model
model = AutoModel.from_pretrained("klue/bert-base")
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

# GPU 설정
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [9]:
# pre-trained 모델을 이용해 text의 임베딩 추출
def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=128) # (input_ids, attention_mask)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # 데이터를 GPU로 이동
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu() # 결과를 CPU로 다시 이동

In [10]:
# 모든 문장에 대해 임베딩 추출
# 텐서를 요소로 한 리스트로 DataFrame 구성
embeddings = [get_sentence_embedding(sentence) for sentence in df['processed_title']] # batch_size = 1이므로 squeeze로 크기=1인 차원을 축소
X_embeddings = pd.DataFrame(embeddings)
X_embeddings = X_embeddings.applymap(lambda x: x.item())

  X_embeddings = X_embeddings.applymap(lambda x: x.item())


In [11]:
X_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.673408,-0.199410,0.383427,1.117176,0.303289,-0.195027,-0.066190,-0.119978,-0.065191,-0.594709,...,0.354824,0.807108,0.089488,0.817117,0.402204,0.998520,0.173136,0.625016,0.963538,-0.257529
1,0.027413,0.044899,0.591579,0.633495,0.857897,-0.215023,0.012942,0.198218,0.016269,-0.319696,...,0.065058,0.789167,-0.353837,0.357686,0.603849,1.260588,0.446223,0.073944,0.943735,0.029457
2,-0.567255,0.157398,0.260768,0.806350,0.662358,0.127007,-0.188282,0.019025,0.423349,0.245828,...,0.058179,-0.112945,0.173841,-0.169813,0.864499,1.420654,-0.165850,-0.515900,0.643656,0.276179
3,-0.137172,0.451919,0.338112,0.485740,1.110070,-0.369155,0.085851,0.232751,-0.017313,-0.354814,...,0.194406,1.122566,-0.485206,0.041345,0.339976,0.792398,0.430585,0.072119,0.753563,0.269146
4,-0.293015,-0.571019,0.606380,0.383335,0.668835,0.064551,0.190809,0.620977,0.034309,-0.486164,...,0.137631,0.944920,-0.431795,0.292280,0.484134,0.776658,0.339515,0.147176,1.031569,0.075861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9418,0.296066,-0.285829,0.447369,1.389521,0.573236,-0.696517,-0.660515,-0.475175,0.648737,0.189506,...,-0.137967,0.221582,-0.449958,-0.342267,0.783056,0.514070,0.089751,0.258334,0.725423,-0.279540
9419,-0.251865,-0.340053,0.304314,0.742105,0.581240,-0.477828,-0.813136,0.066043,0.466995,0.615799,...,-0.069096,0.482000,-0.858295,0.255782,0.177056,0.437270,0.239273,0.591187,0.855653,-0.573120
9420,-1.072919,-0.321692,-0.298808,1.162432,0.269557,-0.513908,0.099718,-0.149292,-0.026442,-0.752580,...,-0.402821,0.788109,-0.401905,0.870525,0.210966,0.002753,-0.079751,0.312045,1.099255,0.087783
9421,-0.504835,-0.181025,0.132825,1.128769,-0.050486,-0.633238,-0.067570,-0.156670,0.595005,-0.234248,...,0.017038,0.287654,-0.061761,-0.561348,0.110210,0.941047,0.217107,0.767245,0.388460,-0.250506


In [12]:
X_combined = np.hstack((X_ngram, X_embeddings)) # 두 개의 넘파이 배열을 열 방향, 즉 가로 방향으로 결합. 두 배열의 행 개수가 같아야 함. 
print(X_combined.shape)
X_combined

(9423, 6651)


array([[ 0.        ,  0.        ,  0.        , ...,  0.62501585,
         0.96353847, -0.25752854],
       [ 0.        ,  0.        ,  0.        , ...,  0.0739438 ,
         0.94373512,  0.02945721],
       [ 0.        ,  0.        ,  0.        , ..., -0.51589978,
         0.64365643,  0.27617919],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.31204501,
         1.09925461,  0.08778293],
       [ 0.        ,  0.        ,  0.        , ...,  0.76724505,
         0.38845959, -0.25050583],
       [ 0.        ,  0.        ,  0.        , ...,  0.28586787,
         0.03364261,  0.03545075]])

## Random Forest

In [13]:
# 데이터 분할
X = X_combined
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=36)

In [14]:
# 랜덤 포레스트 모델 학습
RF = RandomForestClassifier(n_estimators=100, random_state=36)
RF.fit(X_train, y_train)

# 예측 및 평가
y_pred = RF.predict(X_test)

# 평가 지표 출력
print(f'Accuracy: {accuracy_score(y_test, y_pred): .4f}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"): .4f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"): .4f}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"): .4f}')
print(classification_report(y_test, y_pred))

Accuracy:  0.9183
Precision:  0.9199
Recall:  0.9183
F1 Score:  0.9183
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       978
           1       0.89      0.94      0.92       907

    accuracy                           0.92      1885
   macro avg       0.92      0.92      0.92      1885
weighted avg       0.92      0.92      0.92      1885



In [15]:
# 랜덤 포레스트 모델 학습 2
RF_2 = RandomForestClassifier(n_estimators=200, max_depth = 50, min_samples_split = 5, min_samples_leaf = 1, random_state=36)
RF_2.fit(X_train, y_train)

# 예측 및 평가
y_pred = RF_2.predict(X_test)

# 평가 지표 출력
print(f'Accuracy: {accuracy_score(y_test, y_pred): .4f}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"): .4f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"): .4f}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"): .4f}')
print(classification_report(y_test, y_pred))

Accuracy:  0.9088
Precision:  0.9103
Recall:  0.9088
F1 Score:  0.9088
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       978
           1       0.88      0.93      0.91       907

    accuracy                           0.91      1885
   macro avg       0.91      0.91      0.91      1885
weighted avg       0.91      0.91      0.91      1885



In [16]:
# 랜덤 포레스트 모델 학습 3
RF_3 = RandomForestClassifier(n_estimators=300, max_depth = None, min_samples_split = 2, min_samples_leaf = 1,
                                 max_features = 'log2', random_state=36)
RF_3.fit(X_train, y_train)

# 예측 및 평가
y_pred = RF_3.predict(X_test)

# 평가 지표 출력
print(f'Accuracy: {accuracy_score(y_test, y_pred): .4f}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"): .4f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"): .4f}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"): .4f}')
print(classification_report(y_test, y_pred))

Accuracy:  0.9103
Precision:  0.9129
Recall:  0.9103
F1 Score:  0.9103
              precision    recall  f1-score   support

           0       0.95      0.88      0.91       978
           1       0.88      0.95      0.91       907

    accuracy                           0.91      1885
   macro avg       0.91      0.91      0.91      1885
weighted avg       0.91      0.91      0.91      1885



## Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

**Multinomial Naive Bayes**

In [18]:
# Multinomial Naive Bayes

# 데이터에 음수가 없도록 0과 1사이로 스케일링
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# 모델 훈련
MNB = MultinomialNB()
MNB.fit(X_train_scaled, y_train)

# 예측
y_pred = MNB.predict(X_test_scaled)

# 평가 지표
print(f'Accuracy: {accuracy_score(y_test, y_pred): .4f}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"): .4f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"): .4f}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"): .4f}')
print(classification_report(y_test, y_pred))

Accuracy:  0.9008
Precision:  0.9026
Recall:  0.9008
F1 Score:  0.9008
              precision    recall  f1-score   support

           0       0.93      0.87      0.90       978
           1       0.87      0.93      0.90       907

    accuracy                           0.90      1885
   macro avg       0.90      0.90      0.90      1885
weighted avg       0.90      0.90      0.90      1885



**Bernoulli Naive Bayes**

In [19]:
# Bernoulli Naive Bayes

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# 모델 훈련
BNB = BernoulliNB()
BNB.fit(X_train_scaled, y_train)

# 예측
y_pred = BNB.predict(X_test_scaled)

# 평가 지표
print(f'Accuracy: {accuracy_score(y_test, y_pred): .4f}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"): .4f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"): .4f}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"): .4f}')
print(classification_report(y_test, y_pred))

Accuracy:  0.9088
Precision:  0.9094
Recall:  0.9088
F1 Score:  0.9088
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       978
           1       0.89      0.92      0.91       907

    accuracy                           0.91      1885
   macro avg       0.91      0.91      0.91      1885
weighted avg       0.91      0.91      0.91      1885



## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
# X_embeddings만 활용

# 모델 학습
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train, y_train)

# 예측 및 평가
y_pred = LR.predict(X_test)

# 평가 지표 출력
print(f'Accuracy: {accuracy_score(y_test, y_pred): .4f}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted"): .4f}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted"): .4f}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"): .4f}')
print(classification_report(y_test, y_pred))

Accuracy:  0.9321
Precision:  0.9323
Recall:  0.9321
F1 Score:  0.9321
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       978
           1       0.92      0.94      0.93       907

    accuracy                           0.93      1885
   macro avg       0.93      0.93      0.93      1885
weighted avg       0.93      0.93      0.93      1885

