<a href="https://colab.research.google.com/github/kingjiwoo/nlpbible/blob/main/%EC%8A%A4%ED%8C%B8%ED%96%84%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# 기본 패키지 및 라이브러리
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import csv
import numpy as np

# 전처리 및 모델 관련 패키지 및 라이브러리
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# 성능 검증 관련 패키지 및 라이브러리
from sklearn.metrics import confusion_matrix, classification_report


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


1. 데이터 불러오기

In [2]:
smsdata = open('/content/SMSSpamCollection.txt', encoding='utf8')

2. 데이터 전처리

In [3]:
# 전처리 함수
def preprocessing(text):
    #tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] # 텍스트 => 문장 => 단어

    # 불용어 제거
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop] # 불용어에 없으면 토큰으로 포함

    # 3개의 문자보다 작으면 제거
    tokens = [word for word in tokens if len(word) >= 3]

    # 소문자 만들기
    tokens = [word.lower() for word in tokens]

    #lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text


In [11]:
sms_data = []
sms_labels = []
cnt = 0
sencsv_reader = csv.reader(smsdata, delimiter='\t')
for line in sencsv_reader:
    # adding the sms_id
    sms_labels.append(line[0])
    sms_data.append(preprocessing(line[1]))

smsdata.close()

In [12]:
# train test split

trainset_size = int(round(len(sms_data)*0.70))
print('The training set size for this classifier is ' + str(trainset_size)+'\n')

The training set size for this classifier is 3898



In [13]:
x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
y_train = np.array([el for el in sms_labels[0:trainset_size]])
x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]])
y_test = np.array(([el for el in sms_labels[trainset_size+1:len(sms_labels)]]) or el in sms_labels[trainset_size+1:len(sms_labels)])

3. 토큰화

In [14]:
vectorizer2 = TfidfVectorizer(min_df=2, ngram_range=(1,2),
                              stop_words='english', strip_accents='unicode', norm='l2')
X_train = vectorizer2.fit_transform(x_train)
X_test = vectorizer2.transform(x_test)

4. 모델 학습

In [16]:
# Naive Bayes
clf_NB = MultinomialNB().fit(X_train, y_train)
y_predicted_NB = clf_NB.predict(X_test)

# Decision tree
clf_DT = DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_predicted_DT = clf_DT.predict(X_test.toarray())

# Stochastic gradient descent
#clf_SGD = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)
clf_SGD = SGDClassifier(alpha=.0001).fit(X_train, y_train)
y_predicted_SGD = clf_SGD.predict(X_test)

# Support Vector Machines
clf_SVM = LinearSVC().fit(X_train, y_train)
y_predicted_SVM = clf_SVM.predict(X_test)

# The Random forest algorithm
clf_RFA = RandomForestClassifier(n_estimators=10)
clf_RFA.fit(X_train, y_train)
y_predicted_RFA = clf_RFA.predict(X_test)

5. 평가

In [17]:
print (' \n confusion_matrix NB \n ')
cm = confusion_matrix(y_test, y_predicted_NB)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_NB))


print (' \n confusion_matrix DT \n ')
cm = confusion_matrix(y_test, y_predicted_DT)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_DT))

print (' \n confusion_matrix SGD \n ')
cm = confusion_matrix(y_test, y_predicted_SGD)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_SGD))

print (' \n confusion_matrix SVM\n ')
cm = confusion_matrix(y_test, y_predicted_SVM)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_SVM))

print (' \n confusion_matrix RFA \n ')
cm = confusion_matrix(y_test, y_predicted_RFA)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_RFA))

 
 confusion_matrix NB 
 
[[1442    0]
 [  52  176]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1442
        spam       1.00      0.77      0.87       228

    accuracy                           0.97      1670
   macro avg       0.98      0.89      0.93      1670
weighted avg       0.97      0.97      0.97      1670

 
 confusion_matrix DT 
 
[[1413   29]
 [  41  187]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      0.98      0.98      1442
        spam       0.87      0.82      0.84       228

    accuracy                           0.96      1670
   macro avg       0.92      0.90      0.91      1670
weighted avg       0.96      0.96      0.96      1670

 
 confusion_matrix SGD 
 
[[1435    7]
 [  21  207]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0