In [2]:
# SMSSpamCollection 경로 지정 위해 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

%cd /content/drive/MyDrive

Mounted at /content/drive
/content/drive/MyDrive


**텍스트 분류 프로세스:**
1. 데이터 준비
2. 데이터 전처리 (불용어 제거, 특수기호 제거, Normalization; 주로 NLTK 사용)
3. 특징값 추출 (BoW, TF-IDF vectorizer)
4. 학습 (여러가지 모델 사용)
5. 평가 (Precision, Recall, F-1 Score)

In [3]:
!pip install np   #환경설정

import nltk   # a suite of libraries and programs for NLP
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import np   # np = numpy + handy tools

#환경설정
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

"""
실행시 주의사항: SMSSpamCollection의 경로를 지정해주세요.
"""

smsdata = open('SMSSpamCollection', encoding='utf8') # "drive/MyDrive/SMSSpamCollection"

Collecting np
  Downloading https://files.pythonhosted.org/packages/40/7d/749666e5a9976dcbc4d16d487bbe571efc6bbf4cdf3f4620c0ccc52b57ef/np-1.0.2.tar.gz
Building wheels for collected packages: np
  Building wheel for np (setup.py) ... [?25l[?25hdone
  Created wheel for np: filename=np-1.0.2-cp37-none-any.whl size=13652 sha256=f06dbeb08e9d2419095cec9264843d67ab70b4c59aabccc65c70f40a633eb108
  Stored in directory: /root/.cache/pip/wheels/2b/df/57/f40bef951382112d9c644b3ec6d713ceb200ea7c15c074de42
Successfully built np
Installing collected packages: np
Successfully installed np-1.0.2
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
def preprocessing(text):   #Preprocessing
    # tokenize into words (text -> sentences -> words)
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
   
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    
    # lemmatize (return the base or dictionary form of a word; not just cropping)
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    preprocessed_text= ' '.join(tokens)
    return preprocessed_text

In [5]:
sms_data = []
sms_labels = []
cnt = 0   # ??
sencsv_reader = csv.reader(smsdata,delimiter='\t')
for line in sencsv_reader:
    # adding the sms_id
    sms_labels.append(line[0])
    sms_data.append(preprocessing(line[1]))

smsdata.close()

# 전처리 끝

In [15]:
trainset_size = int(round(len(sms_data)*0.70))  # Split Train data and Test data. 굳이 이렇게 요란하게 해야하나? 걍 int(len(sms_data)*0.70)
print('The training set size for this classifier is ' + str(trainset_size) + '\n')

x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
y_train = np.array([el for el in sms_labels[0:trainset_size]])
x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]])
y_test = np.array(([el for el in sms_labels[trainset_size+1:len(sms_labels)]]) or el in sms_labels[trainset_size+1:len(sms_labels)])

# print(x_train[:3])
# print(y_train[:3])

The training set size for this classifier is 3900



In [26]:
# TF-IDF vectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

# collection of raw documents -> matrix of TF-IDF features
vectorizer2 = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2')   # alphabet 'l'

# Learn vocabulary and idf, return document-term matrix
X_train = vectorizer2.fit_transform(x_train)

# Transform documents to document-term matrix
X_test = vectorizer2.transform(x_test)

print(X_train[0][0], end='\n\n')
print(X_test[0][0])

  (0, 5594)	0.24661726450021537
  (0, 2307)	0.20644946321360894
  (0, 1219)	0.35808670056556285
  (0, 1024)	0.406938447032537
  (0, 5794)	0.29225456822673795
  (0, 2345)	0.2453064894905327
  (0, 1025)	0.36473744198446423
  (0, 768)	0.330214154444826
  (0, 1443)	0.3469726843786785
  (0, 3967)	0.3077023231509078

  (0, 3727)	1.0


In [None]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB().fit(X_train, y_train)
y_predicted_NB = clf_NB.predict(X_test)

In [None]:
# Decision tree (takes long)
from sklearn import tree
clf_DT = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_predicted_DT = clf_DT.predict(X_test.toarray())

In [None]:
# Stochastic gradient descent
from sklearn.linear_model import SGDClassifier
#clf_SGD = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)
clf_SGD = SGDClassifier(alpha=.0001).fit(X_train, y_train)
y_predicted_SGD = clf_SGD.predict(X_test)

In [None]:
# Support Vector Machines
from sklearn.svm import LinearSVC
clf_SVM = LinearSVC().fit(X_train, y_train)
y_predicted_SVM = clf_SVM.predict(X_test)

In [None]:
# The Random forest algorithm
from sklearn.ensemble import RandomForestClassifier
clf_RFA = RandomForestClassifier(n_estimators=10)
clf_RFA.fit(X_train, y_train)
y_predicted_RFA = clf_RFA.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
print (' \n confusion_matrix NB \n ')
cm = confusion_matrix(y_test, y_predicted_NB)
print (cm)
print ('\n Here is the classification report:\n')
print (classification_report(y_test, y_predicted_NB))

 
 confusion_matrix NB 
 
[[1443    0]
 [  52  176]]

 Here is the classification report:

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1443
        spam       1.00      0.77      0.87       228

    accuracy                           0.97      1671
   macro avg       0.98      0.89      0.93      1671
weighted avg       0.97      0.97      0.97      1671



In [None]:
print (' \n confusion_matrix DT \n ')
cm = confusion_matrix(y_test, y_predicted_DT)
print (cm)
print ('\n Here is the classification report:\n')
print (classification_report(y_test, y_predicted_DT))

 
 confusion_matrix DT 
 
[[1413   30]
 [  41  187]]

 Here is the classification report:

              precision    recall  f1-score   support

         ham       0.97      0.98      0.98      1443
        spam       0.86      0.82      0.84       228

    accuracy                           0.96      1671
   macro avg       0.92      0.90      0.91      1671
weighted avg       0.96      0.96      0.96      1671



In [None]:
print (' \n confusion_matrix SGD \n ')
cm = confusion_matrix(y_test, y_predicted_SGD)
print (cm)
print ('\n Here is the classification report:\n')
print (classification_report(y_test, y_predicted_SGD))

 
 confusion_matrix SGD 
 
[[1436    7]
 [  21  207]]

 Here is the classification report:

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1443
        spam       0.97      0.91      0.94       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671



In [None]:
print (' \n confusion_matrix SVM\n ')
cm = confusion_matrix(y_test, y_predicted_SVM)
print (cm)
print ('\n Here is the classification report:\n')
print (classification_report(y_test, y_predicted_SVM))

 
 confusion_matrix SVM
 
[[1437    6]
 [  24  204]]

 Here is the classification report:

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.89      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671



In [None]:
print (' \n confusion_matrix RFA \n ')
cm = confusion_matrix(y_test, y_predicted_RFA)
print (cm)
print ('\n Here is the classification report:\n')
print (classification_report(y_test, y_predicted_RFA))

 
 confusion_matrix RFA 
 
[[1443    0]
 [  44  184]]

 Here is the classification report:

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1443
        spam       1.00      0.81      0.89       228

    accuracy                           0.97      1671
   macro avg       0.99      0.90      0.94      1671
weighted avg       0.97      0.97      0.97      1671

