### SMS Spam 분류

In [113]:
import pandas as pd
# Location of the SMS spam CSV inside the tensorflow-nlp-tutorial GitHub repository.
url = (
    'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/'
    '10.%20RNN%20Text%20Classification/dataset/spam.csv'
)

In [114]:
# Read the CSV at `url`, decoding it as latin1, and preview the first three rows.
df = pd.read_csv(filepath_or_buffer=url, encoding='latin1')
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- Data preprocessing

In [115]:
# Keep only the label column (v1) and the message text column (v2).
df = df.loc[:, ['v1', 'v2']]
df.head(n=3)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [116]:
# Total number of missing cells across the whole frame.
df.isnull().sum().sum()

0

In [117]:
# Compare the frame's shape with the number of distinct message texts to spot duplicates.
print(f"{df.shape} {df['v2'].nunique()}")

(5572, 2) 5169


In [118]:
# Keep the first occurrence of each message text.
# The result is reassigned instead of using inplace=True so this cell does not
# mutate, in place, a frame that was produced by a column selection.
df = df.drop_duplicates(subset=['v2'])
df.shape

(5169, 2)

In [119]:
# Label encoding: 'ham' -> 0, 'spam' -> 1.
# replace() on an object column only yields integers through implicit downcasting,
# which pandas has deprecated; the explicit cast guarantees an int64 label column
# and raises immediately if any value other than 'ham'/'spam' (or 0/1) is present.
# Re-running the cell is safe: already-encoded 0/1 values pass through unchanged.
df['v1'] = df['v1'].replace({'ham': 0, 'spam': 1}).astype('int64')
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [120]:
# Ham / spam distribution: number of rows per encoded label.
df.v1.value_counts()


v1
0    4516
1     653
Name: count, dtype: int64

- Text preprocessing

In [121]:
# Remove punctuation and digits: every non-letter character becomes a space.
df['v2'] = df.v2.str.replace(pat='[^a-zA-Z]', repl=' ', regex=True)
df.head(n=3)

Unnamed: 0,v1,v2
0,0,Go until jurong point crazy Available only ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in a wkly comp to win FA Cup fina...


- 데이터셋 분리

In [122]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages as a test split, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'].values,
    df['v1'].values,
    random_state=2023,
    test_size=0.2,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4135,), (1034,), (4135,), (1034,))

In [123]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed, but stratified on the label column.
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'].values,
    df['v1'].values,
    random_state=2023,
    test_size=0.2,
    stratify=df['v1'].values,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4135,), (1034,), (4135,), (1034,))

In [124]:
# Inspect the training message texts.
X_train

array(['Then any special there ',
       'You only hate me  You can call any but you didnt accept even a single call of mine  Or even you messaged',
       'Enjoy the showers of possessiveness poured on u by ur loved ones  bcoz in this world of lies  it is a golden gift to be loved truly  ',
       ..., 'Dunno da next show aft   is      Toa payoh got     ',
       'I had been hoping i would not have to send you this message  My rent is due and i dont have enough for it  My reserves are completely gone  Its a loan i need and was hoping you could her  The balance is   lt   gt    Is there a way i could get that from you  till mid march when i hope to pay back ',
       'You in your room  I need a few'], dtype=object)

In [125]:
# Inspect the training labels.
y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

- Text Encoding

In [126]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based text vectorizer configured with the built-in English stop-word list.
cv = CountVectorizer(stop_words='english')

In [127]:
# Never do it this way.
# Fitting separately on train and test builds a different vocabulary for each,
# so the two feature matrices end up with different columns (compare the shapes below).
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.fit_transform(X_test)
X_train_cv.shape, X_test_cv.shape

((4135, 6494), (1034, 2866))

In [128]:
# Inspect the encoded training matrix.
X_train_cv

<4135x6494 sparse matrix of type '<class 'numpy.int64'>'
	with 30145 stored elements in Compressed Sparse Row format>

In [129]:
# Learn the vocabulary from the training texts only.
cv.fit(X_train)
# Encode both splits with that single vocabulary so their columns line up.
X_train_cv, X_test_cv = (cv.transform(texts) for texts in (X_train, X_test))
tuple(matrix.shape for matrix in (X_train_cv, X_test_cv))

((4135, 6494), (1034, 6494))

- 학습 및 평가

In [130]:
# Classifier: logistic regression with a fixed random_state.
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2023)

In [131]:
# Train on the count-encoded training matrix.
lrc.fit(X_train_cv, y_train)

In [132]:
# Score the fitted classifier on the count-encoded test matrix.
lrc.score(X_test_cv, y_test)

0.9709864603481625

- Bi-gram

In [133]:
# Count features over unigrams and bigrams; the vocabulary comes from the training texts only.
cv2 = CountVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train)
X_train_cv2, X_test_cv2 = (cv2.transform(texts) for texts in (X_train, X_test))
tuple(matrix.shape for matrix in (X_train_cv2, X_test_cv2))

((4135, 28822), (1034, 28822))

In [134]:
# Fresh classifier trained on the unigram+bigram counts, then scored on the test split.
lrc = LogisticRegression(random_state=2023).fit(X_train_cv2, y_train)
lrc.score(X_test_cv2, y_test)

0.9680851063829787

- TfidfVectorizer

In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features; the vocabulary and IDF weights are learned from the training texts only.
tv = TfidfVectorizer(stop_words='english').fit(X_train)
X_train_tv, X_test_tv = (tv.transform(texts) for texts in (X_train, X_test))
tuple(matrix.shape for matrix in (X_train_tv, X_test_tv))

((4135, 6494), (1034, 6494))

In [136]:
# Fresh classifier trained on the TF-IDF features, then scored on the test split.
lrc = LogisticRegression(random_state=2023).fit(X_train_tv, y_train)
lrc.score(X_test_tv, y_test)

0.9458413926499033

- TfidfVectorizer + Bi-gram

In [137]:
# TF-IDF over unigrams and bigrams, fitted on the training texts only.
tv2 = TfidfVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train)
X_train_tv2, X_test_tv2 = (tv2.transform(texts) for texts in (X_train, X_test))
tuple(matrix.shape for matrix in (X_train_tv2, X_test_tv2))

((4135, 28822), (1034, 28822))

In [138]:
# Fresh classifier trained on the TF-IDF unigram+bigram features, then scored on the test split.
lrc = LogisticRegression(random_state=2023).fit(X_train_tv2, y_train)
lrc.score(X_test_tv2, y_test)

0.9410058027079303

- 실제 데이터로 검증

In [139]:
# Reload the CSV so `df` holds the original, uncleaned message texts again.
df = pd.read_csv(filepath_or_buffer=url, encoding='latin1')
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [140]:
# Take the raw text of the rows labelled 0 and 2 as sample messages.
sms = df.loc[[0, 2], 'v2'].tolist()
sms

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [141]:
# Re-train a fresh classifier on the unigram count features (the matrix produced by `cv`).
lrc = LogisticRegression(random_state=2023).fit(X_train_cv, y_train)
lrc

In [142]:
import re
# Apply the same cleanup used on the training texts: non-letters become spaces.
# A list is built rather than a lazy map() iterator: an iterator is exhausted after
# one pass, so inspecting `sms` or re-running the transform cell would silently
# operate on an empty sequence. With a list, re-running this cell is also idempotent.
sms = [re.sub('[^A-Za-z]', ' ', text) for text in sms]

In [143]:
# Feature conversion: encode the sample messages with the vocabulary held by `cv`.
sms_cv = cv.transform(raw_documents=sms)
sms_cv.shape

(2, 6494)

In [144]:
# Predict the label of each encoded sample message.
lrc.predict(sms_cv)

array([0, 1], dtype=int64)

- 베스트 파라미터 찾기

In [145]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [146]:
pipeline = Pipeline([('CVECT', cv), ('LRC', lrc)])
lrc = LogisticRegression(random_state=2023)
params = {
    'CVECT__ngram_range': [(1,1), (1,2)],
    'LRC__C': [0.1, 1, 10]
}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 12.5 s
Wall time: 3.61 s


In [147]:
# Parameter combination selected by the grid search.
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'LRC__C': 10}

In [149]:
pipeline = Pipeline([('CVECT', cv), ('LRC', lrc)])
lrc = LogisticRegression(random_state=2023)
params = {
    'CVECT__ngram_range': [(1,1), (1,2)],
    'LRC__C': [5, 8, 10, 12, 20]
}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 24.4 s
Wall time: 7.03 s


In [150]:
# Parameter combination selected by the grid search.
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'LRC__C': 8}

In [151]:
# Score the best pipeline found by the grid search on the held-out test texts.
grid_pipe.best_estimator_.score(X_test, y_test)

0.9738878143133463