### SMS Spam 분류

In [53]:
import pandas as pd
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [54]:
df = pd.read_csv(url, encoding='latin1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- 데이터 처리

In [55]:
# Selection
df = df[['v1', 'v2']]

In [56]:
# 결측치 확인
df.isna().sum().sum()

0

In [57]:
# 중복 데이터 확인
print(df.shape, df.v2.nunique())

(5572, 2) 5169


In [58]:
# 중복 데이터 제거
df.drop_duplicates(subset=['v2'], inplace=True)
df.shape

(5169, 2)

In [59]:
# Label Encoding, ['ham', 'spam']  --> [0, 1] / LabelEncoder 사용 가능
df.v1 = df.v1.replace(['ham', 'spam'], [0, 1])
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [60]:
# Ham/Spam 분포 > 두개의 분포가 불균일 > 안 좋음.
df.v1.value_counts()

0    4516
1     653
Name: v1, dtype: int64

- 텍스트 전처리

In [61]:
# 구둣점, 숫자 제거 > sklearn에서 white space 처리 함.
df.v2 = df.v2.str.replace('[^A-Za-z]', ' ', regex=True)
df.v2[0]

'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

- 데이터셋 분리

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.v2.values, df.v1.values, stratify=df.v1.values, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4135,), (1034,), (4135,), (1034,))

In [63]:
X_train

array(['Then any special there ',
       'You only hate me  You can call any but you didnt accept even a single call of mine  Or even you messaged',
       'Enjoy the showers of possessiveness poured on u by ur loved ones  bcoz in this world of lies  it is a golden gift to be loved truly  ',
       ..., 'Dunno da next show aft   is      Toa payoh got     ',
       'I had been hoping i would not have to send you this message  My rent is due and i dont have enough for it  My reserves are completely gone  Its a loan i need and was hoping you could her  The balance is   lt   gt    Is there a way i could get that from you  till mid march when i hope to pay back ',
       'You in your room  I need a few'], dtype=object)

- Text Encoding

In [64]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(stop_words='english')

In [65]:
# 이렇게 하면 절대로 안됨.
# train/test dataset에서 사용되는 단어가 다름 > indexing이 달라짐. ex) train explain -> 100 <- free test 
X_train_cv = cvect.fit_transform(X_train)   # X_train을 중심으로 단어 숙지 + indexing
X_test_cv = cvect.fit_transform(X_test)     # X_test을 중심으로 단어 숙지 + indexing
X_train_cv.shape, X_test_cv.shape
# 6494, 2866 단어 셋이 다름.?

((4135, 6494), (1034, 2866))

In [66]:
# 변환된 단어 세트가 train과 test가 동일해야 함.
cvect.fit(X_train)      # X_train을 기준으로 나눠야 되는 단어를 먼저 숙지
X_train_cv = cvect.transform(X_train)
X_test_cv = cvect.transform(X_test)
X_train_cv.shape, X_test_cv.shape
# 6494 단어 셋이 같음.

((4135, 6494), (1034, 6494))

In [46]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

- 학습 및 평가

In [67]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2023)

In [68]:
lrc.fit(X_train_cv, y_train)

In [69]:
lrc.score(X_test_cv, y_test)

0.9709864603481625

- Bigram

In [27]:
cvect2 = CountVectorizer(stop_words='english', ngram_range=(1,2))
cvect2.fit(X_train)
X_train_cv2 = cvect2.transform(X_train)
X_test_cv2 = cvect2.transform(X_test)
X_train_cv2.shape, X_test_cv2.shape
# 4135(train의 문장수) + 1034(test의 문장수) = 5169(문장수)

((4135, 28822), (1034, 28822))

In [28]:
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train_cv2, y_train)
lrc.score(X_test_cv2, y_test)
# 안 좋은 경우도 생김

0.9680851063829787

- TfidfVectorizer

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(X_train)
X_train_tv = tvect.transform(X_train)
X_test_tv = tvect.transform(X_test)
X_train_tv.shape, X_test_tv.shape

((4135, 6494), (1034, 6494))

In [30]:
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train_tv, y_train)
lrc.score(X_test_tv, y_test)

0.9458413926499033

- TfidfVectorizer + Bigram

In [31]:
tvect2 = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tvect2.fit(X_train)
X_train_tv2 = tvect2.transform(X_train)
X_test_tv2 = tvect2.transform(X_test)
X_train_tv2.shape, X_test_tv2.shape

((4135, 28822), (1034, 28822))

In [32]:
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train_tv2, y_train)
lrc.score(X_test_tv2, y_test)

0.9410058027079303

- 실제 데이터로 검증

In [35]:
df = pd.read_csv(url, encoding='latin1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [71]:
sms = [df.v2[0], df.v2[2]]
sms

['Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   ',
 'Free entry in   a wkly comp to win FA Cup final tkts   st May       Text FA to       to receive entry question std txt rate T C s apply            over   s']

In [70]:
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train_cv, y_train)

In [72]:
# 텍스트 전처리, 숫자와 구둣점 제거
import re
sms = map(lambda x : re.sub('[^A-Za-z]', ' ', x), sms)

In [73]:
sms

<map at 0x28783269150>

In [74]:
# feature 변환
sms_cv = cvect.transform(sms)
sms_cv.shape

(2, 6494)

In [75]:
# 예측
lrc.predict(sms_cv)

array([0, 1], dtype=int64)

- 베스트 파라메터 찾기

In [76]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [77]:
pipeline = Pipeline([('CVECT', cvect), ('LRC', lrc)])       # CVECT의 ouput이 LRC의 input / 생짜로 다시 함.
lrc = LogisticRegression(random_state=2023)
params = {
    'CVECT__ngram_range' : [(1, 1), (1, 2)],        # unigram, bigram
    'LRC__C': [0.1, 1, 10]
}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)       # X_train을 그냥 사용하여 pipeline의 CVECT에 적용

CPU times: total: 13.2 s
Wall time: 3.83 s


In [78]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'LRC__C': 10}

In [79]:
pipeline = Pipeline([('CVECT', cvect), ('LRC', lrc)])       
lrc = LogisticRegression(random_state=2023)
params = {
    'CVECT__ngram_range' : [(1, 1), (1, 2)],        # unigram, bigram
    'LRC__C': [5, 8, 10, 12, 20]
}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 24.7 s
Wall time: 7.15 s


In [80]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'LRC__C': 8}

In [81]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.9738878143133463