In [133]:
from sklearn.feature_extraction.text import *
##CountVectorizer,TfidfVectorizer 사용, 특성추출
#CountVectorizer : 각 텍스트에서 단어 출현 횟수를 카운팅한 벡터
#TfidfVectorizer : TF-IDF라는 값을 사용하여 CountVectorizer의 단점을 보완함
#                   빈도수와 역빈도수를 함께 고려
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [71]:
spam_df = pd.read_csv('./spamData/spam.csv', encoding='latin1')

In [72]:
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [73]:
spam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.8+ KB


### extraction!(reg) => not null, word

In [74]:
#결측치를 빼보자.
spam_df = spam_df[spam_df['v2'].notnull()]
spam_df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [75]:
spam_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 261.2+ KB


In [76]:
#필요없는 컬럼을 지워보자.
spam_df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],
             inplace=True)

In [134]:
#중복값 제거
spam_df.drop_duplicates(subset=['v2'], inplace=True)
spam_df

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final ...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I don t think he goes to usf he lives arou...
...,...,...
5567,spam,This is the nd time we have tried contact u U ...
5568,ham,Will b going to esplanade fr home
5569,ham,Pity was in mood for that So any other suggest...
5570,ham,The guy did some bitching but I acted like i d...


In [77]:
spam_df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [78]:
#정규식을 사용해서 영어제외 전부 없애보자.(특수문자 등)


spam_df['v2'] = spam_df['v2'].apply(lambda x: re.sub(r'[^a-zA-Z]+'," ",x))
spam_df.head()

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final ...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I don t think he goes to usf he lives arou...


### classification => train/test data

In [79]:
#test,train데이터를 나누어보자.

In [92]:
spam_df.iloc[:,1]

0       Go until jurong point crazy Available only in ...
1                                Ok lar Joking wif u oni 
2       Free entry in a wkly comp to win FA Cup final ...
3            U dun say so early hor U c already then say 
4       Nah I don t think he goes to usf he lives arou...
                              ...                        
5567    This is the nd time we have tried contact u U ...
5568                   Will b going to esplanade fr home 
5569    Pity was in mood for that So any other suggest...
5570    The guy did some bitching but I acted like i d...
5571                            Rofl Its true to its name
Name: v2, Length: 5572, dtype: object

In [95]:
X_train, X_test, y_train, y_test = train_test_split(spam_df.iloc[:,1],#문제 
                                                    spam_df.iloc[:,0],#정답 
                                                    test_size=0.4, 
                                                    random_state=42)

In [96]:
print('X_train: ',X_train.shape, 'y_train: ',y_train.shape, 'X_test: ',X_test.shape,'y_test: ',y_test.shape)

X_train:  (3343,) y_train:  (3343,) X_test:  (2229,) y_test:  (2229,)


### stopwords & vectorize => choice(Counter, Tf-idf) : fit(train) —> transform(train,test)

In [135]:
tfidf = TfidfVectorizer(min_df=5,
                        lowercase = True, 
                        stop_words='english')

In [136]:
tfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [137]:
X_train =  tfidf.transform(X_train)
X_test= tfidf.transform(X_test)

In [138]:
print(X_train.shape, X_test.shape)

(3343, 1022) (2229, 1022)


In [139]:
tfidf.inverse_transform(X_train)

[array(['need'], dtype='<U15'),
 array(['text', 'heard', 'haha'], dtype='<U15'),
 array(['love', 'hear'], dtype='<U15'),
 array(['phone', 'pete', 'college'], dtype='<U15'),
 array(['having', 'doing', 'dinner'], dtype='<U15'),
 array(['text', 'need', 'help', 'heart', 'free', 'apply'], dtype='<U15'),
 array(['trying', 'sure', 'hold'], dtype='<U15'),
 array(['ve', 'text', 'phone', 'message', 'age'], dtype='<U15'),
 array(['ve', 'sent', 'send'], dtype='<U15'),
 array(['yar', 'willing', 'wat', 'thk', 'quite', 'lor', 'got', 'darren',
        'bring', 'ask', 'aft'], dtype='<U15'),
 array(['jus'], dtype='<U15'),
 array(['going', 'dat', 'aft'], dtype='<U15'),
 array(['valued', 'ur', 'send', 'person', 'married', 'luv', 'lt', 'love',
        'lose', 'll', 'gt', 'frnds', 'feb', 'dis', 'day', 'comes'],
       dtype='<U15'),
 array(['whats', 'things', 'hope', 'great', 'going', 'fine', 'day',
        'coming', 'busy'], dtype='<U15'),
 array(['text', 'aight', 'address'], dtype='<U15'),
 array(['yester

In [140]:
tfidf.inverse_transform(X_test)

[array(['wife', 'just', 'hw', 'funny', 'choose'], dtype='<U15'),
 array(['thinking', 'think', 'sent', 'school', 'cost', 'contact'],
       dtype='<U15'),
 array(['pobox', 'ls', 'know'], dtype='<U15'),
 array(['text', 'soon', 'ok', 'morning', 'll', 'let', 'know', 'getting'],
       dtype='<U15'),
 array(['www', 'win', 'weekly', 'vouchers', 'ur', 'txt', 'ppmx', 'music',
        'ldew', 'gift', 'free', 'entry', 'draw', 'congratulations', 'com',
        'cd', 'awarded', 'age'], dtype='<U15'),
 array(['text', 'll', 'let', 'know', 'carlos'], dtype='<U15'),
 array(['did'], dtype='<U15'),
 array(['message'], dtype='<U15'),
 array(['walk', 'street', 'road', 'right', 'lt', 'gt', 'cut'], dtype='<U15'),
 array(['shit'], dtype='<U15'),
 array(['tho', 'real', 'hungry'], dtype='<U15'),
 array(['unsubscribe', 'uk', 'tones', 'text', 'stop', 'new', 'hope',
        'help', 'free', 'content'], dtype='<U15'),
 array(['wif', 'lar', 'buying'], dtype='<U15'),
 array(['pick', 'finish', 'come'], dtype='<U15'),


### ml model fit(train) => predict(test) => accuracy check

In [141]:
SA_lr = LogisticRegression(random_state = 0)

In [142]:
SA_lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [143]:
test_predict = SA_lr.predict(X_test)

In [144]:
print('스팸 메일 분석 정확도 : ', round(accuracy_score(y_test, test_predict), 5))

스팸 메일 분석 정확도 :  0.96411


### one mail => predict!!

In [146]:
sp = spam_df['v2'][15]

In [147]:
sp2 = re.sub(r'[^[a-zA-Z]*$', "", sp)
sp2

'XXXMobileMovieClub To use your credit click the WAP link in the next txt message or click here http wap xxxmobilemovieclub com n QJKGIGHJJGCBL'

In [148]:
sp_tfidf = tfidf.transform([sp2])
sp_tfidf

<1x1022 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [149]:
sp2_predict = SA_lr.predict(sp_tfidf)
sp2_predict, sp2_predict[0]

(array(['spam'], dtype=object), 'spam')

In [150]:
if(sp2_predict[0] == 'spam'):
    print(sp2, '==> spam mail')
else:
    print(sp2, '==> ham mail')

XXXMobileMovieClub To use your credit click the WAP link in the next txt message or click here http wap xxxmobilemovieclub com n QJKGIGHJJGCBL ==> spam mail


### 모델 저장

In [151]:
import pickle
import joblib

In [153]:
tfidf_fit = tfidf.fit(spam_df['v2'])
tfidf_fit

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [154]:
saved_model = pickle.dumps(SA_lr)

In [155]:
lr_from_pickle = pickle.loads(saved_model)
lr_from_pickle.predict(sp_tfidf[0])

array(['spam'], dtype=object)

In [156]:
joblib.dump(SA_lr, 'spam_SA_lr.pkl')

['spam_SA_lr.pkl']

In [157]:
lr_from_joblib = joblib.load('spam_SA_lr.pkl')
lr_from_joblib

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [159]:
lr_from_joblib.predict(sp_tfidf[0])

array(['spam'], dtype=object)