In [1]:
from sklearn.feature_extraction.text import *
##CountVectorizer,TfidfVectorizer 사용, 특성추출
#CountVectorizer : 각 텍스트에서 단어 출현 횟수를 카운팅한 벡터
#TfidfVectorizer : TF-IDF라는 값을 사용하여 CountVectorizer의 단점을 보완함
#                   빈도수와 역빈도수를 함께 고려
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
spam_df = pd.read_csv('./spamData/spam.csv', encoding='latin1')

In [3]:
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### extraction!(reg) => not null, word

In [10]:
#결측치를 빼보자.
spam_df = spam_df[spam_df['v2'].notnull()]
spam_df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
#필요없는 컬럼을 지워보자.
spam_df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],
             inplace=True)

In [9]:
#중복값 제거
spam_df.drop_duplicates(subset=['v2'],inplace=True)
spam_df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#정규식을 사용해서 영어제외 전부 없애보자.(특수문자 등)


spam_df['v2'] = spam_df['v2'].apply(lambda x: re.sub(r'[^a-zA-Z]+'," ",x))
spam_df.head()

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final ...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I don t think he goes to usf he lives arou...


### classification => train/test data

In [12]:
#test,train데이터를 나누어보자.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(spam_df.iloc[:,1],#문제v2 
                                                    spam_df.iloc[:,0],#정답v1 
                                                    test_size=0.4, 
                                                    random_state=42)

In [14]:
print('X_train: ',X_train.shape, 'y_train: ',y_train.shape, 'X_test: ',X_test.shape,'y_test: ',y_test.shape)

X_train:  (3101,) y_train:  (3101,) X_test:  (2068,) y_test:  (2068,)


### stopwords & vectorize => choice(Counter, Tf-idf) : fit(train) —> transform(train,test)

In [15]:
tfidf = TfidfVectorizer(min_df=5,
                       lowercase=True,
                       stop_words='english')

In [16]:
#fit 시켜보자.
tfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [17]:
#문제들을 transform해보자.
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

In [18]:
print(X_train.shape, X_test.shape)

(3101, 928) (2068, 928)


In [19]:
tfidf.inverse_transform(X_test)

[array(['download', 'did'], dtype='<U15'),
 array(['wid', 'wat', 'wana', 'ur', 'swt', 'smile', 'red', 'plz', 'pass',
        'orange', 'miss', 'luv', 'hot', 'face', 'dis', 'blue'],
       dtype='<U15'),
 array(['ok'], dtype='<U15'),
 array(['later', 'ill', 'film'], dtype='<U15'),
 array(['sorry', 'phone', 'ok', 'left', 'date'], dtype='<U15'),
 array(['urgent', 'rate', 'prize', 'nd', 'national', 'mobile', 'contact',
        'caller', 'bt', 'box', 'bonus', 'awarded', 'attempt'], dtype='<U15'),
 array(['school', 'plus', 'offer', 'got'], dtype='<U15'),
 array(['week', 'ringtone', 'new', 'mobile', 'gr', 'direct', 'club'],
       dtype='<U15'),
 array(['week', 'know', 'going', 'don', 'da'], dtype='<U15'),
 array(['started', 'office', 'da'], dtype='<U15'),
 array(['waiting', 'ur', 'test', 'tell', 'reply', 'number', 'lt', 'gt',
        'face'], dtype='<U15'),
 array(['set', 'big'], dtype='<U15'),
 array(['wow', 'win', 'uk', 'txt', 'tickets', 'info', 'club', 'book'],
       dtype='<U15'),
 arra

### ml model fit(train) => predict(test) => accuracy check

In [20]:
#정확도 분석을 해보자.
SA_lr = LogisticRegression(random_state= 0)

In [21]:
#train문제(x)와 정답(y)을 fit시켜보자.
SA_lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
#test문제의 예측값을 변수에 넣어보자.
test_predict = SA_lr.predict(X_test)

In [23]:
#test정답에 대한 문제예측의 정확도를 print해보자.
print('스팸 메일 분석 정확도', round(accuracy_score(y_test, test_predict), 5))

스팸 메일 분석 정확도 0.96277


### 하나의 메일에 대해 스팸인지 아닌지 판별해보자.
### one mail => predict!!

In [34]:
#v2컬럼의 15번째 행을 변수에 넣어 보자.
sp = spam_df['v2'][15]
sp

'XXXMobileMovieClub To use your credit click the WAP link in the next txt message or click here http wap xxxmobilemovieclub com n QJKGIGHJJGCBL'

In [26]:
#정규식을 사용하여 영어외 문자 제거.
sp2 = re.sub(r'[^[a-zA-Z]*$', "", sp)
sp2

'XXXMobileMovieClub To use your credit click the WAP link in the next txt message or click here http wap xxxmobilemovieclub com n QJKGIGHJJGCBL'

In [27]:
#transform시키자.
sp_tfidf = tfidf.transform([sp2])
sp_tfidf

<1x928 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [28]:
#스팸인지 아닌지 예측
sp2_predict = SA_lr.predict(sp_tfidf)
sp2_predict, sp2_predict[0]

(array(['spam'], dtype=object), 'spam')

In [29]:
if(sp2_predict[0] == 'spam'):
    print(sp2, '==> spam mail')
else:
    print(sp2, '==> ham mail')

XXXMobileMovieClub To use your credit click the WAP link in the next txt message or click here http wap xxxmobilemovieclub com n QJKGIGHJJGCBL ==> spam mail


### 모델 저장

In [36]:
import pickle
import joblib

In [37]:
tfidf_fit = tfidf.fit(spam_df['v2'])
tfidf_fit

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [38]:
saved_model = pickle.dumps(SA_lr)

In [39]:
lr_from_pickle = pickle.loads(saved_model)
lr_from_pickle.predict(sp_tfidf[0])

array(['spam'], dtype=object)

In [40]:
joblib.dump(SA_lr, 'spam_SA_lr.pkl')

['spam_SA_lr.pkl']

In [41]:
lr_from_joblib = joblib.load('spam_SA_lr.pkl')
lr_from_joblib

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
lr_from_joblib.predict(sp_tfidf[0])

array(['spam'], dtype=object)