# 필수 라이브러리 로딩

In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

mpl.rc('font', family='malgun gothic') # 폰트 설정
mpl.rc('axes', unicode_minus=False) # 유니코드에서 음수 부호 설정

# 차트 스타일 설정
sns.set(font="malgun gothic", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc("figure", figsize=(10,8))

warnings.filterwarnings("ignore")

# 데이터로딩

In [3]:
data = pd.read_csv('C:/k_digital/source/Machine Learning with Python/spam.csv')

In [4]:
data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 탐색적 데이터 분석 : EDA

## target : 목표변수 or 종속변수

In [5]:
data['target'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
data['target'].value_counts()

ham     4827
spam     747
Name: target, dtype: int64

- spam : 스팸 문자, ham : 스팸이 아닌 문자
- NlTK(Natural Language Toolkit) : 자연어 처리를 위해 만든 도구(패키지)
- 주요기능 : 말뭉치, 토큰 생성, 형태소 분석, 품사 태깅

## 말뭉치(corpus)
- 자연어 분석을 위해 만든 문서 집합

In [23]:
import nltk 
nltk.download('book', quiet=True)
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [None]:
# gutenberg 말뭉치 : 저작권이 만료된 소설같은 작품들을 가지고 있는 문서 집합
nltk.corpus.gutenberg.fileids()

In [None]:
shakespeare = nltk.corpus.gutenberg.raw('shakespeare-hamlet.txt')
print(shakespeare[:1000])

## 토큰생성 : 문자열을 가장 작은 단위인 토큰으로 나누는 작업

In [None]:
from nltk.tokenize import sent_tokenize
print(sent_tokenize(shakespeare[:1000]))

In [None]:
from nltk.tokenize import word_tokenize
print(word_tokenize(shakespeare[50:100]))

In [None]:
from nltk.tokenize import RegexpTokenizer
# \w : 영문자, 숫자, _(밑줄)
re = RegexpTokenizer('[\w]+')
re.tokenize(shakespeare[50:100])

## 형태소(morpheme) 분석

- 형태소 : 의미가 있는 가장 작은 말의 단위
- 형태소 분석 : 단어에서 어근, 접두사, 접미사, 품사 같은 속성을 파악하는 작업
  1. 어간추출 : 단어의 기본형
  2. 원형복원 : 같은 의미를 가진 여러 단어를 통합하는 작업
  3. 품사부착

In [20]:
from nltk.stem import PorterStemmer, LancasterStemmer

st1 = PorterStemmer()
st2 = LancasterStemmer()

words = ['fly', 'files', 'flying', 'flew', 'flown']

print('Porter Stemmer :', [st1.stem(w) for w in words])
print('Lancaster Stemmer :', [st2.stem(w) for w in words])

Porter Stemmer : ['fli', 'file', 'fli', 'flew', 'flown']
Lancaster Stemmer : ['fly', 'fil', 'fly', 'flew', 'flown']


In [21]:
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()

[lm.lemmatize(w, pos='v') for w in words]

['fly', 'file', 'fly', 'fly', 'fly']

In [None]:
nltk.help.upenn_tagset('VB')

from nltk.tag import pos_tag

text = '''Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, 
seemed to unite some of the best blessings of existence'''

tag_list = pos_tag(word_tokenize(text))
tag_list

In [None]:
nonus_list = [t[0] for t in tag_list if t[1] == 'NN']
nonus_list

# 전처리

## 특수문자(기호) 제거

In [8]:
 # 파이썬에 내장된 패키지 string는 문자열을 처리하는 다양한 함수를 제공한다.
import string

In [9]:
# 특수기호 목록 확인
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
sample = data['text'].loc[0]
sample

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [11]:
for i in sample:
    print(i)

G
o
 
u
n
t
i
l
 
j
u
r
o
n
g
 
p
o
i
n
t
,
 
c
r
a
z
y
.
.
 
A
v
a
i
l
a
b
l
e
 
o
n
l
y
 
i
n
 
b
u
g
i
s
 
n
 
g
r
e
a
t
 
w
o
r
l
d
 
l
a
 
e
 
b
u
f
f
e
t
.
.
.
 
C
i
n
e
 
t
h
e
r
e
 
g
o
t
 
a
m
o
r
e
 
w
a
t
.
.
.


In [12]:
for i in sample:
    if i not in string.punctuation:
        print(i)

G
o
 
u
n
t
i
l
 
j
u
r
o
n
g
 
p
o
i
n
t
 
c
r
a
z
y
 
A
v
a
i
l
a
b
l
e
 
o
n
l
y
 
i
n
 
b
u
g
i
s
 
n
 
g
r
e
a
t
 
w
o
r
l
d
 
l
a
 
e
 
b
u
f
f
e
t
 
C
i
n
e
 
t
h
e
r
e
 
g
o
t
 
a
m
o
r
e
 
w
a
t


In [None]:
data.head()

In [13]:
new_text = []
for i in sample:
    if i not in string.punctuation:
        new_text.append(i)
        
new_text

['G',
 'o',
 ' ',
 'u',
 'n',
 't',
 'i',
 'l',
 ' ',
 'j',
 'u',
 'r',
 'o',
 'n',
 'g',
 ' ',
 'p',
 'o',
 'i',
 'n',
 't',
 ' ',
 'c',
 'r',
 'a',
 'z',
 'y',
 ' ',
 'A',
 'v',
 'a',
 'i',
 'l',
 'a',
 'b',
 'l',
 'e',
 ' ',
 'o',
 'n',
 'l',
 'y',
 ' ',
 'i',
 'n',
 ' ',
 'b',
 'u',
 'g',
 'i',
 's',
 ' ',
 'n',
 ' ',
 'g',
 'r',
 'e',
 'a',
 't',
 ' ',
 'w',
 'o',
 'r',
 'l',
 'd',
 ' ',
 'l',
 'a',
 ' ',
 'e',
 ' ',
 'b',
 'u',
 'f',
 'f',
 'e',
 't',
 ' ',
 'C',
 'i',
 'n',
 'e',
 ' ',
 't',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'g',
 'o',
 't',
 ' ',
 'a',
 'm',
 'o',
 'r',
 'e',
 ' ',
 'w',
 'a',
 't']

In [14]:
s = ['a', 'p', 'p', 'l', 'e']
'_'.join(s)

'a_p_p_l_e'

In [15]:
# 문장형태
new_sample = []
for i in sample:
    if i not in string.punctuation:
        new_sample.append(i)
new_sample = ''.join(new_sample)
new_sample

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [16]:
def remove_punc(x):
    new_sample = []
    for i in x:
        if i not in string.punctuation:
            new_sample.append(i)
    new_sample = ''.join(new_sample)
    return new_sample

In [None]:
remove_punc(sample)

In [None]:
remove_punc(data['text'])

In [17]:
data['text'] = data['text'].apply(remove_punc)

In [18]:
data.head()

Unnamed: 0,target,text
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...


## 전처리-불용어(stopword)
- 불용어 : 자연어 분석을 할 때 분석에 의미가 없는 단어
- 불용어 사전 : nltk 

In [24]:
# 불용어 목록 확인
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [26]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [27]:
sample = data['text'].loc[0]
sample

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [28]:
sample.split()

['Go',
 'until',
 'jurong',
 'point',
 'crazy',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'there',
 'got',
 'amore',
 'wat']

In [29]:
new_sample = []
for w in sample.split():
    if w.lower() not in stopwords.words('english'): #불용어가 아니라면
            new_sample.append(w.lower())
new_sample = ' '.join(new_sample)
new_sample

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [30]:
def stop_words(x):
    new_sample = []
    for w in x.split():
        if w.lower() not in stopwords.words('english'): #불용어가 아니라면
            new_sample.append(w.lower())
    new_sample = ' '.join(new_sample)
    return new_sample

In [31]:
data['text'] =data['text'].apply(stop_words)
data['text']

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5569    2nd time tried 2 contact u u 짙750 pound prize ...
5570                          체 b going esplanade fr home
5571                          pity mood soany suggestions
5572    guy bitching acted like id interested buying s...
5573                                       rofl true name
Name: text, Length: 5574, dtype: object

## 전처리 - 정형화처리(target)

In [None]:
sample = pd.Series(['a', 'b', 'c'])
# map()
# zip()
sample.map({'a':'apple', 'b':'banana', 'c':'candy'})

In [32]:
data['target'] = data['target'].map({'spam':1, 'ham':0})
data['target']

0       0
1       0
2       1
3       0
4       0
       ..
5569    1
5570    0
5571    0
5572    0
5573    0
Name: target, Length: 5574, dtype: int64

## 카운트 기반의 벡터화 처리
- 카운트 기반 벡터화 : 문자를 개수 기반으로 벡터화하는 방식
- 데이터 전체에 존재하는 모든 단어들을 사전처럼 모은 다음에 각각의 인덱스를 부여하고, 문장마다 속한 단어가 있는 인덱스를 카운트하는 방식이다.

In [33]:
X = data['text']
y = data['target']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
# 객체 생성
cv = CountVectorizer()
# 모델 학습
cv.fit(X)
# 단어와 인덱스 출력
cv.vocabulary_

{'go': 3769,
 'jurong': 4671,
 'point': 6416,
 'crazy': 2471,
 'available': 1379,
 'bugis': 1848,
 'great': 3866,
 'world': 9171,
 'la': 4831,
 'buffet': 1846,
 'cine': 2186,
 'got': 3826,
 'amore': 1146,
 'wat': 8933,
 'ok': 5978,
 'lar': 4870,
 'joking': 4639,
 'wif': 9066,
 'oni': 6010,
 'free': 3555,
 'entry': 3137,
 'wkly': 9123,
 'comp': 2302,
 'win': 9080,
 'fa': 3274,
 'cup': 2527,
 'final': 3399,
 'tkts': 8366,
 '21st': 433,
 'may': 5318,
 '2005': 420,
 'text': 8201,
 '87121': 840,
 'receive': 6816,
 'questionstd': 6708,
 'txt': 8578,
 'ratetcs': 6760,
 'apply': 1232,
 '08452810075over18s': 71,
 'dun': 2988,
 'say': 7176,
 'early': 3008,
 'hor': 4200,
 'already': 1119,
 'nah': 5665,
 'dont': 2894,
 'think': 8277,
 'goes': 3783,
 'usf': 8727,
 'lives': 5033,
 'around': 1283,
 'though': 8296,
 'freemsg': 3563,
 'hey': 4095,
 'darling': 2591,
 'weeks': 8988,
 'word': 9157,
 'back': 1431,
 'id': 4321,
 'like': 4983,
 'fun': 3630,
 'still': 7843,
 'tb': 8131,
 'xxx': 9296,
 'std': 

In [None]:
#X = cv.transform(X)
print(X)

In [35]:
# 0번때 행의 데이터 확인
data.text[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [36]:
print(cv.vocabulary_['point'])

6416


In [37]:
print(cv.vocabulary_['go'])

3769


# 모델링 및 예측/평가

## 훈련세트와 테스트 세트로 분리

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

## MultinomialNB : 다항 분포에 대한 나이브 베이스 알고리즘
- 정규분포
- 베르누이분포

In [42]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)

ValueError: could not convert string to float: 'ur awarded city break could win 짙200 summer shopping spree every wk txt store 88039skilgmetscs087147403231winawkage16짙150perwksub'

In [43]:
from sklearn.metrics import accuracy_score, cinfusion_matrix

# 정확도
print(accuracy_score_score(y_test, pred))

ImportError: cannot import name 'cinfusion_matrix' from 'sklearn.metrics' (C:\Anaconda3\lib\site-packages\sklearn\metrics\__init__.py)