# NLP Basic Assignment
## NLP 맛보기 - spam.csv를 가지고 유의미한 해석을 도출해주세요!

In [39]:
# 필요한 모듈 불러오기
import pandas as pd

## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다. 물론 7주차 주제가 텍스트 기초인 만큼 텍스트만 활용하셔도 되고, 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다 :)

In [40]:
# Load the labelled spam dataset from 'spam.csv' in the working directory.
spam = pd.read_csv('spam.csv')

In [41]:
# Peek at the raw text (column v2) of the row at position 5.
spam.iloc[5]['v2']

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [42]:
# Encode the label column in place: 'ham' becomes 0, every other value becomes 1.
spam.v1 = spam.v1.apply(lambda x : 0 if x == 'ham' else 1) # map spam to 1 and ham to 0

## Tokenizing


In [43]:
# 특수문자 제거

In [44]:
import re 
def cleanText(x):
    """Return *x* lower-cased, with every non-letter replaced by a space.

    Any character outside A-Z / a-z (digits, punctuation, whitespace,
    non-ASCII symbols) is substituted one-for-one with a single space, so
    the result has the same length as the input.
    """
    letters_only = re.compile('[^a-zA-Z]').sub(' ', x)
    return letters_only.lower()

# Clean every message in place: after this, v2 holds only lower-case letters and spaces.
spam.v2 = spam.v2.apply(cleanText)

In [45]:
# Work on copies so later steps can modify `test` without touching `spam`.
test = spam.copy()
# Second copy; not used in the portion of the notebook shown here.
test1 = spam.copy()

In [56]:
# Display the DataFrame. This cell (In [56]) was executed after the tokenizing
# cell, so the recorded output shows v2 as token lists.
test

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"
...,...,...
5567,1,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,0,"[b, going, esplanade, fr, home]"
5569,0,"[pity, mood, suggestions]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


In [46]:
import nltk

In [47]:
# Example code
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK packages used below: 'punkt' and 'stopwords'.
nltk.download('punkt')
nltk.download('stopwords') # fetch the stop-word lists that ship with nltk
english_stops = list(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# Tokenize every cleaned message and drop English stop words.
# Membership is tested against a set built once: scanning the `english_stops`
# list for every token is a linear search each time and gives the same result.
stop_words = set(english_stops)

token = [
    [word for word in word_tokenize(sentence) if word not in stop_words]
    for sentence in test.v2
]

# Replace the cleaned strings in v2 with their token lists.
test.v2 = token

In [49]:
# Inspect the tokenized column.
test.v2

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, wkly, comp, win, fa, cup, final,...
3           [u, dun, say, early, hor, u, c, already, say]
4          [nah, think, goes, usf, lives, around, though]
                              ...                        
5567    [nd, time, tried, contact, u, u, pound, prize,...
5568                      [b, going, esplanade, fr, home]
5569                            [pity, mood, suggestions]
5570    [guy, bitching, acted, like, interested, buyin...
5571                                   [rofl, true, name]
Name: v2, Length: 5572, dtype: object

## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram, GloVe, FastText가 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [32]:
# 필요한 라이브러리들 올리기
# 사용할 라이브러리들이 import되어있어야 사용할 수 있습니다. 

import gensim
from gensim.models import Word2Vec, FastText

import scipy.stats as st

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
from sklearn.feature_extraction.text import CountVectorizer

# Token frequencies for ham (label 0) messages.
# Each token is handed to CountVectorizer as its own one-word document, so the
# column sums of the resulting matrix are corpus-wide token counts (after the
# vectorizer's own English stop-word filter, limited by max_features=500).
ct_vector1 = CountVectorizer(max_features=500, stop_words='english', lowercase=False)
# Flatten the per-message token lists in linear time. The previous
# sum(lists, []) re-copied the accumulated list on every addition (quadratic).
corpus_ham = [word for tokens in test[test.v1 == 0]['v2'] for word in tokens]
ct_ham = ct_vector1.fit_transform(corpus_ham)

In [58]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(ct_vector1, 'get_feature_names_out'):
    ham_words = ct_vector1.get_feature_names_out()
else:
    ham_words = ct_vector1.get_feature_names()

# Column sums of the document-term matrix give the total count per vocabulary word.
count_ham = pd.DataFrame({'word': ham_words,
                          'count': ct_ham.sum(axis=0).flat})
count_ham.sort_values('count', ascending=False).head(10)  # top 10 words



Unnamed: 0,word,count
161,gt,318
247,lt,316
300,ok,287
157,got,245
453,ur,241
215,know,236
155,good,233
232,like,232
68,come,228
84,day,216


In [59]:
# Token frequencies for spam (label 1) messages.
# Each token is handed to CountVectorizer as its own one-word document, so the
# column sums of the resulting matrix are corpus-wide token counts (after the
# vectorizer's own English stop-word filter, limited by max_features=500).
ct_vector2 = CountVectorizer(max_features=500, stop_words='english', lowercase=False)
# Flatten the per-message token lists in linear time. The previous
# sum(lists, []) re-copied the accumulated list on every addition (quadratic).
corpus_spam = [word for tokens in test[test.v1 == 1]['v2'] for word in tokens]
ct_spam = ct_vector2.fit_transform(corpus_spam)

In [60]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(ct_vector2, 'get_feature_names_out'):
    spam_words = ct_vector2.get_feature_names_out()
else:
    spam_words = ct_vector2.get_feature_names()

# Column sums of the document-term matrix give the total count per vocabulary word.
count_spam = pd.DataFrame({'word': spam_words,
                           'count': ct_spam.sum(axis=0).flat})
count_spam.sort_values('count', ascending=False).head(10)  # top 10 words



Unnamed: 0,word,count
147,free,228
436,txt,170
446,ur,144
252,mobile,129
411,text,126
392,stop,126
65,claim,113
343,reply,104
486,www,98
320,prize,93


In [63]:
# Skip-gram Word2Vec trained on the tokenized spam (label 1) messages only.
model = Word2Vec(
    sentences=test.loc[test.v1 == 1, 'v2'],
    vector_size=100,
    window=2,
    min_count=5,
    sg=1,  # 1 selects skip-gram
    epochs=200,
)

In [64]:
# Persist the trained model to the file 'w2v_model_spam' in the working directory.
model.save('w2v_model_spam')

In [65]:

# Reload the saved model from 'w2v_model_spam' under a separate name.
w2v_model_spam = Word2Vec.load('w2v_model_spam')

In [66]:
# Ask which of the ten most frequent spam tokens fits least with the others.
# 'ur' replaces the original 'ut', a typo: the top-10 spam token table in this
# notebook contains 'ur' and no 'ut'.
w2v_model_spam.wv.doesnt_match("free txt ur stop mobile text claim reply www prize".split())

'prize'

In [67]:
# List the words most similar to 'free' in the spam skip-gram model.
w2v_model_spam.wv.most_similar('free')

[('minutes', 0.417833149433136),
 ('tomorrow', 0.378485769033432),
 ('tariffs', 0.3775891959667206),
 ('phones', 0.3738391101360321),
 ('sipix', 0.37359747290611267),
 ('sonyericsson', 0.37050098180770874),
 ('linerental', 0.36251503229141235),
 ('stoptxt', 0.36221882700920105),
 ('bluetooth', 0.3587864339351654),
 ('freemsg', 0.3535122573375702)]

In [71]:
# FastText
from gensim.models import FastText

# spam
# NOTE(review): no `epochs` argument is passed here and window=5, whereas the
# Word2Vec model in this notebook uses epochs=200 and window=2. The nearly
# identical ~0.9995 similarities in the recorded output may indicate
# undertraining; consider aligning the settings before comparing the two models.
fast_model = FastText(test[test.v1 == 1]['v2'], vector_size=100, window=5, min_count=5, workers=4, sg=1)

In [72]:
# Same 'free' query against the FastText model, for comparison with Word2Vec.
fast_model.wv.most_similar("free")

[('freephone', 0.9996470212936401),
 ('phones', 0.9995802044868469),
 ('txts', 0.9995203018188477),
 ('one', 0.9995162487030029),
 ('weeks', 0.9995094537734985),
 ('fone', 0.9995040893554688),
 ('ringtone', 0.9995014071464539),
 ('spree', 0.9994945526123047),
 ('phone', 0.9994620084762573),
 ('new', 0.9994540214538574)]