##  TF-IDF를 적용하고 나이브베이즈 분류기로 예측

### CountVectorizer
- stop_words : 문자열 {‘english’}, 리스트 또는 None (디폴트)
- stop words 목록.‘english’이면 영어용 스탑 워드 사용.
- analyzer : 문자열 {‘word’, ‘char’, ‘char_wb’} 또는 함수
- 단어 n-그램, 문자 n-그램, 단어 내의 문자 n-그램
- tokenizer : 함수 또는 None (디폴트) 토큰 생성 함수 .
- token_pattern : string 토큰 정의용 정규 표현식
- ngram_range : (min_n, max_n) 튜플 n-그램 범위
- max_df : 정수 또는 [0.0, 1.0] 사이의 실수. 디폴트 1, 단어장에 포함되기 위한 최대 빈도
- min_df : 정수 또는 [0.0, 1.0] 사이의 실수. 디폴트 1, 단어장에 포함되기 위한 최소 빈도
- vocabulary : 사전이나 리스트단어장

In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("data/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
test = pd.read_csv("data/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
test["Phrase(origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
import re

def clean_text(phrase):
    phrase = phrase.replace("doesn't ", "does not ")
    phrase = phrase.replace("ca n't ", "can not ")
    phrase = phrase.replace(" n't ", " not ")
    phrase = re.sub("[a]{1,15}", 'a', phrase) # baaaaaaaaaad 와 같은 단어를 bad로 변경해 준다.
    phrase = re.sub("[o]{2,15}", 'oo', phrase) # gooooooood 과 같은 단어를 good으로 변경해 준다.

    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [7]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


## Stemming

In [8]:
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

stemmer = SnowballStemmer('english')
stemmer

def stem_phrase(phrase):
    stemmed_words = [stemmer.stem(w) for w in phrase.split(" ")]
    stemmed_phrase = " ".join(stemmed_words)
    
    return stemmed_phrase

tqdm.pandas(desc="Train Stemming...")
train["Phrase"].progress_apply(stem_phrase).head()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

tqdm.pandas(desc="Test Stemming...")
test["Phrase"].progress_apply(stem_phrase).head()

Train Stemming...: 100%|██████████| 156060/156060 [00:32<00:00, 4836.12it/s]
Test Stemming...:   1%|          | 575/66292 [00:00<00:11, 5749.54it/s]

(156060, 4)


Test Stemming...: 100%|██████████| 66292/66292 [00:13<00:00, 4893.53it/s]


PhraseId
156061    an intermitt pleas but most routin effort .
156062      an intermitt pleas but most routin effort
156063                                             an
156064         intermitt pleas but most routin effort
156065                intermitt pleas but most routin
Name: Phrase, dtype: object

### Vectorize phrases

In [9]:
# Tf-idf 벡터라이즈로 바꿔본다.
# word로 쪼개는 것과 캐릭터로 쪼개는 것을 둘 다 써서 합치는 게 점수가 더 좋아진다.

# n그램이 늘어나면 맥스피처도 늘려주는 게 좋다. 하이퍼파라메터 튜닝을 해서 찾아낸다.
# 1,1은 24개
# 넘파이의 베이스가 scipy다. 수학적 연산을 쓰고 싶을 때 사용한다.


from sklearn.feature_extraction.text import CountVectorizer
import nltk

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# binary=True/False
# lowercase=True/False
# ngram_range=(1, 1)
# stop_words=None

# vectorizer = CountVectorizer(max_features=1000)
vectorizer = CountVectorizer(max_features=1000000, min_df=2, ngram_range=(1, 3), tokenizer=nltk.word_tokenize)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x10f1260d0>, vocabulary=None)

In [10]:
vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x10f1260d0>, vocabulary=None)

In [11]:
vectorizer.vocabulary_.get('good')

87308

In [12]:
vectorizer.vocabulary_.get('bad')

41567

In [13]:
X_train = vectorizer.transform(train["Phrase"])
X_train

<156060x213510 sparse matrix of type '<class 'numpy.int64'>'
	with 2847598 stored elements in Compressed Sparse Row format>

In [14]:
# 벡터화 한 단어의 일부를 출력해 본다.
columns = vectorizer.get_feature_names()
pd.DataFrame(X_train[:100].toarray(), columns=columns).head()

Unnamed: 0,!,! ',! '',! -rrb-,! ?,! ? ',! ? -rrb-,#,# 3,# 9,...,"zoolander ,","zoolander , which",zoom,zucker,zucker brothers\/abrahams,zucker brothers\/abrahams films,zwick,"zwick ,","zwick , is",zzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_train.shape

(156060, 213510)

In [16]:
X_test = vectorizer.transform(test["Phrase"])
X_test.shape

(66292, 213510)

In [17]:
X_test.shape

(66292, 213510)

In [18]:
# 빈도수를 카운트에 대한 값을 문서 빈도에 대한 반전 값으로 변환한다. 
# TF-IDF (Term Frequency -- Inverse Document Frequency)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [19]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_train_tfidf.shape

(156060, 213510)

In [20]:
X_test_tfidf = tfidf_transformer.fit_transform(X_test)
X_test_tfidf.shape

(66292, 213510)

In [21]:
# 레이블의 불균형도 많다. 암환자 예측하기에서 모든 테스트데이터가 암환자가 아니기 때문에 쪼개서 맞추도록 한다.
# 이것을 자동으로 맞추도록 한다. 그래서 캐릭터와 단어별로 나누고 다시 이를 합쳐준다.

sentence_ids = train["SentenceId"]

print(sentence_ids.shape)

(156060,)


In [22]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

In [23]:
y_train.shape

(156060,)

## Score

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(156060, 213510)
(66292, 213510)


In [26]:
# Multimoda 나이브 베이즈 분류기로 학습시킨다.
model = MultinomialNB().fit(X_train_tfidf, y_train)

In [27]:
import sklearn
from sklearn.metrics import accuracy_score

# 테스트 셋에 대한 결과의 정확도를 예측한다.
y_predict = model.predict(X_train_tfidf)
sklearn.metrics.accuracy_score(y_train, y_predict)
# y_predict.shape

0.71625016019479693

In [28]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)
print("Score = {0:.5f}".format(score))

Score = 0.71625


* Score = 0.59018
* Score = 0.58511 nltk 토크나이저 지정
* Score = 0.58648 
* Score = 0.58386

In [29]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict
result["Difference(Phrase)"] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

(156060, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...,2,1
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...,2,0
3,1,A series,2,A series,2,0
4,1,A,2,A,2,0
5,1,series,2,series,2,0


In [30]:
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

(8529,)


SentenceId
1    0.095238
2    0.333333
3    0.171429
4    0.500000
5    0.300000
Name: Difference(Phrase), dtype: float64

## Train

In [31]:
model.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
predictions = model.predict(X_test_tfidf)

print(predictions.shape)
predictions[0:10]

(66292,)


array([3, 3, 2, 2, 2, 3, 3, 3, 3, 2])

## Submit

In [33]:
submission = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,2
156065,2


In [34]:
# 서브미션 파일 저장
submission.to_csv("data/baseline-script-nb.csv")

In [35]:
# 예측 점수(Score = 0.71625)는 기존 선형회귀보다 훨씬 높게 나오지만,  캐글스코어(0.56495)는 더 낮아졌다. 