In [1]:
import pandas as pd
import numpy as np

# Bag of Words

#### 블로그 : bag of words에 대한 설명 넣기

In [2]:
# Load the labeled training reviews and the unlabeled test reviews.
#   header=0       -> the first line of each file holds the column names
#   delimiter="\t" -> fields are tab-separated (the files are TSV, not comma-separated)
#   quoting=3      -> csv.QUOTE_NONE: double quotes are treated as ordinary characters
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)

In [3]:
# (rows, columns) of the training frame.
train.shape

(25000, 3)

In [4]:
# (rows, columns) of the test frame.
test.shape

(25000, 2)

In [5]:
# Column names of the training frame as a NumPy array.
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [6]:
# Column names of the test frame (as a pandas Index).
test.columns

Index(['id', 'review'], dtype='object')

In [7]:
# Summary statistics of the numeric columns of the training frame.
train.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [8]:
# Number of reviews per sentiment label, shown as a one-column table.
train.sentiment.value_counts().to_frame()

Unnamed: 0,sentiment
1,12500
0,12500


In [9]:
# Equivalent to train["review"][0], truncated below.
train.review[0][:700]
# Show only the first 700 characters of the first review.

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik'

여기에 html태그들이 있어서 이것들을 없애주기 위해 beautifulsoup을 이용한다

### 데이터 정제 순서

1. beautifulsoup을 이용해서 html 태그 제거
2. 알파벳 이외의 문자를 공백으로 치환
3. nltk 데이터를 이용해서 stopword(불용어)를 제거
4. 어간추출

#### 블로그 :nltk 데이터가 무엇인지, stopword 즉 불용어가 무엇인지

In [10]:
# beautifulsoap 설치
!pip install BeautifulSoup4



In [11]:
from bs4 import BeautifulSoup

# Parse the first movie review so that its HTML markup (e.g. <br />) can be stripped.
# The parser is named explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a GuessedAtParserWarning, so results could
# differ between machines. "html.parser" ships with Python itself.
example1 = BeautifulSoup(train['review'][0], "html.parser")
print("<html 태그 제거 전>")
print(train['review'][0][:700])
print("<html 태그 제거 후")
# get_text() yields the review text with the tags removed; preview 700 characters.
example1.get_text()[:700]

<html 태그 제거 전>
"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik
<html 태그 제거 후


'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

get_text()는 태그를 제외한 리뷰의 텍스트를 돌려준다. BeautifulSoup을 이용해 보면 이것이 강력한 라이브러리라는 것을 알 수 있다. 하지만 정규표현식으로 마크업(markup)을 없애는 데는 신중함이 필요하다.

데이터를 정제할 때는 구두점(punctuation)을 없애는 것이 좋다. 
다만 sentiment를 분석할 때는 "!!!"나 ":-("와 같은 것들이 포함될 가능성이 있다. 이것들은 sentiment(감정)를 나타내므로 단어처럼 처리되어야 하지만, 이 튜토리얼에서는 단순함을 위해서 이런 것들을 모두 제거한다. 
마찬가지로 이 튜토리얼에서는 숫자도 제거하지만, 숫자를 의미 있게 처리하는 방법도 있다. 예를 들면, 숫자를 단어로 취급하거나 "NUM"과 같은 단어로 치환할 수도 있다.

구두점과 숫자를 제거하기 위해 우리는 정규표현식을 처리하는 패키지인 `re`를 이용해서 특수문자를 제거할 것이다. 이 패키지는 따로 설치할 필요는 없다. 

In [12]:
import re

# Replace every character that is not an ASCII letter (a-z, A-Z) with a space,
# which removes punctuation and digits from the tag-free review text.
letters_only=re.sub('[^a-zA-Z]', ' ', example1.get_text())
# Preview the first 700 characters of the result.
letters_only[:700]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

example1.get_text()로 얻은 텍스트에서 a-zA-Z가 아닌(^)문자들은 공백으로 치환한다.

리뷰를 소문자로 변환한 후에 개별 단어로 나눈다(토큰화, tokenization).

In [13]:
# Lower-case the cleaned text so that e.g. "With" and "with" become the same token.
lower_case=letters_only.lower()
# split() with no argument breaks on runs of whitespace, giving one token per word.
words=lower_case.split()
# The printed label reads "number of tokenized words".
print("토큰화 한 단어의 개수", len(words))
# Show the first ten tokens.
words[:10]

토큰화 한 단어의 개수 437


['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

#### 블로그 : 불용어 설명, 

stopwords라 불리는 불용어들을 어떻게 처리해야 하는지 알아보자. "a", "and", "is", "the"처럼 빈번하게 나타나지만 실제로는 큰 의미를 나타내지 않는 단어들을 불용어라고 한다.

편리하게도 파이썬 패키지에 불용어 목록이 미리 정의되어 있으므로, 파이썬 Natural Language Toolkit(NLTK)을 import해서 사용한다.

In [15]:
import nltk

# nltk.download() with no argument launches the interactive downloader, which
# waits for user input and stalls an unattended "Restart & Run All".
# Request the stop-word corpus by name instead; if it is already installed the
# call just reports that the package is up to date.
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [16]:
# Fetch the English stop-word lists used by the filtering cell further down.
nltk.download('stopwords')
# NOTE(review): 'wordnet' is not used in the cells visible here; presumably it is
# needed by a later lemmatization step - confirm.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JANG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JANG\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

영어에서 불용어가 무엇인지 list로 보여준다. 

In [17]:
import nltk
from nltk.corpus import stopwords
# Peek at the first ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

stopwords(불용어)를 제거한 토큰들

In [18]:
# Build the stop-word lookup once. The previous form called
# stopwords.words('english') again for every token and searched the returned
# list linearly; a set gives constant-time membership with the same result.
english_stopwords = set(stopwords.words('english'))
words = [w for w in words if w not in english_stopwords]
# Number of tokens left after the stop words are dropped.
print(len(words))
# Show the first ten remaining tokens.
words[:10]

219


['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']