In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Load the IMDB review dataset from a CSV file given by a relative path.
imdb_data = pd.read_csv('IMDB Dataset.csv')
# Print (rows, columns), then preview the first rows as the cell output.
print(imdb_data.shape)
imdb_data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data.describe() # summary of the dataset (count / unique / top / freq per column)

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [4]:
imdb_data['sentiment'].value_counts() # number of reviews per sentiment label

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
# Build the train / test datasets: the first 40,000 rows (in file order, no
# shuffling) are used for training and the remaining rows for testing.
# NOTE(review): these slices are taken before the cleaning cells below
# reassign imdb_data['review'], so they presumably still hold the raw review
# text -- confirm whether the split is redone after normalization.
train_reviews = imdb_data.review[:40000]
train_sentiments = imdb_data.sentiment[:40000]
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]

# Report the size of each split as a sanity check.
print('-----train dataset-----')
print(train_reviews.shape)
print(train_sentiments.shape)
print('-----test dataset-----')
print(test_reviews.shape)
print(test_sentiments.shape)

-----train dataset-----
(40000,)
(40000,)
-----test dataset-----
(10000,)
(10000,)


In [6]:
# Text normalization
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Tokenizer instance used by remove_stopwords in a later cell.
tokenizer = ToktokTokenizer()
# English stopword list from NLTK; loading it requires the NLTK 'stopwords'
# corpus to be available locally.
stopword_list= nltk.corpus.stopwords.words('english')

In [7]:
from bs4 import BeautifulSoup
import re

def strip_html(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend
    and only the text of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_square_brackets(text):
    """Remove every ``[...]`` span (brackets and their contents) from ``text``.

    A span runs from a ``[`` to the nearest following ``]``; a ``[`` that is
    never closed is left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each bracketed span deleted (nothing is inserted in its
        place, so surrounding spaces are kept as they were).
    """
    # Raw string: in a normal string literal '\[' is an invalid escape
    # sequence, which newer Python versions flag with a warning at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Clean one raw review: strip HTML first, then drop bracketed spans."""
    return remove_square_brackets(strip_html(text))

In [8]:
# Apply the HTML / bracket cleanup to every review, overwriting the column,
# then display the cleaned column as the cell output.
imdb_data['review'] = imdb_data['review'].apply(denoise_text)
imdb_data['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [9]:
# Remove special characters
def remove_special_characters(text, remove_digits=True):
    """Strip everything except ASCII letters, digits and whitespace.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility but not used;
            digits are always kept, exactly as before.

    Returns:
        ``text`` with every other character deleted (nothing is inserted in
        its place).
    """
    # 'A-Z', not 'A-z': the range A-z also spans the ASCII characters that sit
    # between 'Z' and 'a' ([ \ ] ^ _ `), so those were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Strip special characters from every review, overwriting the column.
imdb_data['review'] = imdb_data['review'].apply(remove_special_characters)

In [10]:
# Stemming

from nltk.stem.porter import PorterStemmer

def simple_stemmer(text):
    """Replace each whitespace-separated word of ``text`` with its Porter stem.

    The stems are joined by single spaces, so any run of whitespace in the
    input collapses to one space in the result.
    """
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, text.split()))

# Stem every review, overwriting the column.
imdb_data['review'] = imdb_data['review'].apply(simple_stemmer)

In [11]:
# Remove English stopwords

# Set of NLTK English stopwords, printed for inspection.
# NOTE(review): remove_stopwords below filters against stopword_list, not
# this set.
stop=set(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
    """Drop stopwords from ``text`` and return the remaining tokens re-joined.

    Args:
        text: The string to filter; it is split with the module-level
            ``tokenizer``.
        is_lower_case: When True, tokens are compared to the stopword list as
            they are; when False (default), each token is lower-cased for the
            comparison only, so surviving tokens keep their original casing.

    Returns:
        The kept tokens (each stripped of surrounding whitespace) joined by
        single spaces.
    """
    # Build the lookup set once per call instead of scanning the whole list
    # for every token; membership results are identical.
    blocked = set(stopword_list)
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in blocked:
            kept.append(token)
    return ' '.join(kept)

# Filter stopwords out of every review, overwriting the column.
# NOTE(review): this runs after special-character removal and stemming, so
# some tokens may no longer match the stopword spellings (e.g. apostrophes
# have already been stripped) -- confirm the intended order of these steps.
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

{'ma', 'only', 'am', 'such', 'your', 'over', 'itself', 'now', 'after', 'about', 'where', "hadn't", 'yourselves', 'm', 'in', 'having', 'under', 'the', 't', "won't", 'below', 'll', 'our', 'this', 'up', 'what', 'doing', 'when', 'couldn', 'myself', 'be', 'so', 'should', 'he', 'himself', "it's", 'their', 'will', 'whom', 'ourselves', 'or', 'further', 'to', 'we', 'a', 'no', 'didn', 'ours', 'with', 'off', 'herself', 'against', 'ain', 'aren', 'needn', 'nor', 'not', "hasn't", "she's", 'o', 'have', "shouldn't", 'hers', 'don', 'had', 'same', 'just', 'any', 'again', 'out', "that'll", 'did', 'here', 'if', 'they', 'some', 'shan', 'doesn', 'can', 'yours', 'haven', "weren't", 'was', 'more', 'for', 'his', 'are', 'until', 'does', 'very', 'has', 'too', "wasn't", 'and', "mightn't", 'shouldn', 'then', 'because', 'both', 'hasn', 'other', 'she', 'been', 'that', 'by', 'isn', 'all', "isn't", 'hadn', 'wouldn', 'it', "you'd", "didn't", "you've", 'before', 'is', 's', 'mustn', "shan't", "needn't", 'mightn', 'above'