In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
import pandas as pd
from preprocess import clean_by_freq
from preprocess import clean_by_len
from preprocess import clean_by_stopwords
from preprocess import stemming_by_porter
from preprocess import pos_tagger
from preprocess import words_lemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# 데이터 불러오기
df = pd.read_csv('imdb.tsv', delimiter = "\\t")

# 대소문자 통합
df['review'] = df['review'].str.lower()

# 문장 토큰화
df['sent_tokens'] = df['review'].apply(sent_tokenize)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  df = pd.read_csv('imdb.tsv', delimiter = "\\t")


In [2]:
print(df['sent_tokens'][0])

['"watching time chasers, it obvious that it was made by a bunch of friends.', 'maybe they were sitting around one day in film school and said, \\""hey, let\'s pool our money together and make a really bad movie!\\"" or something like that.', 'what ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc.', "all corners were cut, except the one that would have prevented this film's release.", 'life\'s like that."']


In [3]:
# 품사 태깅
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

In [4]:
print(df['pos_tagged_tokens'][0])

[('``', '``'), ('watching', 'JJ'), ('time', 'NN'), ('chasers', 'NNS'), (',', ','), ('it', 'PRP'), ('obvious', 'VBZ'), ('that', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('made', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('bunch', 'NN'), ('of', 'IN'), ('friends', 'NNS'), ('.', '.'), ('maybe', 'RB'), ('they', 'PRP'), ('were', 'VBD'), ('sitting', 'VBG'), ('around', 'IN'), ('one', 'CD'), ('day', 'NN'), ('in', 'IN'), ('film', 'NN'), ('school', 'NN'), ('and', 'CC'), ('said', 'VBD'), (',', ','), ('\\', 'FW'), ("''", "''"), ("''", "''"), ('hey', 'NN'), (',', ','), ('let', 'VB'), ("'s", 'POS'), ('pool', 'VB'), ('our', 'PRP$'), ('money', 'NN'), ('together', 'RB'), ('and', 'CC'), ('make', 'VB'), ('a', 'DT'), ('really', 'RB'), ('bad', 'JJ'), ('movie', 'NN'), ('!', '.'), ('\\', 'NN'), ("''", "''"), ("''", "''"), ('or', 'CC'), ('something', 'NN'), ('like', 'IN'), ('that', 'DT'), ('.', '.'), ('what', 'WP'), ('ever', 'RB'), ('they', 'PRP'), ('said', 'VBD'), (',', ','), ('they', 'PRP'), ('still', 'RB'), ('ended',

In [5]:
# 표제어 추출
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

In [6]:
print(df['lemmatized_tokens'][0])

['``', 'watching', 'time', 'chaser', ',', 'it', 'obvious', 'that', 'it', 'be', 'make', 'by', 'a', 'bunch', 'of', 'friend', '.', 'maybe', 'they', 'be', 'sit', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'say', ',', '\\', "''", "''", 'hey', ',', 'let', "'s", 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', '!', '\\', "''", "''", 'or', 'something', 'like', 'that', '.', 'what', 'ever', 'they', 'say', ',', 'they', 'still', 'end', 'up', 'make', 'a', 'really', 'bad', 'movie', '--', 'dull', 'story', ',', 'bad', 'script', ',', 'lame', 'acting', ',', 'poor', 'cinematography', ',', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', ',', 'etc', '.', 'all', 'corner', 'be', 'cut', ',', 'except', 'the', 'one', 'that', 'would', 'have', 'prevent', 'this', 'film', "'s", 'release', '.', 'life', "'s", 'like', 'that', '.', "''"]


In [7]:
stopwords_set = set(stopwords.words('english'))

df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

df[['cleaned_tokens']]

Unnamed: 0,cleaned_tokens
0,"[make, one, film, say, make, really, bad, movi..."
1,"[film, film]"
2,"[new, york, joan, barnard, elvire, audrey, bar..."
3,"[film, film, jump, send, n't, jump, radio, n't..."
4,"[site, movie, bad, even, movie, movie, make, m..."
5,"[ehle, northam, wonderful, wonderful, ehle, no..."
6,"[role, movie, n't, author, book, funny, author..."
7,"[plane, ceo, search, rescue, mission, call, ce..."
8,"[gritty, movie, movie, keep, sci-fi, good, kee..."
9,"[girl, girl]"


In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
import pandas as pd
from preprocess import clean_by_freq
from preprocess import clean_by_len
from preprocess import clean_by_stopwords
from preprocess import stemming_by_porter
from preprocess import pos_tagger
from preprocess import words_lemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def combine(sentence):
    return ' '.join(sentence)

# 데이터 불러오기
df = pd.read_csv('imdb.tsv', delimiter = "\\t")

# 대소문자 통합
df['review'] = df['review'].str.lower()

# 문장 토큰화
df['sent_tokens'] = df['review'].apply(sent_tokenize)

# 품사 태깅
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

# 표제어 추출
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

# 추가 전처리
stopwords_set = set(stopwords.words('english'))

df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# 전처리 완료된 토큰 합치기
df['combined_corpus'] = df['cleaned_tokens'].apply(combine)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  df = pd.read_csv('imdb.tsv', delimiter = "\\t")


In [9]:
df[['cleaned_tokens']]

Unnamed: 0,cleaned_tokens
0,"[make, one, film, say, make, really, bad, movi..."
1,"[film, film]"
2,"[new, york, joan, barnard, elvire, audrey, bar..."
3,"[film, film, jump, send, n't, jump, radio, n't..."
4,"[site, movie, bad, even, movie, movie, make, m..."
5,"[ehle, northam, wonderful, wonderful, ehle, no..."
6,"[role, movie, n't, author, book, funny, author..."
7,"[plane, ceo, search, rescue, mission, call, ce..."
8,"[gritty, movie, movie, keep, sci-fi, good, kee..."
9,"[girl, girl]"


In [10]:
def combine(sentence):
    return ' '.join(sentence)

In [11]:
df['combined_corpus'] = df['cleaned_tokens'].apply(combine)

df[['combined_corpus']]

Unnamed: 0,combined_corpus
0,make one film say make really bad movie like s...
1,film film
2,new york joan barnard elvire audrey barnard jo...
3,film film jump send n't jump radio n't send re...
4,site movie bad even movie movie make movie spe...
5,ehle northam wonderful wonderful ehle northam ...
6,role movie n't author book funny author author...
7,plane ceo search rescue mission call ceo harla...
8,gritty movie movie keep sci-fi good keep suspe...
9,girl girl
