In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
import pandas as pd
from preprocess import clean_by_freq
from preprocess import clean_by_len
from preprocess import clean_by_stopwords
from preprocess import stemming_by_porter
from preprocess import pos_tagger
from preprocess import words_lemmatizer
# Download the NLTK data packages requested by this notebook: the stopword
# corpus, the punkt tokenizer data and the averaged-perceptron tagger.
# The final call is the cell's last expression, so its return value is what
# the notebook displays for this cell.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:

def idx_encoder(tokens, word_to_idx):
    """Translate a sequence of tokens into their integer indices.

    Args:
        tokens: Iterable of tokens to encode.
        word_to_idx: Mapping from token to its integer index.

    Returns:
        list: One index per token, in the same order as ``tokens``.

    Raises:
        KeyError: If a token has no entry in ``word_to_idx`` (plain dict lookup).
    """
    return [word_to_idx[token] for token in tokens]

# Load the data.
# The separator is a literal tab character. The previous value was the
# two-character string backslash + "t", which pandas interprets as a regular
# expression: that forces the slower python parsing engine and emits a
# ParserWarning on every run.
df = pd.read_csv('imdb.tsv', sep="\t")

# Normalize case (lowercase every review)
df['review'] = df['review'].str.lower()

# Sentence tokenization
df['sent_tokens'] = df['review'].apply(sent_tokenize)

# POS tagging (project helper from preprocess.py)
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

# Lemmatization (project helper from preprocess.py)
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

# Additional preprocessing
stopwords_set = set(stopwords.words('english'))

# NOTE(review): 1 and 2 are passed straight to the project helpers as
# thresholds; presumably a frequency cut-off and a length cut-off -- confirm
# the exact comparison (<= vs <) in preprocess.py.
df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# Integer-encode a single row: initial state for the walkthrough cells that
# follow (those cells re-initialise these names before using them).
word_to_idx = {}
i = 0
encoded_idx = []

  df = pd.read_csv('imdb.tsv', delimiter = "\\t")


In [4]:
# Pick the cleaned tokens of the review stored under index label 4.
tokens = df.loc[4, 'cleaned_tokens']

# Count the tokens and list them as (word, count) pairs, most frequent first.
vocab = Counter(tokens).most_common()

print(vocab)

[('movie', 10), ('jim', 7), ('stand-up', 3), ('day', 3), ('really', 3), ('terrible', 3), ('site', 2), ('bad', 2), ('even', 2), ('make', 2), ('special', 2), ('describe', 2), ('like', 2), ('actor', 2), ('love', 2), ('stand', 2), ('comedian', 2)]


In [5]:
# Number the words in vocab order, starting at 1, so the first (most
# frequent) entry receives the smallest index.
word_to_idx = {}
i = 0  # stays 0 when vocab is empty
for i, (word, frequency) in enumerate(vocab, start=1):
    word_to_idx[word] = i

print(word_to_idx)

{'movie': 1, 'jim': 2, 'stand-up': 3, 'day': 4, 'really': 5, 'terrible': 6, 'site': 7, 'bad': 8, 'even': 9, 'make': 10, 'special': 11, 'describe': 12, 'like': 13, 'actor': 14, 'love': 15, 'stand': 16, 'comedian': 17}


In [6]:
# Replace every token of the selected review with its vocabulary index,
# keeping the original token order.
encoded_idx = [word_to_idx[token] for token in tokens]

print(encoded_idx)

[7, 1, 8, 9, 1, 1, 10, 1, 11, 12, 1, 1, 12, 1, 2, 10, 3, 4, 3, 2, 13, 2, 14, 15, 16, 4, 17, 11, 2, 4, 9, 7, 15, 2, 3, 2, 14, 1, 16, 17, 2, 13, 5, 6, 5, 6, 1, 6, 5, 8, 1]


In [7]:
# Flatten the per-review token lists into one corpus-wide list.
# A nested comprehension visits each token once; sum(lists, []) creates a
# brand-new list for every addition, which is quadratic in the total number
# of tokens.
tokens = [token for row_tokens in df['cleaned_tokens'] for token in row_tokens]

print(tokens)

['make', 'one', 'film', 'say', 'make', 'really', 'bad', 'movie', 'like', 'say', 'make', 'really', 'bad', 'movie', 'bad', 'one', 'film', 'like', 'film', 'film', 'new', 'york', 'joan', 'barnard', 'elvire', 'audrey', 'barnard', 'john', 'saxon', 'italy', 'etruscan', 'tomb', 'joan', 'italy', 'colleague', 'italy', 'maggot', 'maggot', 'joan', 'drug', 'drug', 'tomb', 'colleague', 'story', 'end', 'new', 'york', 'joan', 'colleague', 'romantic', 'end', 'waste', 'time', 'watch', 'story', 'romantic', 'end', 'elvire', 'audrey', 'john', 'saxon', 'maggot', 'watch', 'etrusco', 'watch', 'waste', 'time', 'etrusco', 'etruscan', 'film', 'film', 'jump', 'send', "n't", 'jump', 'radio', "n't", 'send', 'reporter', 'fear', 'jump', 'fear', 'radio', 'reporter', "n't", 'radio', "n't", "n't", 'site', 'movie', 'bad', 'even', 'movie', 'movie', 'make', 'movie', 'special', 'describe', 'movie', 'movie', 'describe', 'movie', 'jim', 'make', 'stand-up', 'day', 'stand-up', 'jim', 'like', 'jim', 'actor', 'love', 'stand', 'da

In [8]:
# Build the corpus-wide vocabulary index.
word_to_idx = {}
i = 0  # stays 0 when the corpus has no tokens

# Flatten all reviews into a single token list. The comprehension is linear
# in the number of tokens, unlike sum(lists, []) which re-copies the growing
# list on every addition.
tokens = [token for row_tokens in df['cleaned_tokens'] for token in row_tokens]

# (word, count) pairs ordered from most to least frequent.
vocab = Counter(tokens).most_common()

# Indices start at 1 and follow the frequency ranking.
for i, (word, frequency) in enumerate(vocab, start=1):
    word_to_idx[word] = i

In [9]:
# Show the complete word -> index mapping built in the previous cell.
print(word_to_idx)

{'movie': 1, 'film': 2, "n't": 3, 'scene': 4, 'bad': 5, 'time': 6, 'reason': 7, 'make': 8, 'jim': 9, 'good': 10, 'one': 11, 'like': 12, 'could': 13, "'re": 14, 'quastel': 15, 'really': 16, 'even': 17, 'monster': 18, 'joan': 19, 'love': 20, 'author': 21, 'try': 22, 'dialogue': 23, 'idea': 24, 'italy': 25, 'colleague': 26, 'maggot': 27, 'end': 28, 'watch': 29, 'jump': 30, 'radio': 31, 'stand-up': 32, 'day': 33, 'terrible': 34, 'ehle': 35, 'northam': 36, 'search': 37, 'rescue': 38, 'call': 39, 'knowles': 40, 'henriksen': 41, 'easily': 42, 'see': 43, 'appear': 44, 'get': 45, 'character': 46, 'think': 47, 'use': 48, 'whether': 49, 'need': 50, 'though': 51, 'sci-fi': 52, 'look': 53, 'say': 54, 'new': 55, 'york': 56, 'barnard': 57, 'elvire': 58, 'audrey': 59, 'john': 60, 'saxon': 61, 'etruscan': 62, 'tomb': 63, 'drug': 64, 'story': 65, 'romantic': 66, 'waste': 67, 'etrusco': 68, 'send': 69, 'reporter': 70, 'fear': 71, 'site': 72, 'special': 73, 'describe': 74, 'actor': 75, 'stand': 76, 'comed

In [10]:
# NOTE: this repeats the idx_encoder definition from an earlier cell; the
# behaviour is the same, so re-running either cell leaves an equivalent function.
def idx_encoder(tokens, word_to_idx):
    """Encode tokens as integers by looking each one up in ``word_to_idx``.

    Args:
        tokens: Iterable of tokens to encode.
        word_to_idx: Mapping from token to its integer index.

    Returns:
        list: The looked-up indices, in the order the tokens were given.

    Raises:
        KeyError: If a token has no entry in ``word_to_idx`` (plain dict lookup).
    """
    return [word_to_idx[token] for token in tokens]

# Encode every review's cleaned tokens with the shared vocabulary; the mapping
# is forwarded to idx_encoder as an extra positional argument via `args`.
df['integer_encoded'] = df['cleaned_tokens'].apply(idx_encoder, args=(word_to_idx,))
df[['integer_encoded']]

Unnamed: 0,integer_encoded
0,"[8, 11, 2, 54, 8, 16, 5, 1, 12, 54, 8, 16, 5, ..."
1,"[2, 2]"
2,"[55, 56, 19, 57, 58, 59, 57, 60, 61, 25, 62, 6..."
3,"[2, 2, 30, 69, 3, 30, 31, 3, 69, 70, 71, 30, 7..."
4,"[72, 1, 5, 17, 1, 1, 8, 1, 73, 74, 1, 1, 74, 1..."
5,"[35, 36, 78, 78, 35, 36, 79, 79, 35, 36]"
6,"[80, 1, 3, 21, 81, 82, 21, 21, 80, 3, 82, 83, ..."
7,"[85, 86, 37, 38, 87, 39, 86, 88, 40, 89, 41, 9..."
8,"[120, 1, 1, 121, 52, 10, 121, 122, 53, 1, 52, ..."
9,"[123, 123]"
