In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from collections import Counter
import pandas as pd
from preprocess import clean_by_freq
from preprocess import clean_by_len
from preprocess import clean_by_stopwords
from preprocess import stemming_by_porter
from preprocess import penn_to_wn
from preprocess import pos_tagger
from preprocess import words_lemmatizer
# Fetch every NLTK resource used by the cells below: WordNet, SentiWordNet,
# the stopword list, the punkt tokenizer data and the perceptron POS tagger.
for resource in ('wordnet', 'sentiwordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Load the data (tab-separated file)
df = pd.read_csv('imdb.tsv', delimiter="\t")

# Unify upper/lower case
df['review'] = df['review'].str.lower()

# Each stage reads one column and writes the next:
# sentence tokenization -> POS tagging -> lemmatization
pipeline_stages = (
    ('review', 'sent_tokens', sent_tokenize),
    ('sent_tokens', 'pos_tagged_tokens', pos_tagger),
    ('pos_tagged_tokens', 'lemmatized_tokens', words_lemmatizer),
)
for source_col, target_col, stage_fn in pipeline_stages:
    df[target_col] = df[source_col].apply(stage_fn)

# Additional preprocessing, applied in order on top of the lemmatized tokens
stopwords_set = set(stopwords.words('english'))

cleaning_steps = (
    lambda tokens: clean_by_freq(tokens, 1),
    lambda tokens: clean_by_len(tokens, 2),
    lambda tokens: clean_by_stopwords(tokens, stopwords_set),
)
df['cleaned_tokens'] = df['lemmatized_tokens']
for cleaning_step in cleaning_steps:
    df['cleaned_tokens'] = df['cleaned_tokens'].apply(cleaning_step)

# SentiWordNet sentiment analysis for the review stored under index label 0.
# senti_score accumulates (positive score - negative score) over every word
# that maps to a WordNet part of speech and has at least one synset.
pos_tagged_words = df['pos_tagged_tokens'][0]
senti_score = 0

for word, tag in pos_tagged_words:
    # Convert the Penn Treebank tag to a WordNet POS; skip any other tag
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        continue

    # Look the synsets up once (the lookup was previously repeated for the
    # emptiness check and the assignment); skip words WordNet does not know
    synsets = wn.synsets(word, wn_tag)
    if not synsets:
        continue

    # Take the first synset returned and fetch its SentiWordNet entry
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    # Word polarity = positive score - negative score
    word_senti_score = swn_synset.pos_score() - swn_synset.neg_score()
    senti_score += word_senti_score

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/songchangseokk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
# Display only the POS-tagged token column, kept as a one-column DataFrame
df.loc[:, ['pos_tagged_tokens']]

Unnamed: 0,pos_tagged_tokens
0,"[(watching, VBG), (time, NN), (chasers, NNS), ..."
1,"[(i, NN), (saw, VBD), (this, DT), (film, NN), ..."
2,"[(minor, JJ), (spoilers, NNS), (in, IN), (new,..."
3,"[(i, JJ), (went, VBD), (to, TO), (see, VB), (t..."
4,"[(yes, UH), (,, ,), (i, JJ), (agree, VBP), (wi..."
5,"[(jennifer, NN), (ehle, NN), (was, VBD), (spar..."
6,"[(amy, JJ), (poehler, NN), (is, VBZ), (a, DT),..."
7,"[(a, DT), (plane, NN), (carrying, VBG), (emplo..."
8,"[(a, DT), (well, NN), (made, VBN), (,, ,), (gr..."
9,"[(incredibly, RB), (dumb, JJ), (and, CC), (utt..."


In [7]:
# Score the review stored under index label 0 with SentiWordNet:
# senti_score is the sum of (positive score - negative score) per scored word.
pos_tagged_words = df['pos_tagged_tokens'][0]
senti_score = 0

for word, tag in pos_tagged_words:
    # Convert the Penn Treebank POS tag to the WordNet POS tag
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        continue

    # Check the synsets with a single lookup; skip words that are not in
    # the lexical database
    synsets = wn.synsets(word, wn_tag)
    if not synsets:
        continue

    # Get the SentiSynset of the first synset returned
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    # Compute the sentiment score of the word and add it to the total
    word_senti_score = swn_synset.pos_score() - swn_synset.neg_score()
    senti_score += word_senti_score

In [8]:
# Print the accumulated sentiment score computed by the loop above
print(f"{senti_score}")