## 지도학습 기반 감성분석

- 감성분석은 문서 내 텍스트가 나타내는 여러 가지 주관적인 단어와 문맥을 기반으로 감성 수치를 계산하는 방법을 이용
- 감성 지수는 긍정 감성 지수와 부정 감성 지수로 구성되며 이들 지수를 합산해 긍정 또는 부정 감성을 결정
- 지도 학습은 학습 데이터와 타깃 레이블 값을 기반으로 감성 분석 학습을 수행한 뒤 이를 기반으로 다른 데이터의 감성 분석을 예측하는 방법
- 비지도 학습은 'Lexicon'이라는 일종의 감성 어휘 사전을 이용. Lexicon의 감성 분석을 위한 용어와 문맥에 대한 다양한 정보를 이용해 문서의 긍정적 부정적 감성 여부를 판단

In [1]:
# 지도학습 기반 - IMDB 영화평
# https://www.kaggle.com/c/word2vec-nlp-tutorial/data

import pandas as pd
review_df = pd.read_csv('./dataset/labeledTrainData.tsv', header = 0, sep='\t', quoting=3)
print(review_df.head(3))
review_df.shape
review_df.columns.values

         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...


array(['id', 'sentiment', 'review'], dtype=object)

In [2]:
print(review_df.review[3])

"It must be assumed that those who praised this film (\"the greatest filmed opera ever,\" didn't I read somewhere?) either don't care for opera, don't care for Wagner, or don't care about anything except their desire to appear Cultured. Either as a representation of Wagner's swan-song, or as a movie, this strikes me as an unmitigated disaster, with a leaden reading of the score matched to a tricksy, lugubrious realisation of the text.<br /><br />It's questionable that people with ideas as to what an opera (or, for that matter, a play, especially one by Shakespeare) is \"about\" should be allowed anywhere near a theatre or film studio; Syberberg, very fashionably, but without the smallest justification from Wagner's text, decided that Parsifal is \"about\" bisexual integration, so that the title character, in the latter stages, transmutes into a kind of beatnik babe, though one who continues to sing high tenor -- few if any of the actors in the film are the singers, and we get a double 

In [3]:
# re.sub 사용법
import re
# 사과 혹은 오렌지를 과일로 대체
re.sub('apple|orange', 'fruit', 'apple box orange tree')

'fruit box fruit tree'

In [4]:
# df/series 에서 str 적용 문자열 연산 수행
import re
# <br> html 태그는 replace 함수로 공백으로 변환
review_df.review = review_df.review.str.replace('<br />', ' ')
# 파이썬의 정규 표현식 모듈 re를 이용하여 영어 문자열이 아닌 문자는 모두 공백으로 변환
# 알파벳이 아닌 것 모두 공백으로 변환
review_df.review = review_df.review.apply(lambda x : re.sub('[^a-zA-Z]', ' ', x))
review_df.review[3]

' It must be assumed that those who praised this film    the greatest filmed opera ever    didn t I read somewhere   either don t care for opera  don t care for Wagner  or don t care about anything except their desire to appear Cultured  Either as a representation of Wagner s swan song  or as a movie  this strikes me as an unmitigated disaster  with a leaden reading of the score matched to a tricksy  lugubrious realisation of the text   It s questionable that people with ideas as to what an opera  or  for that matter  a play  especially one by Shakespeare  is   about   should be allowed anywhere near a theatre or film studio  Syberberg  very fashionably  but without the smallest justification from Wagner s text  decided that Parsifal is   about   bisexual integration  so that the title character  in the latter stages  transmutes into a kind of beatnik babe  though one who continues to sing high tenor    few if any of the actors in the film are the singers  and we get a double dose of A

In [5]:
from sklearn.model_selection import train_test_split
class_df = review_df.sentiment
feature_df = review_df.drop(['id', 'sentiment'], axis = 1, inplace = False)
x_train, x_test, y_train, y_test = train_test_split(
    feature_df
    , class_df
    , test_size=0.3
    , random_state=156
)

x_train.shape, x_test.shape

((17500, 1), (7500, 1))

In [6]:
from sklearn.metrics import accuracy_score, precision_score , recall_score , confusion_matrix, f1_score, roc_auc_score

def get_clf_eval(y_test , pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
  
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}'.format(accuracy, precision, recall, f1))

In [7]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# 스톱 워드는 English, filtering, ngram은 (1,2)로 설정해 CountVectorization수행. 
# LogisticRegression의 C는 10으로 설정.
# 파이프라인을 이용하여 한번에 수행(명령 한꺼번에)
pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression(C=10))])
# Pipeline 객체를 이용하여 fit(), predict()로 학습/예측 수행. predict_proba()는 
# roc_auc때문에 수행.

# 학습
pipeline.fit(x_train.review, y_train)

# 예측
pred = pipeline.predict(x_test.review)

# ROC-AUC 때문에 predict_proba 필요
pred_probs = pipeline.predict_proba(x_test.review)[:,1]

print('예측 정확도 : {0:.4f}, ROC-AUC : {1:.4f}'.format(accuracy_score(y_test,pred), roc_auc_score(y_test,pred_probs)))

예측 정확도 : 0.8865, ROC-AUC : 0.9508


In [10]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression(C=10))])
# Pipeline 객체를 이용하여 fit(), predict()로 학습/예측 수행. predict_proba()는 
# roc_auc때문에 수행.

# 학습
pipeline.fit(x_train.review, y_train)

# 예측
y_pred = pipeline.predict(x_test.review)

# ROC-AUC 때문에 predict_proba 필요
pred_probs = pipeline.predict_proba(x_test.review)[:,1]

get_clf_eval(y_test, y_pred)
print()
print('ROC-AUC: ', roc_auc_score(y_test, pred_probs))

오차 행렬
[[3257  423]
 [ 376 3444]]
정확도: 0.8935, 정밀도: 0.8906, 재현율: 0.9016,    F1: 0.8961

ROC-AUC:  0.9597786962212611


## 비지도학습 기반 감성 분석

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\a

[nltk_data]    |   Package timit is already up-to-date!
[nltk_data]    | Downloading package toolbox to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package toolbox is already up-to-date!
[nltk_data]    | Downloading package treebank to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package treebank is already up-to-date!
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     C:\U

True

In [3]:
# 'present' 라는 단어로 wordnet의 sysnset 생성
# synsets 호출 시 Synset 객체를 가지는 list를 반환
# POS(Part of speech) 태그는 의미, 품사, 인덱스로 구성

from nltk.corpus import wordnet as wn
term = 'present'
synsets = wn.synsets(term)
print('sysnsets() 반환 type: ', type(synsets))
print('sysnsets() 반환 값 개수: ', len(synsets))
print('sysnsets() 반환 값: ', synsets)
# 단어 하나의 경우의 수 파악하기

sysnsets() 반환 type:  <class 'list'>
sysnsets() 반환 값 개수:  18
sysnsets() 반환 값:  [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [4]:
# synsets 객체가 가지는 속성
for synset in synsets:
    print('##### Synset name: ', synset.name(), ' #####')
    print('POS :', synset.lexname())
    print('Definition :', synset.definition())
    print('Lemmas: ', synset.lemma_names())

##### Synset name:  present.n.01  #####
POS : noun.time
Definition : the period of time that is happening now; any continuous stretch of time including the moment of speech
Lemmas:  ['present', 'nowadays']
##### Synset name:  present.n.02  #####
POS : noun.possession
Definition : something presented as a gift
Lemmas:  ['present']
##### Synset name:  present.n.03  #####
POS : noun.communication
Definition : a verb tense that expresses actions or states at the time of speaking
Lemmas:  ['present', 'present_tense']
##### Synset name:  show.v.01  #####
POS : verb.perception
Definition : give an exhibition of to an interested audience
Lemmas:  ['show', 'demo', 'exhibit', 'present', 'demonstrate']
##### Synset name:  present.v.02  #####
POS : verb.communication
Definition : bring forward and present to the mind
Lemmas:  ['present', 'represent', 'lay_out']
##### Synset name:  stage.v.01  #####
POS : verb.creation
Definition : perform (a play), especially on a stage
Lemmas:  ['stage', 'present

In [5]:
import pandas as pd

# WordNet은 어떤 어휘와 다른 어휘 간의 관계를 유사도로 나타냄
# 유사도를 나타내기 위해 path_similarity() 메소드 제공
# synset 객체를 단어별로 생성

tree = wn.synset('tree.n.01')
lion = wn.synset('lion.n.01')
tiger = wn.synset('tiger.n.02')
cat = wn.synset('cat.n.01')
dog = wn.synset('dog.n.01')

entities = [tree , lion , tiger , cat , dog]
similarities = []
entity_names = [ entity.name().split('.')[0] for entity in entities ]

# 단어별 synset 들을 iteration 하면서 다른 단어들의 synset과 유사도를 측정 
for entity in entities:
    similarity = [ round(entity.path_similarity(compared_entity), 2)  for compared_entity in entities ]
    similarities.append(similarity)
    
# 개별 단어별 synset과 다른 단어의 synset과의 유사도를 DataFrame형태로 저장합니다.  
similarity_df = pd.DataFrame(similarities , columns=entity_names,index=entity_names)
similarity_df
# lion은 tree와의 유사도가 가장 적고 tiger와는 유사도가 가장 큼

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


### Q. tree, dog, cat 과 유사도가 높은 단어를 구하시오.

In [None]:
tree = wn.synset('tree.n.01')
cat = wn.synset('cat.n.01')
dog = wn.synset('dog.n.01')

all = wn.synset('*.n.01')

entities = []
entities.append(all)

similarities = []
entity_names = [ entity.name().split('.')[0] for entity in entities ]

for entity in entities:
    similarity = [ round(entity.path_similarity(compared_entity), 2)  for compared_entity in entities ]
    similarities.append(similarity)
    
# 개별 단어별 synset과 다른 단어의 synset과의 유사도를 DataFrame형태로 저장합니다.  
similarity_df = pd.DataFrame(similarities , columns=entity_names,index=entity_names)
similarity_df

"""
def similarity(word):
    tree = wn.synset('tree.n.01')
    cat = wn.synset('cat.n.01')
    dog = wn.synset('dog.n.01')
    
    word = [tree, cat, dog]
    
    if word.path_similarity([compared_word for compared_word in input_words]) >= 0.8:
        return print(compared_word)

    
any_word = []

i = 0
for i<10:
    random = input("enter word(10 words):")
    any_word.append(random)

similarity(any_word)
"""

## VADER lexicon을 이용한 Sentiment Analysis
- SenitimentIntensityAnalyzer 클래스를 이용하여 쉽게 갑성 분석 제공

In [6]:
import  nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
import pandas as pd
review_df = pd.read_csv('./dataset/labeledTrainData.tsv', header = 0, sep = '\t', quoting = 3)
print(review_df.head(3))
review_df.shape
review_df.columns.values

         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...


array(['id', 'sentiment', 'review'], dtype=object)

In [21]:
print(review_df.review[0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [22]:
# df/series에서 str 적용 문자열 연산 수행 
import re
# <br> html 태그는 replace 함수로 공백으로 변환
review_df.review = review_df.review.str.replace('<br />',' ')
# 파이썬의 정규 표현식 모듈인 re를 이용하여 영어 문자열이 아닌 문자는 
# 모두 공백으로 변환
review_df.review = review_df.review.apply(lambda x : re.sub('[^a-zA-Z]', ' ',x))
# review_df.review[0]

In [44]:
print(review_df.review[0])

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [24]:
# NLTK 서브모듈로 SentimentIntensityAnalyzer 임포트, IMDB 감상평 감성 분석
# neg는 부정, neu는 중립, pos는 긍정, compound는 조합한 감성지수
# compound score는 -1 ~1 사이의 감성지수를 표현, 0.1 이상이면 긍정 감성
# 그 이하는 부정 감성으로 판단하나 사오하엥 따라 임계값을 조정해 예측 성능 조절
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_anlayzer = SentimentIntensityAnalyzer()
senti_scores = senti_anlayzer.polarity_scores(review_df.review[1])
print(senti_scores)

{'neg': 0.082, 'neu': 0.691, 'pos': 0.227, 'compound': 0.9783}


In [25]:
review_df.review[1]

'   The Classic War of the Worlds   by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H  G  Wells  classic book  Mr  Hines succeeds in doing so  I  and those who watched his film with me  appreciated the fact that it was not the standard  predictable Hollywood fare that comes out every year  e g  the Spielberg version with Tom Cruise that had only the slightest resemblance to the book  Obviously  everyone looks for different things in a movie  Those who envision themselves as amateur   critics   look only to criticize everything they can  Others rate a movie on more important bases like being entertained  which is why most people never agree with the   critics    We enjoyed the effort Mr  Hines put into being faithful to H G  Wells  classic novel  and we found it to be very entertaining  This made it easy to overlook what the   critics   perceive to be its shortcomings  '

### Q. https://www.imdb.com/chart/top/ 에서 ranking top 3와 ranking 248-250 영화에대한 user review에 대하여 비지도 학습으로 compound를 구하고 '긍정', '부정' 감성여부

In [43]:
top3 = '''"The Godfather: Part II" is a very suspenseful drama with a very exciting story, with great acting and great special effects. I would definitely recommend you watch this movie...but first watch the original classic from 1972 "The Godfather" . The movie may not be as good as the first movie but is still an amazing sequel.'''

top248 = '''If you like James Cagney and you like the film noirs of the late 1940s, well, it doesn't get much better than this.

Cagney, who was always great at playing wild gangsters, makes this film interesting all the way through its two hours. Despite being a half-century old, he was still not far from being at the top of his game. His character, Cody Jarrett, is one of the most famous of the many he portrayed on film, which is saying a lot.

Who could sit on his mother's lap and still look like a tough guy? Not many, but Cagney pulled it off here with his tough mama, played really well by Margaret Wycherly. This was a new type of role for Wycherly, who was used to doing Shakespeare. You wouldn't know it from this "Ma Jarrett" role!

The "hoods" in here are all realistic tough guys and gals. Cagney's two-faced wife is played well by Virginia Mayo, who plays the typical (for this genre) floozy blonde whom you can trust about as far as you can throw.

The final scene - "Top Of World, Ma!" - is one of the most famous in all of film history. It's nice to see a nice print of this out on DVD now and some of the features are very informative. Included is an interview with Mayo, who still looks pretty good for an old lady!'''


top249 = '''Washizu is a brave samurai who helps his lord to fight off a violent rebellion. Washizu and his friend Miki are riding through Cobweb Forest when a spirit appears to them and makes predictions which fire their ambitions. When Washizu explains this vision to his wife Asaji, she urges him to murder his lord and rule in his stead. Thus the tragedy begins.

Kurosawa's interpretation of Macbeth is visually fascinating. Swirling mist, colossal trees dripping with rain, rich black volcanic soil and bulky fortress architecture provide the imposing, dread-laden backdrop against which the humans move in superbly stylized patterns. The director chose to shoot the action on Mount Fuji precisely because of the volcanic soil - and even had truckloads brought to the studio for pickup shots.

Westerners unfamiliar with Noh are missing a huge part of the film's meaning. This thousand-year-old theatrical tradition corresponds broadly to our Elizabethan Tragedy, and Kurosawa shows how the two cultural strains, eastern and western, interlock and interact. The one illumines the other.

The Noh stage must have on it three pine branches and a symbolic Shinto temple-arch. In the film, shots are carefully composed to include tangles of branches in the foreground, and the vast entrance gate of Washizu's fortress serves for the temple arch. And yet Kurosawa is not including these details redundantly, for mere form's sake - the ubiquitous branches, framing the human action, remind us all the time of the forest nemesis awaiting Washizu. The arch is Washizu's interface with the world - open in the early stages, but gradually less so as the protagonist retreats into his own diseased inner self.

A Noh play features a "doer" (Shite) and a "companion" (Waku) who plays a subordinate role. Washizu and Asaji are the Shite and Waku respectively. Elements in the Noh include a battle-drama (we get one here) and a so-called "wig drama", in which a female character dominates the action. This is the central portion of the film, in the quiet of the fortress quarters, when Asaji ruthlessly manipulates her husband's ambition. Every Noh play has a ghost which appears to the Shite, and the spirit in the forest fulfils that function. Noh plays are never original works, in that (by a venerable convention) they are re-workings of ancient legends. Kurosawa follows tradition by quarrying his tale from Shakespeare's play.

There is no western term to describe the stylized striking of poses so important in Noh. Our word "dance" is a crude word which approximates to, but does not convey, the grace of the Japanese art-form. Asaji, alone with the blood-stain, gives us a glimpse of this delightful ritual.

Finally, Noh contains an aural richness almost totally absent from western tragedy - the complex rhythms of stamping and percussion which accompany the spoken word. In the film, the rhythmic patterns of horses' hooves on soil, and Washizu's bare feet on the boards of the banquet hall, are meant to reinforce the mood as they creep into our emotions by subliminal insistence.

Isuzu Yamada is terrific as Asaji. Her stillness absolutely oozes determination, contrasting strongly with her husband's hollow bluster.

It seems that Kurosawa cherished the concept of a Noh Macbeth for some years before committing it to celluloid. Apparently the project had to be scrapped in 1952 because Welles' Macbeth was nearing completion, and Kurosawa did not want the two films to suffer by being endlessly compared. This version, then, had to wait until 1957 to be realised.

The director is not afraid to add his own flourishes to the well-known story. We hear of the notorious traitor Fujimaki who disembowelled himself in a room of the fortress. The exact spot is now known as the Forbidden Room, a place of evil omen with its indelible bloodstain on the floor. It is a symbol which encapsulates the spirit of the film, interweaving the related themes of treachery, blood and guilt. In a brilliant transition, we are taken to a change of scene by the ripping down of a banner by galloping horsemen. Washizu at the pinnacle of his arrogance is filmed from below with severe foreshortening, conveying his vainglory more effectively than words ever could. The death scene, with its railing, hysterical protagonist and relentless volleys of arrows (their grouped shafts recalling the fateful forest) has enormous power and lives long in the viewer's memory.'''


top250 = '''Set in Cappadochia, central Anatolia, WINTER SLEEP (KIS UYKUSU) focuses on the life of Aydın (Haluk Bilginer) a retired actor who now runs the Hotel Othello. The name is significant, as it reveals his true preoccupation with performance, a trait reinforced by the framed bills on his study wall. With plenty of family money at his disposal he has no need to work, but that does not stop him from screwing every penny out of his tenants with the help of his henchperson Hidayet (Ayberk Pekcan). Although perpetually drawing attention to his poor background and unhappy childhood, it's clear that Aydın's life revolves totally around himself; and that the only way he can salve his conscience is to make charitable donations, preferably anonymously.

With KIŞ UYKUSU we are back on thematic territory that director Nuri Bilge Ceylan previously explored in KASABA. He readily acknowledges Chekhov as an inspiration for creating a world where no one has much to do except talk to one another. Aydın busies himself with a variety of tasks, including writing a column for the local newspaper and writing a book on the history of the Turkish theater. His sister Necla (Demet Akbağ) spends much of her time lolling on the sofa and wondering whether she should forgive her ex-husband for an unhappy marriage. Aydın's wife Nihal (Melissa Sözen) is equally indolent; her sole aim in life seems to be to chair a committee of prosperous locals dedicated to raising money for the local school.

Stylistically speaking KIŞ UYKUSU is slightly different from Ceylan's earlier work; there are fewer reflective sequences designed to prompt reflection on the landscape and the elements, and more face-to-face confrontations between the protagonists. They emphasize the basic emptiness of their lives, as they have nothing to but talk and talk, in contrast to their tenants - for example the local imam Hamdi (Serhat Kılıç) who wonders about taking a second job so as to make ends meet. On the other hand these lengthy conversations draw attention to the protagonists' love of surfaces; unable (or unwilling) to engage with life's realities, they would rather talk at rather than with one another.

The unbelievable landscapes of Cappadochia in winter, with its fairy chimneys and unspoiled Anatolian terrain, offers a point of contrast to the characters' musings. While they spend their time both literally and mentally imprisoned within Aydın's hotel, the landscape offers a reminder of timeless virtues, as well as the fact that nature continues to flourish in spite of humanity's best attempts to destroy it.

The film comes to a climactic conclusion when Ceylan brings the indolent characters into contact with those forced to eke out an existence in harsh conditions. Nihal offers a financial gift to Hamdi's family; but fails to understand how such an act of apparent goodwill represents the ultimate insult. As Hamdi's brother İsmail (Nejat İsler) contends, it is nothing more than conscience money to atone for the fact that Aydın's family were responsible for causing İsmail's son Ilyas's (Emirhan Doruktutan's) pneumonia earlier on in the film. Meanwhile Aydın discovers to his cost that the local educator Levent (Nadir Sarıbacak) has a jaundiced view of all wealthy philanthropists.

Yet such experiences do not lead to any form of redemption. The film ends with Aydın and Nihal sitting morosely in their deserted hotel, looking out of the window at the snow-covered vista beyond, imprisoned by their lack of perception.

This film won the Palme d'Or at Cannes; it deserves every success. A modern classic.'''

In [42]:
# NLTK 서브모듈로 SentimentIntensityAnalyzer 임포트, IMDB 감상평 감성 분석
# neg는 부정, neu는 중립, pos는 긍정, compound는 조합한 감성지수
# compound score는 -1 ~1 사이의 감성지수를 표현, 0.1 이상이면 긍정 감성
# 그 이하는 부정 감성으로 판단하나 사오하엥 따라 임계값을 조정해 예측 성능 조절
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_anlayzer = SentimentIntensityAnalyzer()
senti_scores1 = senti_anlayzer.polarity_scores(top3)
senti_scores2 = senti_anlayzer.polarity_scores(top248)
senti_scores3 = senti_anlayzer.polarity_scores(top249)
senti_scores4 = senti_anlayzer.polarity_scores(top250)
print(senti_scores1)
print(senti_scores2)
print(senti_scores3)
print(senti_scores4)

{'neg': 0.026, 'neu': 0.668, 'pos': 0.305, 'compound': 0.9427}
{'neg': 0.025, 'neu': 0.781, 'pos': 0.195, 'compound': 0.9916}
{'neg': 0.104, 'neu': 0.798, 'pos': 0.099, 'compound': -0.7901}
{'neg': 0.092, 'neu': 0.786, 'pos': 0.121, 'compound': 0.961}


In [49]:
# 평가 사용자 함수
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print()
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

#get_clf_eval(y_target, vader_pred)

In [50]:
# VADER를 이용한 IMDB 감성 분석 수행
def vader_polarity(review, threshold=0.1):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)
    # compound 값에 기반해 threshold 입력값보다 크면 1, 아니면 0을 반환
    agg_score = scores['compound']
    final_sentiment = 1 if agg_score >= threshold else 0
    return final_sentiment

review_df.vader_preds = review_df.review.apply(lambda x:vader_polarity(x,0.1))
y_target = review_df.sentiment.values
vader_preds = review_df.vader_preds.values

print('VADER 예측 성능 평가 : ')

get_clf_eval(y_target, vader_preds)

VADER 예측 성능 평가 : 
오차 행렬
[[ 6736  5764]
 [ 1867 10633]]

정확도: 0.6948, 정밀도: 0.6485, 재현율: 0.8506, F1: 0.7359
