In [61]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import BatchNormalization

es = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

EPOCHS = 1000

In [62]:
df = pd.read_csv('asian_merged.csv', sep=",")

In [64]:
df = df.drop('Unnamed: 0', axis=1)

In [65]:
df

Unnamed: 0,title,review,rating
0,Don't dare blink as you may miss something.,"Wong Kar Mun went blind at the age of two, 18 ...",9/10
1,Excellent horror film with a weak ending,Mun (Angelica Lee) is a young woman who has be...,7/10
2,A Corneal Transplant from the Twilight Zone.,This is not the first time that a movie where ...,8/10
3,"One of the year's best horror movies, though t...",Even the website of this movie gave me the cre...,8/10
4,Good and scary!!!,"Of all the horror movie genres in existence, g...",10/10
...,...,...,...
1480,A rewarding experience,A Tale Of Two Sisters is a heart-wrenching sto...,10/10
1481,"Creepy, Cringe, Disturbing,Twisted",Endless expectation for something to happen. C...,6/10
1482,It will make you scream like a teenage girl.,I saw this movie while I was in London a few m...,9/10
1483,Not as good as I hoped,I watched this film as part of the EXTREME ASI...,10/10


In [66]:
df['review'][88]

'Let\'s put aside the fact that this film has some creepy moments but zero actual scares. Sorry, I jumped all throughout The Grudge and I knew while I was jumping that it was a terrible film. This didn\'t scare me. Noroi? Crapped my pants. This? I was laughing two thirds of the way through. Why?Because I just might nominate this film for most contrived ending of all time. Even disregarding the sheer amount of plot devices this film borrowed (Angel Heart and Jacob\'s Ladder are all over this film), the last third was absolutely dumbfounding. My jaw was on the floor from the sheer melodrama of it all. I saw this film on lists with Ju-on, Ringu, and a few other notable J-scarers, but boy, if this scared you, Ringu will literally kill you. For the record, Ringu didn\'t scare me either. Not like the American remake did. But that\'s a whole other can of worms.The Eye suffers from a serious dollop of schmaltz. Romantic subplot? Check. Ridiculously cheery classical piano music? Check. A "thing

In [67]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from tqdm import tqdm
from nltk import sent_tokenize,word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [68]:
main = []

# Storing all punctuations using RE library like !;,"% etc
re_puncs = re.compile('[%s]' % re.escape(string.punctuation))
# Storing all stop words like a, an, the, when, there, this etc
stop_word  = set(stopwords.words('english'))
stop_word.add("im")
# print(stop_word)


In [69]:
sentences_listed = [line.split(" ") for line in df['review']]

import gensim

word_model = gensim.models.Word2Vec(sentences = sentences_listed, vector_size = 100)
words = list(word_model.wv.key_to_index)
print(len(words))

5342


In [70]:
!pip install gensim



In [71]:
word_model.wv.most_similar("actress")

[('Angelica', 0.9961642026901245),
 ('moving', 0.9956340193748474),
 ('telling', 0.9948765635490417),
 ('role', 0.9940662980079651),
 ('sets', 0.9936408400535583),
 ("She's", 0.9934767484664917),
 ('.', 0.9933796525001526),
 ('working', 0.9930374026298523),
 ('conveying', 0.9928699135780334),
 ('legend', 0.9926580786705017)]

In [72]:
word_model.wv.most_similar("death")

[('donor', 0.994712233543396),
 ('investigate', 0.9944313764572144),
 ('visions', 0.993080198764801),
 ('victims', 0.9923332333564758),
 ('calls', 0.9919072389602661),
 ('hospital', 0.9918200969696045),
 ('home,', 0.9914587736129761),
 ('killed', 0.9912410378456116),
 ('solve', 0.9911643862724304),
 ('died', 0.9908338189125061)]

In [73]:
word_model.wv.most_similar("scary")

[('quite', 0.9615159034729004),
 ('good', 0.9479793906211853),
 ('scary.', 0.9355578422546387),
 ('pretty', 0.9294544458389282),
 ('done', 0.9261035323143005),
 ('but', 0.9260631203651428),
 ("it's", 0.9231249094009399),
 ('remotely', 0.9192637801170349),
 ('perfect,', 0.9183173179626465),
 ('bad', 0.9171686172485352)]

In [74]:
# Making Lemmatizing object
lem = WordNetLemmatizer()
# Using Porter Stemmer
p_stem = PorterStemmer()

# Traversing whole dataset
for i in tqdm(range(len(df['review']))):
    # Tokenization
    tokens = word_tokenize(str(df['review'][i]))
    # Converting all characters to lower case
    tokens = [w.lower() for w in tokens]
    # Remove all punctuations from sentenses
    tokens = [re_puncs.sub('', w) for w in tokens]
    # Checking all words is alphabets or not
    tokens = [i for i in tokens if i.isalpha()]
    # Removing all stop words from the sentenses
    tokens = [w for w in tokens if w not in stop_word]
    # Doing Lemmatizing of words
    tokens = [lem.lemmatize(w) for w in tokens]
    # Stemming process
    tokens = [p_stem.stem(w) for w in tokens]
    # Finally convert to string
    r = ' '.join(tokens)
    # Storing the final string into main list
    main.append(r)


100%|██████████| 1485/1485 [00:15<00:00, 93.10it/s]


In [75]:
df['translated'] = main

In [76]:
df

Unnamed: 0,title,review,rating,translated
0,Don't dare blink as you may miss something.,"Wong Kar Mun went blind at the age of two, 18 ...",9/10,wong kar mun went blind age two year later und...
1,Excellent horror film with a weak ending,Mun (Angelica Lee) is a young woman who has be...,7/10,mun angelica lee young woman blind sinc two ye...
2,A Corneal Transplant from the Twilight Zone.,This is not the first time that a movie where ...,8/10,first time movi main charact get corneal trans...
3,"One of the year's best horror movies, though t...",Even the website of this movie gave me the cre...,8/10,even websit movi gave creep turn one scariest ...
4,Good and scary!!!,"Of all the horror movie genres in existence, g...",10/10,horror movi genr exist ghost stori alway perso...
...,...,...,...,...
1480,A rewarding experience,A Tale Of Two Sisters is a heart-wrenching sto...,10/10,tale two sister heartwrench stori psycholog ho...
1481,"Creepy, Cringe, Disturbing,Twisted",Endless expectation for something to happen. C...,6/10,endless expect someth happen cring random stuf...
1482,It will make you scream like a teenage girl.,I saw this movie while I was in London a few m...,9/10,saw movi london month back nt know anyth movi ...
1483,Not as good as I hoped,I watched this film as part of the EXTREME ASI...,10/10,watch film part extrem asia season local cinem...


In [77]:
#df['translated'][88]

In [78]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['translated'])

In [79]:
print(len(tokenizer.word_counts))

12200


In [80]:
counter = 0
for key, value in tokenizer.word_counts.items():
    if value > 2400:
        print(key, value)
        counter+=1

print(counter)

film 3687
nt 2443
movi 3731
3


**grupa 1** scary/scared, afraid, fright(ened), alarmed, shocked, horror/horrified, threatening, hair-rising, horrendous

In [81]:
count1 = 0
num=tokenizer.word_counts['scari']
print(num)
count1+=num

692


In [82]:
num=tokenizer.word_counts['scare']
print(num)
count1+=num

759


In [83]:
num=tokenizer.word_counts['afraid']
print(num)
count1+=num

44


In [84]:
num=tokenizer.word_counts['fright']
print(num)
count1+=num

37


In [85]:
num=tokenizer.word_counts['frighten']
print(num)
count1+=num

210


In [86]:
num=tokenizer.word_counts['alarm']
print(num)
count1+=num

3


In [87]:
num=tokenizer.word_counts['shock']
print(num)
count1+=num

175


In [88]:
num=tokenizer.word_counts['horrifi']
print(num)
count1+=num

46


In [89]:
num=tokenizer.word_counts['horrif']
print(num)
count1+=num

37


In [90]:
num=tokenizer.word_counts['threaten']
print(num)
count1+=num

21


In [91]:
num=tokenizer.word_counts['horrend']
print(num)
count1+=num

3


In [92]:
num=tokenizer.word_counts['upset']
print(num)
count1+=num

8


In [93]:
print('WYNIK DLA GRUPY PIERWSZEJ TOOOOO',count1)

WYNIK DLA GRUPY PIERWSZEJ TOOOOO 2035


**grupa 2** fear, petrified, terror/terrified, distress, helpless(ness), anxiety/anxious, worried, apprehension, creepy, uneasy, disturbing, chilling, eerie, spooky(?), intimidating, overwhelming, unnerving

In [94]:
count2 = 0
num=tokenizer.word_counts['fear']
print(num)
count2+=num

188


In [95]:
num=tokenizer.word_counts['petrifi']
print(num)
count2+=num

1


In [96]:
num=tokenizer.word_counts['terror']
print(num)
count2+=num

114


In [97]:
num=tokenizer.word_counts['terrifi']
print(num)
count2+=num

191


In [98]:
num=tokenizer.word_counts['distress']
print(num)
count2+=num

5


In [99]:
num=tokenizer.word_counts['helpless']
print(num)
count2+=num

10


In [100]:
num=tokenizer.word_counts['worri']
print(num)
count2+=num

18


In [101]:
num=tokenizer.word_counts['creepi']
print(num)
count2+=num

511


In [102]:
num=tokenizer.word_counts['uneasi']
print(num)
count2+=num

30


In [103]:
num=tokenizer.word_counts['disturb']
print(num)
count2+=num

127


In [104]:
num=tokenizer.word_counts['chilli']
print(num)
count2+=num

1


In [105]:
num=tokenizer.word_counts['eeri']
print(num)
count2+=num

117


In [106]:
num=tokenizer.word_counts['overwhelm']
print(num)
count2+=num

15


In [107]:
num=tokenizer.word_counts['overwhelming']
print(num)
count2+=num

KeyError: 'overwhelming'

In [108]:
print('WYNIK DLA GRUPY DRUGIEJ TOOOOO',count2)

WYNIK DLA GRUPY DRUGIEJ TOOOOO 1328


**GRA W SKOJARZENIA**

In [109]:
word_model.wv.most_similar("death")

[('donor', 0.994712233543396),
 ('investigate', 0.9944313764572144),
 ('visions', 0.993080198764801),
 ('victims', 0.9923332333564758),
 ('calls', 0.9919072389602661),
 ('hospital', 0.9918200969696045),
 ('home,', 0.9914587736129761),
 ('killed', 0.9912410378456116),
 ('solve', 0.9911643862724304),
 ('died', 0.9908338189125061)]

In [114]:
word_model.wv.most_similar("ghost")

[('tale', 0.9633168578147888),
 ('eerie', 0.955857515335083),
 ('excellent', 0.955665647983551),
 ('stylish', 0.9545384049415588),
 ('cinematography', 0.9521903395652771),
 ('sound', 0.9504590034484863),
 ('vein', 0.9485344290733337),
 ('photography,', 0.9485334753990173),
 ('handles', 0.9474178552627563),
 ('intricate', 0.9473121762275696)]

In [116]:
word_model.wv.most_similar("horror")

[('Asian', 0.9131890535354614),
 ('Japanese', 0.9128168225288391),
 ("I've", 0.9082717299461365),
 ('best', 0.8879138231277466),
 ('most', 0.8818326592445374),
 ('scariest', 0.8800951838493347),
 ('fan', 0.8768458962440491),
 ('avid', 0.8659511208534241),
 ('creepiest', 0.859481155872345),
 ('horrors,', 0.8488922715187073)]

In [117]:
word_model.wv.most_similar("kill")

[('die', 0.9929628968238831),
 ('someone', 0.9914686679840088),
 ('turn', 0.9897354245185852),
 ('happen', 0.9892741441726685),
 ('help', 0.9884474277496338),
 ('pick', 0.9883645176887512),
 ('save', 0.9869269728660583),
 ('appear', 0.9865267872810364),
 ('discover', 0.9863044619560242),
 ('start', 0.985583484172821)]

In [123]:
word_model.wv.most_similar("place")

[('staring', 0.993803083896637),
 ('evil', 0.9936553835868835),
 ('ceiling', 0.9934309720993042),
 ('places', 0.9933696389198303),
 ('struggling', 0.9933278560638428),
 ('body', 0.9932346940040588),
 ('bed', 0.993134081363678),
 ('taking', 0.9927263855934143),
 ('running', 0.992688775062561),
 ("girl's", 0.992581844329834)]

In [124]:
word_model.wv.most_similar("body")

[('death,', 0.9967809319496155),
 ('Yumi', 0.9962895512580872),
 ('run', 0.996161699295044),
 ('bedroom', 0.9960645437240601),
 ('child,', 0.9956765174865723),
 ('driving', 0.9956573843955994),
 ('inside', 0.9955999851226807),
 ('standing', 0.9955510497093201),
 ('mother.', 0.9955487251281738),
 ('red', 0.9954115748405457)]

In [130]:
word_model.wv.most_similar("ending")

[('bad.', 0.9807007312774658),
 ('masterpiece.', 0.9739773869514465),
 ('film!', 0.9729151129722595),
 ('case.', 0.9685205817222595),
 ('movie?', 0.9649356603622437),
 ('Grudge', 0.9641385674476624),
 ('Ring.', 0.9637940526008606),
 ('entertaining', 0.962119460105896),
 ('slow.', 0.9618367552757263),
 ('uninteresting', 0.9614896178245544)]

In [131]:
word_model.wv.most_similar("gore")

[('blood', 0.992591381072998),
 ('cheap', 0.9914207458496094),
 ('often', 0.9904683232307434),
 ('certain', 0.9903222322463989),
 ('bloody', 0.990240216255188),
 ('scares,', 0.9893238544464111),
 ('quiet', 0.9885172843933105),
 ('depressing', 0.9883801341056824),
 ('focused', 0.9873073101043701),
 ('cheap,', 0.9871789216995239)]