In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import BatchNormalization

es = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

EPOCHS = 1000

In [2]:
try:
    df = pd.read_csv('all_razem.csv', sep=",", on_bad_lines='skip')  # Skip bad lines
    print("CSV file read successfully, skipping problematic lines.")
except pd.errors.ParserError as e:
    print(f"Error reading CSV: {e}")
    # Find problematic lines (this might print multiple lines)
    with open('file.csv', 'r') as f:
        for i, line in enumerate(f):
            if len(line.split(',')) != 4:  # Assuming expected number of fields is 4
                print(f"Problematic line ({i+1}): {line}")

CSV file read successfully, skipping problematic lines.


In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df

Unnamed: 0,title,review,rating
0,Essentially Lanthimosian,"Over the last twenty years, Yorgos Lanthimos h...",8/10
1,Classic cynical lanthimos,This feels like the summer movie for lanthimos...,8/10
2,It's a refreshing new chapter to Yorgos' filmo...,"""Kinds of Kindness"" is Yorgos' latest cinemati...",8/10
3,Review,"Concerning the first story, I think that it pa...",10/10
4,Lanthimos & Filippou: A Unique Kind of Duo,"Yorgos Lanthimos returns with a vengeance, del...",7/10
...,...,...,...
5240,Brings back the kid in all of us.,"You ever have an imaginary friend? An ""IF""? Mo...",10/10
5241,"Fun, deep and impactful.",This movie touched the deepest part of my hear...,10/10
5242,A fun and emotional time,POSITIVES:1) The two lead performances are bot...,8/10
5243,Good movie but not for little kids,"I really enjoyed this movie. BUT, I think a lo...",9/10


In [5]:
df['review'][88]

'This was a solid WWII film that just wants to have fun, and I have a place in my heart for films like that. Its great cast distracts from the weak characterization in the script. Standouts are Henry Cavill (of course), Eiza Gonzales and Alan Ritchson. It\'s a very plot driven movie, but the plot is interesting (especially considering it is loosely based on real events). If the script made us care more about the people, this would have been great movie. Instead everyone comes off as "movie characters;" I\'m sure the actual people involved were far more interesting than that. In some ways, the movie highlights a fascinating mission from WWII while doing a disservice to the men involved.Those script issues aside, the movie\'s direction and tone are very fun. There\'s a great musical sequence I very much enjoyed and I did find myself invested in the mission as the third act neared. Don\'t expect to be on the edge of your seat as this is less about suspense and all about style. But it was 

In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from tqdm import tqdm
from nltk import sent_tokenize,word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
main = []

# Storing all punctuations using RE library like !;,"% etc
re_puncs = re.compile('[%s]' % re.escape(string.punctuation))
# Storing all stop words like a, an, the, when, there, this etc
stop_word  = set(stopwords.words('english'))
stop_word.add("im")
# print(stop_word)


In [9]:
sentences_listed = [str(line).split(" ") for line in df['review']]

import gensim

word_model = gensim.models.Word2Vec(sentences = sentences_listed, vector_size = 100)
words = list(word_model.wv.key_to_index)
print(len(words))

12568


In [10]:
!pip install gensim



In [11]:
#word_model.wv.most_similar("actress")

In [12]:
#word_model.wv.most_similar("death")

In [13]:
# Making Lemmatizing object
lem = WordNetLemmatizer()
# Using Porter Stemmer
p_stem = PorterStemmer()

# Traversing whole dataset
for i in tqdm(range(len(df['review']))):
    # Tokenization
    tokens = word_tokenize(str(df['review'][i]))
    # Converting all characters to lower case
    tokens = [w.lower() for w in tokens]
    # Remove all punctuations from sentenses
    tokens = [re_puncs.sub('', w) for w in tokens]
    # Checking all words is alphabets or not
    tokens = [i for i in tokens if i.isalpha()]
    # Removing all stop words from the sentenses
    tokens = [w for w in tokens if w not in stop_word]
    # Doing Lemmatizing of words
    tokens = [lem.lemmatize(w) for w in tokens]
    # Stemming process
    tokens = [p_stem.stem(w) for w in tokens]
    # Finally convert to string
    r = ' '.join(tokens)
    # Storing the final string into main list
    main.append(r)


100%|██████████| 5245/5245 [00:39<00:00, 133.21it/s]


In [14]:
df['translated'] = main

In [15]:
df

Unnamed: 0,title,review,rating,translated
0,Essentially Lanthimosian,"Over the last twenty years, Yorgos Lanthimos h...",8/10,last twenti year yorgo lanthimo cultiv approac...
1,Classic cynical lanthimos,This feels like the summer movie for lanthimos...,8/10,feel like summer movi lanthimo fan piec art le...
2,It's a refreshing new chapter to Yorgos' filmo...,"""Kinds of Kindness"" is Yorgos' latest cinemati...",8/10,kind kind yorgo latest cinemat achiev self awa...
3,Review,"Concerning the first story, I think that it pa...",10/10,concern first stori think paint pictur scare m...
4,Lanthimos & Filippou: A Unique Kind of Duo,"Yorgos Lanthimos returns with a vengeance, del...",7/10,yorgo lanthimo return vengeanc deliv film feel...
...,...,...,...,...
5240,Brings back the kid in all of us.,"You ever have an imaginary friend? An ""IF""? Mo...",10/10,ever imaginari friend u young kid one two mani...
5241,"Fun, deep and impactful.",This movie touched the deepest part of my hear...,10/10,movi touch deepest part heart remind like kid ...
5242,A fun and emotional time,POSITIVES:1) The two lead performances are bot...,8/10,two lead perform good bea cal realli engag ful...
5243,Good movie but not for little kids,"I really enjoyed this movie. BUT, I think a lo...",9/10,realli enjoy movi think lot parent small child...


In [16]:
#df['translated'][88]

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['translated'])

In [18]:
print(len(tokenizer.word_counts))

27122


In [19]:
counter = 0
for key, value in tokenizer.word_counts.items():
    if value > 2400:
        print(key, value)
        counter+=1

print(counter)

like 4904
one 4326
film 8747
charact 4539
stori 3860
make 3035
see 2635
good 3232
nt 6473
feel 2530
movi 12384
scene 2473
realli 2707
time 2924
emot 2710
watch 3139
action 2764
17


**grupa 1** fright(ened), shocked/shocking, horror/horrified, horrendous, overwhelming, terror/terrified, disturbing, bloody, disgust(ing), brutal, gory/gore, barbarous, repulsed/-ing, gross(ed out)

In [20]:
count1 = 0
num=tokenizer.word_counts['fright']
print(num)
count1+=num

5


In [21]:
num=tokenizer.word_counts['frighten']
print(num)
count1+=num

21


In [22]:
num=tokenizer.word_counts['shock']
print(num)
count1+=num

104


In [23]:
num=tokenizer.word_counts['horrifi']
print(num)
count1+=num

9


In [24]:
num=tokenizer.word_counts['horrif']
print(num)
count1+=num

20


In [25]:
num=tokenizer.word_counts['horrend']
print(num)
count1+=num

11


In [26]:
num=tokenizer.word_counts['overwhelm']
print(num)
count1+=num

66


In [27]:
num=tokenizer.word_counts['overwhelming']
print(num)
count1+=num

KeyError: 'overwhelming'

In [None]:
num=tokenizer.word_counts['disturb']
print(num)
count1+=num

In [None]:
num=tokenizer.word_counts['terror']
print(num)
count1+=num

In [28]:
num=tokenizer.word_counts['blood']
print(num)
count1+=num

93


In [29]:
num=tokenizer.word_counts['terrifi']
print(num)
count1+=num

49


In [30]:
num=tokenizer.word_counts['disgust']
print(num)
count1+=num

230


In [31]:
num=tokenizer.word_counts['brutal']
print(num)
count1+=num

89


In [32]:
num=tokenizer.word_counts['gore']
print(num)
count1+=num

30


In [33]:
num=tokenizer.word_counts['repuls']
print(num)
count1+=num

3


In [34]:
num=tokenizer.word_counts['gross']
print(num)
count1+=num

24


In [35]:
print('WYNIK DLA GRUPY PIERWSZEJ TOOOOO',count1)

WYNIK DLA GRUPY PIERWSZEJ TOOOOO 754


**grupa 2** afraid, alarmed, threatening, hair-rising, distress, helpless(ness), anxiety/anxious, worried, apprehension, uneasy, chilling, eerie, intimidating, unnerving, appalling

In [36]:
count2=0
num=tokenizer.word_counts['distress']
print(num)
count2+=num

8


In [37]:
num=tokenizer.word_counts['afraid']
print(num)
count2+=num

29


In [38]:
num=tokenizer.word_counts['alarm']
print(num)
count2+=num

16


In [39]:
num=tokenizer.word_counts['threaten']
print(num)
count2+=num

28


In [40]:
num=tokenizer.word_counts['anxious']
print(num)
count2+=num

1


In [41]:
num=tokenizer.word_counts['anxieti']
print(num)
count2+=num

503


In [42]:
num=tokenizer.word_counts['worri']
print(num)
count2+=num

95


In [43]:
num=tokenizer.word_counts['apprehens']
print(num)
count2+=num

3


In [44]:
num=tokenizer.word_counts['uneasi']
print(num)
count2+=num

1


In [45]:
num=tokenizer.word_counts['chilli']
print(num)
count2+=num

1


In [46]:
num=tokenizer.word_counts['eeri']
print(num)
count2+=num

31


In [47]:
num=tokenizer.word_counts['helpless']
print(num)
count2+=num

4


In [48]:
num=tokenizer.word_counts['unnerv']
print(num)
count2+=num

7


In [49]:
num=tokenizer.word_counts['appal']
print(num)
count2+=num

9


In [50]:
num=tokenizer.word_counts['fear']
print(num)
count2+=num

328


In [51]:
print('WYNIK DLA GRUPY DRUGIEJ TOOOOO',count2)

WYNIK DLA GRUPY DRUGIEJ TOOOOO 1064
