In [71]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import tensorflow as tf
import tensorflow_addons as tfa

In [33]:
import ast

In [34]:

keywords_df = pd.read_csv('keywords.csv', low_memory=False)
movies_df = pd.read_csv('movies_metadata.csv',skiprows=[19730, 19731, 29503, 29504, 35587, 35588], usecols=['id', 'vote_average'])


keywords_dict = {}

for word_list in keywords_df['keywords']:
    for word in ast.literal_eval(word_list):
        keywords_dict[word['id']] = word['name']


In [35]:
train_df = keywords_df.merge(movies_df, on='id')

In [36]:
train_df = train_df.drop(train_df[train_df['keywords'] == '[]'].index, axis=0).reset_index(drop=True)
train_df['keywords'] = train_df['keywords'].apply(lambda word_list: [keywords_dict['id'] for keywords_dict in ast.literal_eval(word_list)])
train_df

Unnamed: 0,id,keywords,vote_average
0,862,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...",7.7
1,8844,"[10090, 10941, 15101, 33467, 158086, 158091]",6.9
2,15602,"[1495, 12392, 179431, 208510]",6.5
3,31357,"[818, 10131, 14768, 15160, 33455]",6.1
4,11862,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...",5.7
...,...,...,...
31651,84419,"[9748, 9826, 10714, 14512, 173245, 179083, 189...",6.3
31652,390959,[224180],7.0
31653,289923,"[616, 2035, 3754, 10714, 11800]",7.0
31654,439050,[10703],4.0


In [37]:
y = train_df.loc[:, 'vote_average']
X = train_df.loc[:, 'keywords']

In [38]:
keyword_counts = {}

for word_list in X:
    for word in word_list:
        if word in keyword_counts:
            keyword_counts[word] += 1
        else:
            keyword_counts[word] = 1

# sorteerime keyword_counts sõnastiku
keyword_counts = {key: value for key, value in sorted(keyword_counts.items(), key=lambda item: item[1], reverse=True)}

In [39]:
keyword_counts

{187056: 3119,
 10183: 1934,
 9826: 1310,
 818: 836,
 4344: 734,
 572: 685,
 14819: 651,
 2483: 636,
 5565: 633,
 9748: 626,
 9937: 590,
 9673: 574,
 293: 565,
 6075: 544,
 6149: 456,
 13130: 441,
 9663: 441,
 179431: 440,
 6054: 415,
 1956: 394,
 14964: 360,
 378: 351,
 9716: 351,
 6270: 319,
 779: 314,
 236: 312,
 195402: 307,
 570: 306,
 1930: 306,
 154802: 306,
 18035: 295,
 10714: 293,
 1299: 285,
 9951: 283,
 4565: 273,
 90: 270,
 242: 267,
 11221: 266,
 237: 263,
 214549: 261,
 6038: 260,
 207317: 257,
 10292: 246,
 34079: 245,
 12377: 245,
 13142: 243,
 1415: 239,
 212: 238,
 9840: 237,
 549: 234,
 703: 233,
 179430: 231,
 255: 231,
 642: 225,
 3133: 225,
 494: 224,
 13027: 224,
 12670: 222,
 10685: 220,
 15162: 216,
 10508: 214,
 65: 213,
 6091: 209,
 2343: 208,
 11612: 208,
 9714: 200,
 6027: 198,
 13005: 198,
 931: 197,
 9672: 196,
 162846: 196,
 8508: 193,
 2041: 191,
 14512: 186,
 470: 186,
 158718: 185,
 5600: 183,
 233: 183,
 5340: 182,
 10594: 180,
 10683: 180,
 9715: 1

In [40]:
vocabulary = list(keyword_counts)[0:1000]

In [41]:
for word_list in X:
    word_list[:] = [word for word in word_list if word in vocabulary]

In [42]:
X

0                 [931, 5202, 6054, 9713, 9823]
1                                [10941, 15101]
2                               [12392, 179431]
3                    [818, 10131, 14768, 15160]
4        [1009, 1599, 4995, 5600, 10707, 13149]
                          ...                  
31651                [9748, 9826, 10714, 14512]
31652                                        []
31653           [616, 2035, 3754, 10714, 11800]
31654                                        []
31655                                    [2679]
Name: keywords, Length: 31656, dtype: object

In [43]:
null_indices = set()

for i, keywords in enumerate(X):
    if not keywords:
        null_indices.add(i)

In [44]:
X = X.drop(null_indices, axis=0).reset_index(drop=True)
y = y.drop(null_indices, axis=0).reset_index(drop=True)

In [45]:
X

0                 [931, 5202, 6054, 9713, 9823]
1                                [10941, 15101]
2                               [12392, 179431]
3                    [818, 10131, 14768, 15160]
4        [1009, 1599, 4995, 5600, 10707, 13149]
                          ...                  
28150                                    [3335]
28151              [5970, 6075, 154802, 214549]
28152                [9748, 9826, 10714, 14512]
28153           [616, 2035, 3754, 10714, 11800]
28154                                    [2679]
Name: keywords, Length: 28155, dtype: object

In [46]:
keyword_column_names = []

for word_list in X:
    for word in word_list:
        if word not in keyword_column_names:
            keyword_column_names.append(word)

In [47]:
keyword_column_names = list(map(lambda x: keywords_dict[x], keyword_column_names))

In [48]:
mlb = MultiLabelBinarizer()
X = pd.DataFrame(mlb.fit_transform(X), columns=keyword_column_names)

In [49]:
X

Unnamed: 0,jealousy,boy,friendship,friends,rivalry,disappearance,based on children's book,best friend,duringcreditsstinger,based on novel,...,greece,south korea,infection,tv movie,miniseries,korea,korean movie,disney short,mumblegore,malayalam
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
28152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=10)

In [51]:
X.shape

(28155, 1000)

In [66]:
inputs = tf.keras.Input(shape=(1000,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(optimizer='adam', loss='mse')

# hetkel loss on vaga halb
history = model.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=10, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [68]:
# epochi suurendamisega saab uurida loss over timei
fig = px.line(history.history, y=['loss', 'val_loss'], labels={'x': 'Epoch', 'y': 'Loss'}, title='Loss over time')
fig.show()

In [69]:
np.argmin(history.history['val_loss'])

2

In [70]:
# see vaartus voiks voimalikult vaike olla
model.evaluate(X_test, y_test)



3.0943708419799805

In [55]:
y_preds = model.predict(X_test)
y_preds



array([[5.695443 ],
       [5.703732 ],
       [6.24104  ],
       ...,
       [7.391318 ],
       [6.0223646],
       [5.337206 ]], dtype=float32)

In [56]:
y_test = y_test.to_numpy()
y_preds = np.squeeze(y_preds)

In [59]:
y_test

array([2. , 6.3, 6.2, ..., 5.2, 5.3, 5.6])

In [58]:
y_preds

array([5.695443 , 5.703732 , 6.24104  , ..., 7.391318 , 6.0223646,
       5.337206 ], dtype=float32)

In [62]:
rsquare = tfa.metrics.RSquare()

rsquare.update_state(y_test, y_preds)

In [63]:
# halb skoor
print('R^2 score:', rsquare.result().numpy())

R^2 score: -0.009829879


In [72]:
pca = PCA(n_components=2)
X

Unnamed: 0,jealousy,boy,friendship,friends,rivalry,disappearance,based on children's book,best friend,duringcreditsstinger,based on novel,...,greece,south korea,infection,tv movie,miniseries,korea,korean movie,disney short,mumblegore,malayalam
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
28152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
X_reduced = pd.DataFrame(pca.fit_transform(X), columns=['PC1', 'PC2'])
X_reduced

Unnamed: 0,PC1,PC2
0,-0.108067,-0.079929
1,-0.092033,-0.103387
2,-0.094887,-0.113698
3,-0.115657,-0.062532
4,-0.084126,-0.091150
...,...,...
28150,-0.092447,-0.097905
28151,-0.114523,-0.159698
28152,-0.236230,0.224359
28153,-0.115000,-0.073084


In [77]:
X

Unnamed: 0,jealousy,boy,friendship,friends,rivalry,disappearance,based on children's book,best friend,duringcreditsstinger,based on novel,...,greece,south korea,infection,tv movie,miniseries,korea,korean movie,disney short,mumblegore,malayalam
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
28152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
fig = px.scatter(X_reduced, x='PC1', y='PC2')
fig.show()