In [121]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import tensorflow as tf
import tensorflow_addons as tfa

import ast

In [122]:
keywords_df = pd.read_csv('keywords.csv', low_memory=False)
movies_df = pd.read_csv('movies_metadata.csv',skiprows=[19730, 19731, 29503, 29504, 35587, 35588], usecols=['id', 'vote_average'])

# Preprocessing data

In [123]:
keywords_dict = {}

for word_list in keywords_df['keywords']:
    for word in ast.literal_eval(word_list):
        keywords_dict[word['id']] = word['name']

In [124]:
train_df = keywords_df.merge(movies_df, on='id')

In [125]:
train_df.drop(train_df[train_df['vote_average'] == 0.0].index, inplace=True)

In [126]:
train_df = train_df.drop(train_df[train_df['keywords'] == '[]'].index, axis=0).reset_index(drop=True)
train_df['keywords'] = train_df['keywords'].apply(lambda word_list: [keywords_dict['id'] for keywords_dict in ast.literal_eval(word_list)])
train_df

Unnamed: 0,id,keywords,vote_average
0,862,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...",7.7
1,8844,"[10090, 10941, 15101, 33467, 158086, 158091]",6.9
2,15602,"[1495, 12392, 179431, 208510]",6.5
3,31357,"[818, 10131, 14768, 15160, 33455]",6.1
4,11862,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...",5.7
...,...,...,...
30299,84419,"[9748, 9826, 10714, 14512, 173245, 179083, 189...",6.3
30300,390959,[224180],7.0
30301,289923,"[616, 2035, 3754, 10714, 11800]",7.0
30302,439050,[10703],4.0


In [127]:
ratings_dict = {'1-3': 0, '3-5': 0, '5-7': 0, '7-9': 0, '9-10': 0}

for rating in train_df['vote_average']:
    if rating < 3:
        ratings_dict['1-3'] += 1
    elif rating >= 3 and rating < 5:
        ratings_dict['3-5'] += 1
    elif rating >= 5 and rating < 7:
        ratings_dict['5-7'] += 1
    elif rating >= 7 and rating < 9:
        ratings_dict['7-9'] += 1
    elif rating >= 9:
        ratings_dict['9-10'] += 1

ratings_dict

{'1-3': 412, '3-5': 4009, '5-7': 19056, '7-9': 6621, '9-10': 206}

# Splitting data

In [128]:
y = train_df.loc[:, 'vote_average']
X = train_df.loc[:, 'keywords']

In [129]:
keyword_counts = {}

for word_list in X:
    for word in word_list:
        if word in keyword_counts:
            keyword_counts[word] += 1
        else:
            keyword_counts[word] = 1

# sorteerime keyword_counts sõnastiku
keyword_counts = {key: value for key, value in sorted(keyword_counts.items(), key=lambda item: item[1], reverse=True)}

In [130]:
keyword_counts

{187056: 2894,
 10183: 1893,
 9826: 1279,
 818: 804,
 4344: 680,
 572: 680,
 14819: 650,
 2483: 632,
 9748: 614,
 5565: 606,
 9937: 588,
 9673: 559,
 293: 553,
 6075: 527,
 6149: 447,
 179431: 439,
 9663: 435,
 13130: 434,
 6054: 410,
 1956: 383,
 14964: 351,
 378: 343,
 6270: 315,
 779: 311,
 9716: 310,
 570: 305,
 236: 304,
 195402: 304,
 1930: 298,
 10714: 289,
 18035: 289,
 154802: 289,
 1299: 282,
 9951: 279,
 4565: 272,
 11221: 266,
 90: 265,
 242: 263,
 237: 257,
 214549: 256,
 6038: 251,
 207317: 251,
 10292: 245,
 34079: 243,
 12377: 241,
 212: 236,
 13142: 235,
 1415: 234,
 179430: 231,
 549: 230,
 703: 229,
 9840: 229,
 255: 226,
 3133: 224,
 12670: 221,
 13027: 221,
 642: 220,
 10685: 219,
 494: 218,
 15162: 215,
 65: 213,
 10508: 209,
 2343: 206,
 6091: 204,
 11612: 203,
 9714: 200,
 931: 194,
 6027: 194,
 9672: 194,
 162846: 193,
 13005: 192,
 8508: 190,
 2041: 185,
 14512: 181,
 233: 181,
 5600: 180,
 10594: 178,
 5340: 178,
 158718: 177,
 417: 176,
 470: 176,
 10180: 17

In [131]:
vocabulary = list(keyword_counts)[0:1000]

for word_list in X:
    word_list[:] = [word for word in word_list if word in vocabulary]

X

0                 [931, 5202, 6054, 9713, 9823]
1                                [10941, 15101]
2                               [12392, 179431]
3                    [818, 10131, 14768, 15160]
4        [1009, 1599, 4995, 5600, 10707, 13149]
                          ...                  
30299                [9748, 9826, 10714, 14512]
30300                                        []
30301           [616, 2035, 3754, 10714, 11800]
30302                                        []
30303                                    [2679]
Name: keywords, Length: 30304, dtype: object

In [132]:
null_indices = set()

for i, keywords in enumerate(X):
    if not keywords:
        null_indices.add(i)

X = X.drop(null_indices, axis=0).reset_index(drop=True)
y = y.drop(null_indices, axis=0).reset_index(drop=True)

X

0                 [931, 5202, 6054, 9713, 9823]
1                                [10941, 15101]
2                               [12392, 179431]
3                    [818, 10131, 14768, 15160]
4        [1009, 1599, 4995, 5600, 10707, 13149]
                          ...                  
27012                                    [3335]
27013              [5970, 6075, 154802, 214549]
27014                [9748, 9826, 10714, 14512]
27015           [616, 2035, 3754, 10714, 11800]
27016                                    [2679]
Name: keywords, Length: 27017, dtype: object

In [133]:
keyword_lists = []

for row in X:
    word_list = [keywords_dict[id] for id in row]
    keyword_lists.append(word_list)

In [134]:
keyword_column_names = []

for word_list in X:
    for word in word_list:
        if word not in keyword_column_names:
            keyword_column_names.append(word)

keyword_column_names = list(map(lambda x: keywords_dict[x], keyword_column_names))

In [135]:
mlb = MultiLabelBinarizer()
X = pd.DataFrame(mlb.fit_transform(X), columns=keyword_column_names)

X

Unnamed: 0,jealousy,boy,friendship,friends,rivalry,disappearance,based on children's book,best friend,duringcreditsstinger,based on novel,...,south korea,zombie apocalypse,infection,tv movie,miniseries,korea,korean movie,disney short,mumblegore,malayalam
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
27014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=10)

X.shape

(27017, 1000)

In [143]:
from keras.optimizers import Adam

# 1. Tune the hyperparameters such as the number of layers, activation functions, learning rate, etc.
# 2. Try different types of neural networks such as convolutional neural networks (CNNs).
# 3. Try adding additional features to the data set such as genre, budget, box office receipts, etc.
# 4. Try different optimizers such as Adam or RMSProp.
# 5. Increase the training data or use data augmentation techniques.
# 6. Regularize the model by adding dropout layers.
# 7. Try using ensemble methods such as bagging or boosting.

inputs = tf.keras.Input(shape=(1000,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(optimizer=Adam(learning_rate=0.0001), loss='mse')

# hetkel loss on vaga halb
history = model.fit(X_train, y_train, validation_split=0.1, batch_size=32, epochs=10, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Results

In [58]:
# epochi suurendamisega saab uurida loss over timei
fig = px.line(history.history, y=['loss', 'val_loss'], labels={'x': 'Epoch', 'y': 'Loss'}, title='Loss over time')
fig.show()

In [59]:
np.argmin(history.history['val_loss'])

7

In [60]:
# see vaartus voiks voimalikult vaike olla
model.evaluate(X_test, y_test)



1.255107045173645

In [61]:
y_preds = model.predict(X_test)
y_preds



array([[6.556886 ],
       [6.022319 ],
       [6.5281014],
       ...,
       [5.669161 ],
       [5.9236794],
       [5.6075497]], dtype=float32)

In [64]:
y_test = y_test.to_numpy()
y_preds = np.squeeze(y_preds)

y_test

array([7.1, 6. , 6.7, ..., 4.4, 7. , 6.7])

In [65]:
rsquare = tfa.metrics.RSquare()
rsquare.update_state(y_test, y_preds)

# halb skoor
print('R^2 score:', rsquare.result().numpy())

R^2 score: 0.08003199


# Post-training analysis

In [26]:
pca = PCA(n_components=2)
X

Unnamed: 0,jealousy,boy,friendship,friends,rivalry,disappearance,based on children's book,best friend,duringcreditsstinger,based on novel,...,south korea,zombie apocalypse,infection,tv movie,miniseries,korea,korean movie,disney short,mumblegore,malayalam
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
27014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
X_reduced = pd.DataFrame(pca.fit_transform(X), columns=['PC1', 'PC2'])
X_reduced['keywords'] = keyword_lists
X_reduced['keywords'] = X_reduced['keywords'].astype(str)
X_reduced

Unnamed: 0,PC1,PC2,keywords
0,-0.103422,-0.082025,"['jealousy', 'boy', 'friendship', 'friends', '..."
1,-0.087987,-0.105951,"['disappearance', ""based on children's book""]"
2,-0.090633,-0.117597,"['best friend', 'duringcreditsstinger']"
3,-0.110069,-0.062569,"['based on novel', 'interracial relationship',..."
4,-0.079166,-0.094853,"['baby', 'midlife crisis', 'aging', 'daughter'..."
...,...,...,...
27012,-0.088469,-0.100322,['halloween']
27013,-0.110042,-0.162536,"['wrestling', 'sport', 'silent film', 'short']"
27014,-0.240398,0.228874,"['revenge', 'murder', 'serial killer', 'new yo..."
27015,-0.112013,-0.074846,"['witch', 'mythology', 'legend', 'serial kille..."


In [28]:
fig = px.scatter(X_reduced, x='PC1', y='PC2', hover_data={'PC1': False, 'PC2': False, 'keywords': True})
fig.show()

# Predicting with LinearRegression

In [231]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score

In [225]:
train_df2 = train_df

for i, row in train_df2.iterrows():
    word_list = [keywords_dict[id] for id in row.keywords]
    word_list = ", ".join(word_list)
    train_df2.at[i, 'keywords'] = word_list

train_df2

Unnamed: 0,id,keywords,vote_average
0,862,"jealousy, boy, friendship, friends, rivalry",7.7
1,8844,"disappearance, based on children's book",6.9
2,15602,"best friend, duringcreditsstinger",6.5
3,31357,"based on novel, interracial relationship, sing...",6.1
4,11862,"baby, midlife crisis, aging, daughter, mother ...",5.7
...,...,...,...
30299,84419,"revenge, murder, serial killer, new york city",6.3
30300,390959,,7.0
30301,289923,"witch, mythology, legend, serial killer, mocku...",7.0
30302,439050,,4.0


In [226]:
tfidf = TfidfVectorizer(stop_words='english')

X_train, X_test, y_train, y_test = train_test_split(train_df2['keywords'], train_df2['vote_average'], train_size=0.7, random_state=10)

# Create the feature matrix by fitting and transforming the movie keywords
X = tfidf.fit_transform(X_train)

# Create the target vector
y = y_train

# Train the model
model = LinearRegression().fit(X, y)

# Test the model by predicting the rating for a given movie
movie_vector = tfidf.transform(X_test)
y_preds = model.predict(movie_vector)

y_preds

array([6.22623276, 6.13392262, 6.69246165, ..., 6.26953259, 5.61768608,
       5.65882277])

In [227]:
y_test

1415     6.4
18823    6.9
4491     5.2
23504    6.0
28493    6.5
        ... 
10307    5.4
29460    7.2
9029     7.0
2694     4.6
6504     6.1
Name: vote_average, Length: 9092, dtype: float64

In [228]:
y_test = y_test.to_numpy()
y_preds = np.squeeze(y_preds)

In [232]:
rsquare = tfa.metrics.RSquare()
rsquare.update_state(y_test, y_preds)

print('Explained variance score:', explained_variance_score(y_test, y_preds))

# halb skoor
print('R^2 score:', rsquare.result().numpy())

Explained variance score: 0.06948570787539898
R^2 score: 0.06941557
