In [167]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [143]:
df = pd.read_csv('imdb_top_1000.csv')
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [144]:
# select the relevant columns
df_select = df[['Overview', 'Genre']]
df_select.head()

Unnamed: 0,Overview,Genre
0,Two imprisoned men bond over a number of years...,Drama
1,An organized crime dynasty's aging patriarch t...,"Crime, Drama"
2,When the menace known as the Joker wreaks havo...,"Action, Crime, Drama"
3,The early life and career of Vito Corleone in ...,"Crime, Drama"
4,A jury holdout attempts to prevent a miscarria...,"Crime, Drama"


In [145]:
# select unique genres
unique_genres = []
for row in df_select['Genre']:
    row_genre = row.split(', ')
    unique_genres += [genre for genre in row_genre if genre not in unique_genres]
    
unique_genres.sort()
print(len(unique_genres))

21


In [163]:
# One hot encode the genres
def one_hot_encode(genres, genre):
    if genre in genres:
        return 1
    else:
        return 0
    
df_data = df_select.copy()
    
df_data['Genre'] = df_data['Genre'].str.split(', ')
df_data = df_data.dropna(axis=1)

for genre in unique_genres:
    df_data[genre] = df_data['Genre'].apply(lambda x: one_hot_encode(x, genre))

df_data.head()

Unnamed: 0,Overview,Genre,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Two imprisoned men bond over a number of years...,[Drama],0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,An organized crime dynasty's aging patriarch t...,"[Crime, Drama]",0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,When the menace known as the Joker wreaks havo...,"[Action, Crime, Drama]",1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,The early life and career of Vito Corleone in ...,"[Crime, Drama]",0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,A jury holdout attempts to prevent a miscarria...,"[Crime, Drama]",0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [164]:
# drop unnecessary 'Genre' column
df_data.drop(columns=['Genre'], inplace=True)
df_data.head()

Unnamed: 0,Overview,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Two imprisoned men bond over a number of years...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,An organized crime dynasty's aging patriarch t...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,When the menace known as the Joker wreaks havo...,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The early life and career of Vito Corleone in ...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A jury holdout attempts to prevent a miscarria...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
# create corpus and prepare text for TfidfVectorizer
corpus = []

for text in df_data['Overview']:
    appended_text = text.lower()
    appended_text = ''.join(e for e in appended_text if e.isalnum() or e.isspace())
    corpus.append(appended_text)
    
print(corpus)

['two imprisoned men bond over a number of years finding solace and eventual redemption through acts of common decency', 'an organized crime dynastys aging patriarch transfers control of his clandestine empire to his reluctant son', 'when the menace known as the joker wreaks havoc and chaos on the people of gotham batman must accept one of the greatest psychological and physical tests of his ability to fight injustice', 'the early life and career of vito corleone in 1920s new york city is portrayed while his son michael expands and tightens his grip on the family crime syndicate', 'a jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider the evidence', 'gandalf and aragorn lead the world of men against saurons army to draw his gaze from frodo and sam as they approach mount doom with the one ring', 'the lives of two mob hitmen a boxer a gangster and his wife and a pair of diner bandits intertwine in four tales of violence and redemption', 'in g

In [175]:
import tensorflow_text as text
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split

In [176]:
X = df_data['Overview']
y = df_data.iloc[:, 1:-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.75)

In [177]:
# BERT layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/3",
    trainable=True)
outputs = encoder(encoder_inputs)

# neural network layers
layers = tf.keras.layers.Dropout(0.1)(outputs['pooled_output'])
layers = tf.keras.layers.Dense(20, activation='sigmoid')(layers)

# model
model = tf.keras.Model(inputs=text_input, outputs=layers)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [178]:
model.compile(
    optimizer='adam',
    loss=CategoricalCrossentropy(),
    metrics=['accuracy']
)

In [179]:
model.fit(
    X_train, y_train,
    epochs=10,
)

Epoch 1/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 349ms/step - accuracy: 0.0557 - loss: 7.7195 - val_accuracy: 0.2680 - val_loss: 7.6830
Epoch 2/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.3549 - loss: 7.7166 - val_accuracy: 0.2880 - val_loss: 7.6471
Epoch 3/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.3078 - loss: 7.6688 - val_accuracy: 0.2840 - val_loss: 7.5969
Epoch 4/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.2945 - loss: 7.5728 - val_accuracy: 0.2840 - val_loss: 7.5297
Epoch 5/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.2953 - loss: 7.4765 - val_accuracy: 0.2840 - val_loss: 7.4402
Epoch 6/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.2921 - loss: 7.3598 - val_accuracy: 0.2840 - val_loss: 7.3238
Epoch 7/200
[1m2/2[0m [32m━━━━━━━━━━

In [129]:
# Evaluate the model on the validation set
loss, accuracy = model.evaluate(X_valid, y_valid)

print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2791 - loss: 7.2010 
Validation Loss: 7.062303066253662
Validation Accuracy: 0.2840000092983246
