In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import torch
import torch.nn as nn
import math
from datetime import datetime

In [2]:
# Folder prefix (relative path, trailing slash included) that is prepended to
# every CSV file name read or written by this notebook.
dir_1 = "datasets/"

In [3]:
# Load the two raw inputs: the movie metadata table and the IMDb movies table.
meta_csv_path = dir_1 + 'movie_meta_data.csv'
imdb_csv_path = dir_1 + 'imdb-movies.csv'

df_1a = pd.read_csv(meta_csv_path)
df_2a = pd.read_csv(imdb_csv_path)

In [5]:
# Report (rows, columns) of each raw table, one per line.
for raw_frame in (df_1a, df_2a):
    print(raw_frame.shape)

(2858, 25)
(10866, 21)


In [6]:
# Inner-join the two tables on the movie title (`title` on the left,
# `original_title` on the right); overlapping column names get _x/_y suffixes.
# NOTE(review): titles are not guaranteed unique (remakes, re-releases), so a
# title-only join can pair unrelated films — confirm whether `imdbid`/`imdb_id`
# could be used as the key instead.
df_3a = df_1a.merge(df_2a, left_on='title', right_on='original_title')
df_3a.shape

(1968, 46)

In [7]:
# List every column of the merged frame so the feature columns can be picked by name.
df_3a.columns

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget_x', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast_x', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords_x', 'genres_x', 'taglines',
       'synopsis', 'id', 'imdb_id', 'popularity', 'budget_y', 'revenue',
       'original_title', 'cast_y', 'homepage', 'director', 'tagline',
       'keywords_y', 'overview', 'runtime', 'genres_y', 'production_companies',
       'release_date', 'vote_count', 'vote_average', 'release_year',
       'budget_adj', 'revenue_adj'],
      dtype='object')

In [8]:
# Columns kept from the merged frame, in the order they are selected.
list_col = [
    # numeric input
    'budget_y',
    # comma-separated people lists
    'writers',
    'directors',
    'producers',
    'cast_x',
    # comma-separated genre list
    'genres_x',
    # review scores
    'metascore',
    'imdb user rating',
    # release date string (converted to a week number later)
    'release_date',
    # last column: the value exported as `revenue`
    'revenue',
]

In [9]:
# Keep all rows, but only the columns named in list_col (in that order).
df_3b = df_3a.loc[:, list_col]

In [10]:
# Successive row filters; each step builds a new frame from the previous one.
# Rows whose value is NaN compare False in every mask and are therefore kept.
# NOTE(review): in the recorded run the two `< 0` filters removed nothing, and
# rows with budget/revenue equal to 0 remain — confirm whether 0 is a
# missing-value placeholder that should be filtered as well.

# 1) negative budget
negative_budget = df_3b['budget_y'] < 0
df_3c = df_3b.drop(index=df_3b.loc[negative_budget].index)

# 2) negative revenue
negative_revenue = df_3c['revenue'] < 0
df_3d = df_3c.drop(index=df_3c.loc[negative_revenue].index)

# 3) metascore outside [0, 100]
bad_metascore = (df_3d['metascore'] < 0) | (df_3d['metascore'] > 100)
df_3e = df_3d.drop(index=df_3d.loc[bad_metascore].index)

# 4) IMDb user rating outside [1, 10]
bad_rating = (df_3e['imdb user rating'] < 1) | (df_3e['imdb user rating'] > 10)
df_3f = df_3e.drop(index=df_3e.loc[bad_rating].index)

In [11]:
# Show the (rows, columns) shape after each filtering step to see how many rows each one removed.
df_3b.shape, df_3c.shape, df_3d.shape, df_3e.shape, df_3f.shape

((1968, 10), (1968, 10), (1968, 10), (1764, 10), (1764, 10))

In [12]:
# Encode the comma-separated `writers` string of each movie as one fixed-length
# vector: split into names -> integer ids -> rows of a randomly initialised
# (untrained) embedding table -> mean over the movie's names.
#
# NOTE: this cell overwrites df_3f['writers'] in place, so it cannot be re-run
# on its own; rebuild df_3f (filtering cell) before running it again.
# NOTE(review): str(raw) turns a missing value (NaN) into the literal name
# 'nan' — confirm that treating it as a regular name is intended.
df_3f['writers'] = df_3f['writers'].apply(
    lambda raw: [name.strip() for name in str(raw).split(',')]
)
col_writers = list(df_3f['writers'])

# Ids are handed out in order of first appearance: 0, 1, 2, ...
d_writer_vs_index = {}
for names in col_writers:
    for name in names:
        d_writer_vs_index.setdefault(name, len(d_writer_vs_index))

# The table has one more row than there are ids; the last row is never looked up.
w_vocab_size = len(d_writer_vs_index) + 1
w_embedding_dim = math.floor(math.log2(w_vocab_size) + 1)

# Seed torch right before creating the layer so the random table — and with it
# the exported features — is identical after every kernel restart. Each encoded
# column uses its own seed so the tables are not cut from the same random stream.
torch.manual_seed(0)
w_embedding_layer = nn.Embedding(w_vocab_size, w_embedding_dim)

# Read the weight table once and index it with NumPy; this yields the same rows
# as calling the layer on every id separately, without the per-call overhead.
w_embedding_table = w_embedding_layer.weight.detach().numpy()
df_3f['writers'] = df_3f['writers'].apply(
    lambda names: w_embedding_table[[d_writer_vs_index[name] for name in names]].mean(axis=0)
)

In [13]:
# Encode the comma-separated `directors` string of each movie as one
# fixed-length vector: split into names -> integer ids -> rows of a randomly
# initialised (untrained) embedding table -> mean over the movie's names.
#
# NOTE: this cell overwrites df_3f['directors'] in place, so it cannot be
# re-run on its own; rebuild df_3f (filtering cell) before running it again.
# NOTE(review): str(raw) turns a missing value (NaN) into the literal name
# 'nan' — confirm that treating it as a regular name is intended.
df_3f['directors'] = df_3f['directors'].apply(
    lambda raw: [name.strip() for name in str(raw).split(',')]
)
col_directors = list(df_3f['directors'])

# Ids are handed out in order of first appearance: 0, 1, 2, ...
d_director_vs_index = {}
for names in col_directors:
    for name in names:
        d_director_vs_index.setdefault(name, len(d_director_vs_index))

# The table has one more row than there are ids; the last row is never looked up.
d_vocab_size = len(d_director_vs_index) + 1
d_embedding_dim = math.floor(math.log2(d_vocab_size) + 1)

# Seed torch right before creating the layer so the random table — and with it
# the exported features — is identical after every kernel restart. Each encoded
# column uses its own seed so the tables are not cut from the same random stream.
torch.manual_seed(1)
d_embedding_layer = nn.Embedding(d_vocab_size, d_embedding_dim)

# Read the weight table once and index it with NumPy; this yields the same rows
# as calling the layer on every id separately, without the per-call overhead.
d_embedding_table = d_embedding_layer.weight.detach().numpy()
df_3f['directors'] = df_3f['directors'].apply(
    lambda names: d_embedding_table[[d_director_vs_index[name] for name in names]].mean(axis=0)
)

In [14]:
# Encode the comma-separated `producers` string of each movie as one
# fixed-length vector: split into names -> integer ids -> rows of a randomly
# initialised (untrained) embedding table -> mean over the movie's names.
#
# NOTE: this cell overwrites df_3f['producers'] in place, so it cannot be
# re-run on its own; rebuild df_3f (filtering cell) before running it again.
# NOTE(review): str(raw) turns a missing value (NaN) into the literal name
# 'nan' — confirm that treating it as a regular name is intended.
df_3f['producers'] = df_3f['producers'].apply(
    lambda raw: [name.strip() for name in str(raw).split(',')]
)
col_p = list(df_3f['producers'])

# Ids are handed out in order of first appearance: 0, 1, 2, ...
d_p_vs_index = {}
for names in col_p:
    for name in names:
        d_p_vs_index.setdefault(name, len(d_p_vs_index))

# The table has one more row than there are ids; the last row is never looked up.
p_vocab_size = len(d_p_vs_index) + 1
p_embedding_dim = math.floor(math.log2(p_vocab_size) + 1)

# Seed torch right before creating the layer so the random table — and with it
# the exported features — is identical after every kernel restart. Each encoded
# column uses its own seed so the tables are not cut from the same random stream.
torch.manual_seed(2)
p_embedding_layer = nn.Embedding(p_vocab_size, p_embedding_dim)

# Read the weight table once and index it with NumPy; this yields the same rows
# as calling the layer on every id separately, without the per-call overhead.
p_embedding_table = p_embedding_layer.weight.detach().numpy()
df_3f['producers'] = df_3f['producers'].apply(
    lambda names: p_embedding_table[[d_p_vs_index[name] for name in names]].mean(axis=0)
)

In [15]:
# Encode the comma-separated `cast_x` string of each movie as one fixed-length
# vector: split into names -> integer ids -> rows of a randomly initialised
# (untrained) embedding table -> mean over the movie's names.
#
# NOTE: this cell overwrites df_3f['cast_x'] in place, so it cannot be re-run
# on its own; rebuild df_3f (filtering cell) before running it again.
# NOTE(review): str(raw) turns a missing value (NaN) into the literal name
# 'nan' — confirm that treating it as a regular name is intended.
df_3f['cast_x'] = df_3f['cast_x'].apply(
    lambda raw: [name.strip() for name in str(raw).split(',')]
)
col_c = list(df_3f['cast_x'])

# Ids are handed out in order of first appearance: 0, 1, 2, ...
d_c_vs_index = {}
for names in col_c:
    for name in names:
        d_c_vs_index.setdefault(name, len(d_c_vs_index))

# The table has one more row than there are ids; the last row is never looked up.
c_vocab_size = len(d_c_vs_index) + 1
c_embedding_dim = math.floor(math.log2(c_vocab_size) + 1)

# Seed torch right before creating the layer so the random table — and with it
# the exported features — is identical after every kernel restart. Each encoded
# column uses its own seed so the tables are not cut from the same random stream.
torch.manual_seed(3)
c_embedding_layer = nn.Embedding(c_vocab_size, c_embedding_dim)

# Read the weight table once and index it with NumPy; this yields the same rows
# as calling the layer on every id separately, without the per-call overhead.
c_embedding_table = c_embedding_layer.weight.detach().numpy()
df_3f['cast_x'] = df_3f['cast_x'].apply(
    lambda names: c_embedding_table[[d_c_vs_index[name] for name in names]].mean(axis=0)
)

In [16]:
# Encode the comma-separated `genres_x` string of each movie as one
# fixed-length vector: split into genre names -> integer ids -> rows of a
# randomly initialised (untrained) embedding table -> mean over the movie's genres.
#
# NOTE: this cell overwrites df_3f['genres_x'] in place, so it cannot be re-run
# on its own; rebuild df_3f (filtering cell) before running it again.
# NOTE(review): str(raw) turns a missing value (NaN) into the literal genre
# 'nan' — confirm that treating it as a regular genre is intended.
df_3f['genres_x'] = df_3f['genres_x'].apply(
    lambda raw: [name.strip() for name in str(raw).split(',')]
)
col_g = list(df_3f['genres_x'])

# Ids are handed out in order of first appearance: 0, 1, 2, ...
d_g_vs_index = {}
for names in col_g:
    for name in names:
        d_g_vs_index.setdefault(name, len(d_g_vs_index))

# The table has one more row than there are ids; the last row is never looked up.
g_vocab_size = len(d_g_vs_index) + 1
g_embedding_dim = math.floor(math.log2(g_vocab_size) + 1)

# Seed torch right before creating the layer so the random table — and with it
# the exported features — is identical after every kernel restart. Each encoded
# column uses its own seed so the tables are not cut from the same random stream.
torch.manual_seed(4)
g_embedding_layer = nn.Embedding(g_vocab_size, g_embedding_dim)

# Read the weight table once and index it with NumPy; this yields the same rows
# as calling the layer on every id separately, without the per-call overhead.
g_embedding_table = g_embedding_layer.weight.detach().numpy()
df_3f['genres_x'] = df_3f['genres_x'].apply(
    lambda names: g_embedding_table[[d_g_vs_index[name] for name in names]].mean(axis=0)
)

In [17]:
def _release_week(release_date):
    """Return the ISO week number (1-53) of a 'month/day/2-digit-year' date string.

    `%y` maps the two-digit years 00-68 to 2000-2068, so a release from e.g.
    1966 would be parsed as 2066 and could land in a different ISO week.
    A parsed year that lies after the current year is therefore moved back one
    century before the week is computed.
    """
    parsed = datetime.strptime(release_date, '%m/%d/%y')
    if parsed.year > datetime.now().year:
        parsed = parsed.replace(year=parsed.year - 100)
    return parsed.isocalendar()[1]


# ISO calendar week of the release date, added as a new integer column.
df_3f['week_of_year'] = df_3f['release_date'].apply(_release_week)

In [18]:
# Print "<vocabulary size> <embedding dimension>" for writers, directors,
# producers, cast and genres, one pair per line.
# Values recorded from an earlier run: 2410 12 / 881 10 / 6027 13 / 71518 17 / 23 5
vocab_and_dim_pairs = (
    (w_vocab_size, w_embedding_dim),
    (d_vocab_size, d_embedding_dim),
    (p_vocab_size, p_embedding_dim),
    (c_vocab_size, c_embedding_dim),
    (g_vocab_size, g_embedding_dim),
)
for vocab_size, embedding_dim in vocab_and_dim_pairs:
    print(vocab_size, embedding_dim)

2410 12
881 10
6027 13
71518 17
23 5


In [19]:
# Preview the first two rows of df_3f after the encoding cells and the week_of_year column.
df_3f.head(2)

Unnamed: 0,budget_y,writers,directors,producers,cast_x,genres_x,metascore,imdb user rating,release_date,revenue,week_of_year
0,17000000,"[-0.6893514, 0.027065942, -0.5796865, -0.94012...","[0.13440827, 2.6390803, -0.73129773, -1.417367...","[0.0672495, 0.23757625, -0.33949712, -0.151524...","[-0.18605737, -0.035827875, -0.02928893, 0.014...","[-0.06979808, -1.3382453, -0.98028475, 0.21630...",26,6,10/1/98,30331165,40
1,0,"[-1.2058403, 0.3268854, 0.7174303, -0.6224141,...","[-2.0156517, 0.59724534, -1.0167195, -0.865958...","[-0.14680491, 0.14712627, -1.1016364, -0.57326...","[0.15328898, 0.045781136, -0.018145978, 0.0145...","[0.0021236688, -0.15965456, -1.4836801, -0.585...",40,6,1/15/99,0,2


In [20]:
# Spread each movie's averaged writer vector into scalar columns w1..wN.
# N comes from w_embedding_dim instead of a hand-typed list, so the column
# count follows the embedding size if the input data (and vocabulary) changes.
# Assumes every entry of df_3f['writers'] is a vector of length w_embedding_dim.
w_columns = [f'w{i}' for i in range(1, w_embedding_dim + 1)]
df_3f[w_columns] = pd.DataFrame(df_3f.writers.tolist(), index=df_3f.index)

In [21]:
# Spread each movie's averaged director vector into scalar columns d1..dN.
# N comes from d_embedding_dim instead of a hand-typed list, so the column
# count follows the embedding size if the input data (and vocabulary) changes.
# Assumes every entry of df_3f['directors'] is a vector of length d_embedding_dim.
d_columns = [f'd{i}' for i in range(1, d_embedding_dim + 1)]
df_3f[d_columns] = pd.DataFrame(df_3f.directors.tolist(), index=df_3f.index)

In [22]:
# Spread each movie's averaged producer vector into scalar columns p1..pN.
# N comes from p_embedding_dim instead of a hand-typed list, so the column
# count follows the embedding size if the input data (and vocabulary) changes.
# Assumes every entry of df_3f['producers'] is a vector of length p_embedding_dim.
p_columns = [f'p{i}' for i in range(1, p_embedding_dim + 1)]
df_3f[p_columns] = pd.DataFrame(df_3f.producers.tolist(), index=df_3f.index)

In [23]:
# Spread each movie's averaged cast vector into scalar columns c1..cN.
# N comes from c_embedding_dim instead of a hand-typed list, so the column
# count follows the embedding size if the input data (and vocabulary) changes.
# Assumes every entry of df_3f['cast_x'] is a vector of length c_embedding_dim.
c_columns = [f'c{i}' for i in range(1, c_embedding_dim + 1)]
df_3f[c_columns] = pd.DataFrame(df_3f.cast_x.tolist(), index=df_3f.index)

In [24]:
# Spread each movie's averaged genre vector into scalar columns g1..gN.
# N comes from g_embedding_dim instead of a hand-typed list, so the column
# count follows the embedding size if the input data (and vocabulary) changes.
# Assumes every entry of df_3f['genres_x'] is a vector of length g_embedding_dim.
g_columns = [f'g{i}' for i in range(1, g_embedding_dim + 1)]
df_3f[g_columns] = pd.DataFrame(df_3f.genres_x.tolist(), index=df_3f.index)

In [25]:
# Add copies of two columns under export-friendly names (no merge suffix, no
# spaces); the source columns are left in place.
column_aliases = {
    'budget': 'budget_y',
    'imdb_user_rating': 'imdb user rating',
}
for alias, source_column in column_aliases.items():
    df_3f[alias] = df_3f[source_column]

In [26]:
# Preview the first two rows of df_3f, now including the expanded embedding columns and the alias columns.
df_3f.head(2)

Unnamed: 0,budget_y,writers,directors,producers,cast_x,genres_x,metascore,imdb user rating,release_date,revenue,...,c15,c16,c17,g1,g2,g3,g4,g5,budget,imdb_user_rating
0,17000000,"[-0.6893514, 0.027065942, -0.5796865, -0.94012...","[0.13440827, 2.6390803, -0.73129773, -1.417367...","[0.0672495, 0.23757625, -0.33949712, -0.151524...","[-0.18605737, -0.035827875, -0.02928893, 0.014...","[-0.06979808, -1.3382453, -0.98028475, 0.21630...",26,6,10/1/98,30331165,...,-0.057971,-0.341308,-0.0071,-0.069798,-1.338245,-0.980285,0.216301,-0.305167,17000000,6
1,0,"[-1.2058403, 0.3268854, 0.7174303, -0.6224141,...","[-2.0156517, 0.59724534, -1.0167195, -0.865958...","[-0.14680491, 0.14712627, -1.1016364, -0.57326...","[0.15328898, 0.045781136, -0.018145978, 0.0145...","[0.0021236688, -0.15965456, -1.4836801, -0.585...",40,6,1/15/99,0,...,-0.116217,0.069483,-0.020705,0.002124,-0.159655,-1.48368,-0.585685,-0.025198,0,6


In [27]:
# Assemble the final feature table. Column order: budget, the expanded
# embedding columns (writers, directors, producers, cast, genres), the review
# scores, the release week, and `revenue` as the last column.
# The embedding column names are generated from the *_embedding_dim values
# rather than typed out, so the selection stays valid when the vocabulary
# sizes (and hence the embedding widths) change.
embedding_columns = [
    f'{prefix}{i}'
    for prefix, dim in (
        ('w', w_embedding_dim),
        ('d', d_embedding_dim),
        ('p', p_embedding_dim),
        ('c', c_embedding_dim),
        ('g', g_embedding_dim),
    )
    for i in range(1, dim + 1)
]
df_4a = df_3f[
    ['budget']
    + embedding_columns
    + ['metascore', 'imdb_user_rating', 'week_of_year', 'revenue']
]

In [28]:
# Preview the first two rows of the final feature table before it is written to disk.
df_4a.head(2)

Unnamed: 0,budget,w1,w2,w3,w4,w5,w6,w7,w8,w9,...,c17,g1,g2,g3,g4,g5,metascore,imdb_user_rating,week_of_year,revenue
0,17000000,-0.689351,0.027066,-0.579687,-0.940121,0.517654,-0.58329,-0.661125,0.390009,0.454081,...,-0.0071,-0.069798,-1.338245,-0.980285,0.216301,-0.305167,26,6,40,30331165
1,0,-1.20584,0.326885,0.71743,-0.622414,0.204414,0.10508,0.822529,-0.340266,0.436778,...,-0.020705,0.002124,-0.159655,-1.48368,-0.585685,-0.025198,40,6,2,0


In [29]:
# Write the final feature table next to the input files, without the index column.
processed_csv_path = dir_1 + 'movie_processed_1.csv'
df_4a.to_csv(processed_csv_path, index=False)