In [34]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
df_credits = pd.read_csv("tmdb_5000_credits.csv", sep = ",")
df_movies = pd.read_csv("tmdb_5000_movies.csv", sep = ",")


In [37]:
df_imdb = pd.read_csv("imdb_data.csv")

In [38]:
df_imdb.columns
used_col = ['budget', 'genres', 'id', 'popularity', 'revenue', 'runtime', 'title', 'num_critic_for_reviews', 'duration'
            , 'actor_1_facebook_likes', 'director_facebook_likes', 'actor_3_facebook_likes', 'gross', 'cast_total_facebook_likes'
            , 'facenumber_in_poster', 'num_user_for_reviews', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', "movie_facebook_likes"]

In [39]:
df_imdb_nn = df_imdb.loc[:, used_col]

In [40]:

df_imdb_nn.set_index("id", inplace = True)

In [41]:
from ast import literal_eval

df_imdb_nn.loc[:, 'genres'] = df_imdb_nn.loc[:, 'genres'].apply(literal_eval)


In [42]:
df_imdb_nn.loc[:, 'genres'] = df_imdb_nn.loc[:, 'genres'].apply(get_list)

In [43]:
df_imdb_nn.loc[:, "genres"] = df_imdb_nn.loc[:, 'genres'].apply(clean_data)

In [44]:
df_imdb_nn.loc[:, 'genres_str'] = df_imdb_nn.loc[:, 'genres'].apply(lambda x : " ".join(x))

In [45]:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(df_imdb_nn['genres_str'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [46]:
df_genres = pd.DataFrame(index = df_imdb_nn.index, columns=cv.get_feature_names())

In [47]:
df_genres_t = df_genres.T

In [48]:
for i in df_genres.index:
    df_genres_t.loc[:, i] = cv.transform([df_imdb_nn["genres_str"][i]]).toarray()[0]

In [49]:
df_genres = df_genres_t.T

In [50]:
df_imdb_nn.columns

Index(['budget', 'genres', 'popularity', 'revenue', 'runtime', 'title',
       'num_critic_for_reviews', 'duration', 'actor_1_facebook_likes',
       'director_facebook_likes', 'actor_3_facebook_likes', 'gross',
       'cast_total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes', 'genres_str'],
      dtype='object')

In [51]:
nn_feature = ['budget', 'popularity', 'revenue', 'num_critic_for_reviews', 'duration', 'actor_1_facebook_likes',
       'director_facebook_likes', 'actor_3_facebook_likes', 'gross', 'cast_total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'actor_2_facebook_likes', 'aspect_ratio', 'movie_facebook_likes']
X = df_imdb_nn[nn_feature]
X = X.merge(df_genres, left_index = True, right_index = True, how = "left")

y = df_imdb_nn['imdb_score']

In [52]:
score_dict = {}
count = 1
for i in np.arange(0.1, 10.1, step = 0.1):
    i = round(i, 1)
    score_dict[str(i)] = count
    count += 1

y = y.apply(lambda x : score_dict[str(x)])

In [53]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

X_std_scale = std_scaler.fit_transform(X)

In [54]:


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

X_std_train, X_std_test, y_std_train, y_std_test = train_test_split(X_std_scale, y, test_size = 0.2, random_state = 42)
X_std_train, X_std_val, y_std_train, y_std_val = train_test_split(X_std_train, y_std_train, test_size = 0.2, random_state = 42)

In [55]:
import keras
from keras.utils import np_utils
y_train = np_utils.to_categorical(y_train, 100)
y_val = np_utils.to_categorical(y_val, 100)
y_test = np_utils.to_categorical(y_test, 100)

y_std_train = np_utils.to_categorical(y_std_train, 100)
y_std_val = np_utils.to_categorical(y_std_val, 100)
y_std_test = np_utils.to_categorical(y_std_test, 100)

# Model with Sparse AE pre-training on Standarized Data

In [56]:
K = keras.backend
kl_divergence = keras.losses.kullback_leibler_divergence

class KLDivergenceRegularizer(keras.regularizers.Regularizer):
    def __init__(self, weight, target=0.1):
        self.weight = weight
        self.target = target
    def __call__(self, inputs):
        mean_activities = K.mean(inputs, axis=0)
        return self.weight * (
            kl_divergence(self.target, mean_activities) +
            kl_divergence(1. - self.target, 1. - mean_activities))

In [57]:
from tensorflow.keras.models import Model
from tensorflow import keras
import tensorflow as tf
tf.random.set_seed(42)
np.random.seed(42)

kld_reg = KLDivergenceRegularizer(weight=0.05, target=0.1)
sparse_l1_encoder = keras.models.Sequential([
    keras.layers.Dense(200, input_dim = 35, activation = "relu"),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(15, activation="relu", activity_regularizer=kld_reg)  # Alternatively, you could add
                                                  # activity_regularizer=keras.regularizers.l1(1e-3)
                                                  # to the previous layer.
])
sparse_l1_decoder = keras.models.Sequential([
    keras.layers.Dense(100, activation="relu", input_shape=[15]),
    keras.layers.Dense(200, activation="relu"),
    keras.layers.Dense(35, activation="relu")
])
sparse_l1_ae = keras.models.Sequential([sparse_l1_encoder, sparse_l1_decoder])
sparse_l1_ae.compile(loss="mse", optimizer="adam")

history = sparse_l1_ae.fit(X_std_train, X_std_train, batch_size = 32, epochs=64,
                           validation_data=[X_std_val, X_std_val])


Train on 2843 samples, validate on 711 samples
Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [58]:
len(y_std_val)

711

In [59]:
dnn_clf = keras.models.Sequential([
    keras.layers.Dense(10, input_dim=15, activation = "relu"),
    keras.layers.Dense(64, activation = "relu"),
    keras.layers.Dense(100, activation = "softmax")
])

ae_dnn_clf = keras.models.Sequential([sparse_l1_encoder, dnn_clf])
ae_dnn_clf.compile(loss = "categorical_crossentropy", optimizer = "Adam", metrics = ["accuracy"])

ae_dnn_clf.fit(X_std_train, y_std_train, batch_size = 32, epochs = 32,
               validation_data = [X_std_val, y_std_val])

Train on 2843 samples, validate on 711 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x115f5bd68>

In [27]:
y_pred = ae_dnn_clf.predict(X_std_test)
score = ae_dnn_clf.evaluate(X_std_test, y_std_test)

df_testing_result = pd.DataFrame(index = range(len(y_pred)), columns = ["Predict", "Actual"])
for i in range(len(y_pred)):
    df_testing_result.loc[i, "Predict"] = np.argmax(y_pred[i])
    df_testing_result.loc[i, "Actual"] = np.argmax(y_test[i])
    
df_testing_result.loc[:, "Error between +1.0 ~ -1.0"] = df_testing_result.apply(lambda x : "True" if abs(x[0] - x[1]) <= 10 else "false", axis = 1)
df_testing_result.loc[:, "Error between +1.5 ~ -1.5"] = df_testing_result.apply(lambda x : "True" if abs(x[0] - x[1]) <= 15 else "false", axis = 1)



In [33]:
score_interval = len(df_testing_result[df_testing_result['Error between +1.0 ~ -1.0'] == 'True']) / len(df_testing_result)
score_interval2 = len(df_testing_result[df_testing_result['Error between +1.5 ~ -1.5'] == 'True']) / len(df_testing_result)
print(f'accuracy : {score[1]}')
print(f'accuracy between +1.0 ~ -1.0 : {score_interval}')
print(f'accuracy between +1.5 ~ -1.5 : {score_interval2}')

accuracy : 0.04049493744969368
accuracy between +1.0 ~ -1.0 : 0.7919010123734533
accuracy between +1.5 ~ -1.5 : 0.906636670416198


# Model on Original Data

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras import losses

model = Sequential()
model.add(Dense(20, input_dim=35, activation = "relu"))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics = ["accuracy"])

In [30]:
model.fit(X_train, y_train, epochs=32, batch_size=100, validation_data=(X_val, y_val))

Train on 2843 samples, validate on 711 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x13a433e10>

# Model on Standarized Data

In [60]:
model_std = Sequential()
model_std.add(Dense(20, input_dim=35, activation = "relu"))
model_std.add(Dense(64, activation='relu'))
model_std.add(Dense(128, activation='relu'))
model_std.add(Dense(64, activation='relu'))
model_std.add(Dense(100, activation="softmax"))
model_std.compile(loss="categorical_crossentropy", optimizer='Adam', metrics = ["accuracy"])

In [61]:
model_std.fit(X_std_train, y_std_train, epochs=32, batch_size=100, validation_data=(X_std_val, y_std_val))

Train on 2843 samples, validate on 711 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x139a5dbe0>

In [62]:
y_pred_std = model_std.predict(X_std_test)
score_std = model_std.evaluate(X_std_test, y_std_test)

df_result_std = pd.DataFrame(index = range(len(y_pred)), columns = ["Predict", "Actual"])
for i in range(len(y_pred)):
    df_result_std.loc[i, "Predict"] = np.argmax(y_pred_std[i])
    df_result_std.loc[i, "Actual"] = np.argmax(y_test[i])
    
df_result_std.loc[:, "Error between +1.0 ~ -1.0"] = df_result_std.apply(lambda x : "True" if abs(x[0] - x[1]) <= 10 else "false", axis = 1)    
df_result_std.loc[:, "Error between +1.5 ~ -1.5"] = df_result_std.apply(lambda x : "True" if abs(x[0] - x[1]) <= 15 else "false", axis = 1)



In [63]:
score_interval_std = len(df_testing_result[df_testing_result['Error between +1.0 ~ -1.0'] == 'True']) / len(df_testing_result)
score_interval_std2 = len(df_testing_result[df_testing_result['Error between +1.5 ~ -1.5'] == 'True']) / len(df_testing_result)

print(f'accuracy : {score_std[1]}')
print(f'accuracy between +1.0 ~ -1.0 : {score_interval_std}')
print(f'accuracy between +1.5 ~ -1.5 : {score_interval_std2}')

accuracy : 0.05286839231848717
accuracy between +1.0 ~ -1.0 : 0.7919010123734533
accuracy between +1.5 ~ -1.5 : 0.906636670416198
