In [1]:
from pathlib import Path
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [61]:
# import csv
file_path = Path("archive/movies.csv")
df_movie = pd.read_csv(file_path)

In [62]:
#Split release data and create a monthly column

df_movie["released"] = df_movie["released"].str.split("\s+\(").str[0]

df_movie["released"] = pd.to_datetime(df_movie['released'])

df_movie ["released"] = df_movie["released"].dt.month_name()

In [63]:
# add column for character count of movie title

df_movie['title_char'] = df_movie['name'].str.len()

In [64]:
# look for sequel indicator and assign value of 1 or 0

df_movie['sequel'] = df_movie['name'].str.extract('(^\d*)')

df_movie['sequels']=df_movie['sequel'].isin([""]).astype(int)

df_movie = df_movie.drop(columns=['sequel'])

In [65]:
# added averaged score values for director, writer and star

df_movie['director_number'] = df_movie.groupby('director')['score'].transform('sum')/df_movie.groupby('director')['score'].transform('count')
df_movie['writer_number'] = df_movie.groupby('writer')['score'].transform('sum')/df_movie.groupby('writer')['score'].transform('count')
df_movie['star_number'] = df_movie.groupby('star')['score'].transform('sum')/df_movie.groupby('star')['score'].transform('count')

In [66]:
# assign 1 and 0 for success or failure
df_movie['trixbudget']= df_movie['gross'] - 3.0*df_movie['budget']
df_movie['success'] = df_movie['trixbudget'].map(lambda x: x>0).astype(int)

In [67]:
# drop unnecessary values

df_movie_clean = df_movie.drop(columns=['name', 'trixbudget', 'gross', 'score', 'votes'])

df_movie_clean = df_movie_clean[df_movie_clean.budget > 15000000]

df_movie_clean = df_movie_clean[df_movie_clean.year > 1985]

df_movie_clean = df_movie_clean.dropna()

In [68]:
# assign values as other for counts under certain values

director_other = df_movie_clean['director'].value_counts() < 15
df_movie_clean.loc[df_movie_clean['director'].isin(director_other.index[director_other]), 'director'] = 'others'

writer_other = df_movie_clean['writer'].value_counts() < 10
df_movie_clean.loc[df_movie_clean['writer'].isin(writer_other.index[writer_other]), 'writer'] = 'others'

star_other = df_movie_clean['star'].value_counts() < 11
df_movie_clean.loc[df_movie_clean['star'].isin(star_other.index[star_other]), 'star'] = 'others'

country_other = df_movie_clean['country'].value_counts() < 25
df_movie_clean.loc[df_movie_clean['country'].isin(country_other.index[country_other]), 'country'] = 'others'

company_other = df_movie_clean['company'].value_counts() < 100
df_movie_clean.loc[df_movie_clean['company'].isin(company_other.index[company_other]), 'company'] = 'others'

rating_other = df_movie_clean['rating'].value_counts() < 100
df_movie_clean.loc[df_movie_clean['rating'].isin(rating_other.index[rating_other]), 'rating'] = 'others'

genre_other = df_movie_clean['genre'].value_counts() < 20
df_movie_clean.loc[df_movie_clean['genre'].isin(genre_other.index[genre_other]), 'rating'] = 'others'

In [69]:
#cpi adjustments:
df_movie_clean['budget'] = df_movie_clean['budget']/df_movie_clean['cpi']
df_movie_clean['ticket'] = df_movie_clean['ticket']/df_movie_clean['cpi']

In [70]:
# create dummy for non-integer variables
df_movie_dummies = pd.get_dummies(df_movie_clean)

In [71]:
#split x and y values and create train test split
y = df_movie_dummies["success"]

X = df_movie_dummies.drop(columns=['success'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [72]:
# scale x variables

scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)

In [84]:
# create model

nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=20, activation="sigmoid", input_dim=133))
nn_model.add(tf.keras.layers.Dense(units=8, activation="sigmoid"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn_model.fit(X_train_scaled, y_train, epochs=11)


model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
26/26 - 0s - loss: 0.5468 - accuracy: 0.7500 - 156ms/epoch - 6ms/step
Loss: 0.54683518409729, Accuracy: 0.75


In [85]:
df_movie_clean

Unnamed: 0,rating,genre,year,released,director,writer,star,country,budget,company,runtime,cpi,ticket,title_char,sequels,director_number,writer_number,star_number,success
844,R,Action,1986,July,others,others,others,United Kingdom,4.368297e+07,Twentieth Century Fox,137.0,0.423506,8.382409,6,1,7.871429,7.800,6.618182,1
848,PG,Adventure,1986,June,others,others,others,United Kingdom,5.903105e+07,others,101.0,0.423506,8.382409,9,1,7.266667,7.400,7.350000,0
849,R,Action,1986,March,others,others,others,United Kingdom,3.777987e+07,others,116.0,0.423506,8.382409,10,1,5.900000,5.600,5.690909,0
850,PG-13,Action,1986,July,others,others,Kurt Russell,United States,5.903105e+07,Twentieth Century Fox,99.0,0.423506,8.382409,27,1,6.621429,6.750,6.710000,0
855,PG,Action,1986,August,others,others,others,United States,8.736595e+07,Universal Pictures,110.0,0.423506,8.382409,15,1,4.250000,4.350,4.850000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7648,R,Action,2020,January,others,others,Will Smith,United States,9.000000e+07,Columbia Pictures,124.0,1.000000,9.160000,17,1,6.600000,6.825,6.708696,1
7649,PG,Action,2020,February,others,others,others,United States,8.500000e+07,Paramount Pictures,99.0,1.000000,9.160000,18,1,6.500000,5.550,6.500000,1
7650,PG,Adventure,2020,January,others,others,Robert Downey Jr.,United States,1.750000e+08,Universal Pictures,101.0,1.000000,9.160000,8,1,6.025000,5.800,7.004762,0
7651,PG,Adventure,2020,February,others,others,Harrison Ford,Canada,1.350000e+08,others,100.0,1.000000,9.160000,20,1,6.800000,6.650,6.803846,0
