In [664]:
from pathlib import Path
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import os
import psycopg2
from flask import Flask, render_template
import pandas as pd

#import sqlkey
from config import sql_pass

In [665]:
conn = psycopg2.connect(database="flask_df", user="flask_df_user", password=sql_pass, host="dpg-cgtiqjl269vbmeuj26cg-a.oregon-postgres.render.com", port=5432)

# Open a cursor to perform database operations
cur = conn.cursor()

In [638]:
# create query and create dataframe from query
cur.execute('SELECT * FROM movie_successs;')
books = cur.fetchall()
col_name = ['name', 'rating', 'genre', 'year', 'released', 'score', 'votes', 'director', 'writer', 'star', 'country', 'budget', 'gross', 'company', 'runtime', 'cpi', 'ticket']
df_movie = pd.DataFrame(books, columns = col_name)

In [639]:
#Split release data and create a monthly column

df_movie["released"] = df_movie["released"].str.split("\s+\(").str[0]

df_movie["released"] = pd.to_datetime(df_movie['released'])

df_movie ["released"] = df_movie["released"].dt.month_name()

In [640]:
#assign datatypes

df_movie = df_movie[pd.to_numeric(df_movie['score'], errors='coerce').notnull()]
df_movie = df_movie[pd.to_numeric(df_movie['budget'], errors='coerce').notnull()]
df_movie = df_movie[pd.to_numeric(df_movie['gross'], errors='coerce').notnull()]
df_movie = df_movie[pd.to_numeric(df_movie['votes'], errors='coerce').notnull()]
df_movie['cpi'] = df_movie['cpi'].astype('float')
df_movie['budget'] = df_movie['budget'].astype('float')
df_movie['ticket'] = df_movie['ticket'].astype('float')
df_movie['ticket'] = df_movie['ticket'].astype('int')
df_movie['score'] = df_movie['score'].astype('float')

df_movie.dtypes


name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
cpi         float64
ticket        int32
dtype: object

In [641]:
# add column for character count of movie title

df_movie['title_char'] = df_movie['name'].str.len()

In [642]:
# look for sequel indicator and assign value of 1 or 0

df_movie['sequel'] = df_movie['name'].str.extract('(^\d*)')

df_movie['sequels']=df_movie['sequel'].isin([""]).astype(int)

df_movie = df_movie.drop(columns=['sequel'])

In [643]:
# added averaged score values for director, writer and star

df_movie['director_number'] = df_movie.groupby('director')['score'].transform('sum')/df_movie.groupby('director')['score'].transform('count')
df_movie['writer_number'] = df_movie.groupby('writer')['score'].transform('sum')/df_movie.groupby('writer')['score'].transform('count')
df_movie['star_number'] = df_movie.groupby('star')['score'].transform('sum')/df_movie.groupby('star')['score'].transform('count')

df_movie['director_number'] = df_movie['director_number'].astype('float')
df_movie['writer_number'] = df_movie['writer_number'].astype('float')
df_movie['star_number'] = df_movie['star_number'].astype('float')

In [644]:
# assign 1 and 0 for success or failure
df_movie['trixbudget']= df_movie['gross'] - 3.0*df_movie['budget']
df_movie['success'] = df_movie['trixbudget'].map(lambda x: x>0).astype(int)

In [645]:
# drop unnecessary values

df_movie_clean = df_movie.drop(columns=['name', 'trixbudget', 'gross', 'votes'])

df_movie_clean = df_movie_clean[df_movie_clean.budget > 15000000]

df_movie_clean = df_movie_clean[df_movie_clean.year > 1985]

df_movie_clean = df_movie_clean.dropna()

In [646]:
# assign values as other for counts under certain values

director_other = df_movie_clean['director'].value_counts() < 11 #15
df_movie_clean.loc[df_movie_clean['director'].isin(director_other.index[director_other]), 'director'] = 'others'

writer_other = df_movie_clean['writer'].value_counts() < 8
df_movie_clean.loc[df_movie_clean['writer'].isin(writer_other.index[writer_other]), 'writer'] = 'others'

star_other = df_movie_clean['star'].value_counts() < 12
df_movie_clean.loc[df_movie_clean['star'].isin(star_other.index[star_other]), 'star'] = 'others'

country_other = df_movie_clean['country'].value_counts() < 25
df_movie_clean.loc[df_movie_clean['country'].isin(country_other.index[country_other]), 'country'] = 'others'

company_other = df_movie_clean['company'].value_counts() < 100
df_movie_clean.loc[df_movie_clean['company'].isin(company_other.index[company_other]), 'company'] = 'others'

rating_other = df_movie_clean['rating'].value_counts() < 100
df_movie_clean.loc[df_movie_clean['rating'].isin(rating_other.index[rating_other]), 'rating'] = 'others'

genre_other = df_movie_clean['genre'].value_counts() < 20
df_movie_clean.loc[df_movie_clean['genre'].isin(genre_other.index[genre_other]), 'rating'] = 'others'

In [649]:
#cpi adjustments:
df_movie_clean['budget'] = df_movie_clean['budget']/df_movie_clean['cpi']
df_movie_clean['ticket'] = df_movie_clean['ticket']/df_movie_clean['cpi']

In [650]:
df_movie_dummies = pd.get_dummies(df_movie_clean)

In [651]:
# create dummy for non-integer variables
df_movie_dummies = pd.get_dummies(df_movie_clean)

In [652]:
#split x and y values and create train test split
y = df_movie_dummies["success"]

X = df_movie_dummies.drop(columns=['success'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [653]:
# scale x variables

scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)

In [659]:
# create model

nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=20, activation="sigmoid", input_dim=152))
nn_model.add(tf.keras.layers.Dense(units=4, activation="sigmoid"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn_model.fit(X_train_scaled, y_train, epochs=13)


model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13
26/26 - 0s - loss: 0.5231 - accuracy: 0.7559 - 167ms/epoch - 6ms/step
Loss: 0.5230792164802551, Accuracy: 0.7559153437614441
