In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json

In [2]:
# Load the blockbuster-movies CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='PMA_blockbuster_movies.csv')
df.head(n=5)

Unnamed: 0,poster_url,rt_audience_score,rt_freshness,2015_inflation,adjusted,genres,Genre_1,Genre_2,Genre_3,imdb_rating,length,rating,release_date,studio,title,worldwide_gross,year
0,http://resizing.flixster.com/gxRJwetP1eNIrPR6x...,4.3,89.0,-0.26%,"$712,903,691.09",Sci-Fi\nAdventure\nAction,Sci-Fi,Adventure,Action,7.8,136.0,PG-13,04-Apr-14,Marvel Studios,Captain America: The Winter Soldier,"$714,766,572.00",2014.0
1,http://resizing.flixster.com/gDtbA1iPxTYEjBZeS...,4.2,90.0,-0.26%,"$706,988,165.89",Sci-Fi\nDrama\nAction,Sci-Fi,Drama,Action,7.7,130.0,PG-13,11-Jul-14,20th Century Fox,Dawn of the Planet of the Apes,"$708,835,589.00",2014.0
2,http://resizing.flixster.com/YrF_OeTQx3bXNsMLI...,4.4,91.0,-0.26%,"$772,158,880.00",Sci-Fi\nAdventure\nAction,Sci-Fi,Adventure,Action,8.1,121.0,PG-13,01-Aug-14,Marvel Studios,Guardians of the Galaxy,"$774,176,600.00",2014.0
3,http://resizing.flixster.com/l9yjA-72sZMYECeOj...,4.2,72.0,-0.26%,"$671,220,455.10",Sci-Fi\nAdventure,Sci-Fi,Adventure,,8.7,169.0,PG-13,07-Nov-14,Paramount Pictures / Warner Bros.,Interstellar,"$672,974,414.00",2014.0
4,http://resizing.flixster.com/YukULOFULUesVZccN...,3.8,49.0,-0.26%,"$756,677,675.77",Family\nAdventure\nAction,Family,Adventure,Action,7.1,97.0,PG,30-May-14,Walt Disney Pictures,Maleficent,"$758,654,942.00",2014.0


In [3]:
# Drop the features that are not used as predictors.
#
# - 'poster_url': a link to the poster image; not expected to influence the
#   prediction.
# - 'genres', 'Genre_1', 'Genre_2', 'Genre_3': potentially worthwhile features,
#   but left out for now. One-hot encoding them would add many columns, and the
#   same genre can appear in any of the three Genre_* columns, so the resulting
#   dummies would be repetitive.
# - 'release_date', 'year': judged not useful for this prediction.
# - 'title': of little predictive value.
# - 'worldwide_gross': it is itself a target variable and is very similar to
#   the 'adjusted' column, so it must not be part of the X variables.
# - 'studio': also dropped.
#   NOTE(review): the original notes give no reason for dropping 'studio' --
#   confirm this is intentional.
df = df.drop(columns=[
    'studio',
    'poster_url',
    'genres', 'Genre_1', 'Genre_2', 'Genre_3',
    'release_date',
    'title',
    'worldwide_gross',
    'year',
])

In [4]:
# Preview the first five rows of the columns that remain after the drop.
df.head(n=5)

Unnamed: 0,rt_audience_score,rt_freshness,2015_inflation,adjusted,imdb_rating,length,rating
0,4.3,89.0,-0.26%,"$712,903,691.09",7.8,136.0,PG-13
1,4.2,90.0,-0.26%,"$706,988,165.89",7.7,130.0,PG-13
2,4.4,91.0,-0.26%,"$772,158,880.00",8.1,121.0,PG-13
3,4.2,72.0,-0.26%,"$671,220,455.10",8.7,169.0,PG-13
4,3.8,49.0,-0.26%,"$756,677,675.77",7.1,97.0,PG


In [5]:
# One-hot encode the categorical 'rating' column: it is replaced by one
# indicator column per rating value, each named with the 'rating_type' prefix.
df = pd.get_dummies(data=df, prefix=["rating_type"], columns=["rating"])

In [6]:
# Preview the first five rows with the new rating_type_* indicator columns.
df.head(n=5)

Unnamed: 0,rt_audience_score,rt_freshness,2015_inflation,adjusted,imdb_rating,length,rating_type_G,rating_type_PG,rating_type_PG-13,rating_type_R
0,4.3,89.0,-0.26%,"$712,903,691.09",7.8,136.0,0,0,1,0
1,4.2,90.0,-0.26%,"$706,988,165.89",7.7,130.0,0,0,1,0
2,4.4,91.0,-0.26%,"$772,158,880.00",8.1,121.0,0,0,1,0
3,4.2,72.0,-0.26%,"$671,220,455.10",8.7,169.0,0,0,1,0
4,3.8,49.0,-0.26%,"$756,677,675.77",7.1,97.0,0,1,0,0


In [8]:
# Convert '2015_inflation' from text such as "-0.26%" to a float:
# cut off the last character (the '%' sign), then cast the remainder.
inflation_text = df['2015_inflation'].str.slice(stop=-1)
df['2015_inflation'] = inflation_text.astype(float)


In [9]:
# Preview the first five rows; '2015_inflation' should now be numeric.
df.head(n=5)

Unnamed: 0,rt_audience_score,rt_freshness,2015_inflation,adjusted,imdb_rating,length,rating_type_G,rating_type_PG,rating_type_PG-13,rating_type_R
0,4.3,89.0,-0.26,"$712,903,691.09",7.8,136.0,0,0,1,0
1,4.2,90.0,-0.26,"$706,988,165.89",7.7,130.0,0,0,1,0
2,4.4,91.0,-0.26,"$772,158,880.00",8.1,121.0,0,0,1,0
3,4.2,72.0,-0.26,"$671,220,455.10",8.7,169.0,0,0,1,0
4,3.8,49.0,-0.26,"$756,677,675.77",7.1,97.0,0,1,0,0


In [10]:
# Convert the currency strings in 'adjusted' (e.g. "$712,903,691.09") to floats.
#
# The thousands separators are removed with a plain, non-regex replacement.
# The previous pattern '\,' was an invalid string escape and only worked while
# str.replace treated the pattern as a regular expression; with literal
# matching (the default in newer pandas) it would look for a backslash followed
# by a comma, leave the commas in place, and make the float cast fail.
df['adjusted'] = (
    df['adjusted']
    .str.lstrip('$')                     # drop the leading currency symbol
    .str.replace(',', '', regex=False)   # drop the thousands separators
    .astype(float)                       # missing values stay NaN
)

In [11]:
# Preview the first five rows; 'adjusted' should now be a float column.
df.head(n=5)

Unnamed: 0,rt_audience_score,rt_freshness,2015_inflation,adjusted,imdb_rating,length,rating_type_G,rating_type_PG,rating_type_PG-13,rating_type_R
0,4.3,89.0,-0.26,712903700.0,7.8,136.0,0,0,1,0
1,4.2,90.0,-0.26,706988200.0,7.7,130.0,0,0,1,0
2,4.4,91.0,-0.26,772158900.0,8.1,121.0,0,0,1,0
3,4.2,72.0,-0.26,671220500.0,8.7,169.0,0,0,1,0
4,3.8,49.0,-0.26,756677700.0,7.1,97.0,0,1,0,0


In [12]:
# Min-max normalisation of the 'adjusted' column: rescale it linearly so that
# its smallest value maps to 0 and its largest value maps to 1.
adjusted_min = df['adjusted'].min()
adjusted_span = df['adjusted'].max() - adjusted_min
df['adjusted'] = (df['adjusted'] - adjusted_min) / adjusted_span


In [13]:
# Preview the first five rows; 'adjusted' should now lie between 0 and 1.
df.head(n=5)

Unnamed: 0,rt_audience_score,rt_freshness,2015_inflation,adjusted,imdb_rating,length,rating_type_G,rating_type_PG,rating_type_PG-13,rating_type_R
0,4.3,89.0,-0.26,0.206771,7.8,136.0,0,0,1,0
1,4.2,90.0,-0.26,0.204743,7.7,130.0,0,0,1,0
2,4.4,91.0,-0.26,0.227095,8.1,121.0,0,0,1,0
3,4.2,72.0,-0.26,0.192475,8.7,169.0,0,0,1,0
4,3.8,49.0,-0.26,0.221785,7.1,97.0,0,1,0,0


In [14]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [15]:
from sklearn.model_selection import train_test_split

# Features are every column except 'adjusted'; the target is 'adjusted'.
# 20% of the rows are held out for testing, with a fixed seed so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns='adjusted'),
    df['adjusted'],
    test_size=0.2,
    random_state=5,
)

# Print the shape of each split as a sanity check.
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(318, 9)
(80, 9)
(318,)
(80,)


In [16]:
# Baseline: fit a decision-tree regressor with default hyperparameters.

from sklearn.tree import DecisionTreeRegressor as DTC

# Default hyperparameters, but with a fixed seed (the same value used for the
# train/test split) so the fitted tree and its scores are reproducible.
dtc = DTC(random_state=5)

# fit the model on the training data
dtc_fit = dtc.fit(X_train, Y_train)

# For a regressor, .score() returns the coefficient of determination (R^2),
# not an accuracy, so the printed label says R^2.
train_score = dtc.score(X_train, Y_train)

# print the R^2 on the training data
print("Train R^2 score = " + str(round(train_score, 4)))


#################################################################
#####                                                       #####
#####  TESTING PART - ONLY RUN WHEN THE MODEL IS TUNED!!    #####
#####                                                       #####
#################################################################
# NOTE(review): this section lives in the same cell as the untuned baseline,
# so it runs before any tuning despite the banner above.

# predict the test data
predicted = dtc.predict(X_test)

# R^2 of the predictions on the held-out test data
test_score = dtc.score(X_test, Y_test)

# print the R^2 on the test data
print("Test R^2 score = " + str(round(test_score, 4)))

Accuracy score = 1.0
Accuracy score = 0.3109


In [17]:
# Tune the decision-tree hyperparameters with a 5-fold cross-validated grid
# search on the training set, once for each scoring metric.

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor as DTC

# NOTE(review): 'mse' and 'mae' are the older scikit-learn criterion names.
# They were renamed to 'squared_error' and 'absolute_error' in scikit-learn 1.0
# and the old names were removed in 1.2 -- update them when upgrading.
tuned_parameters = [{'criterion': ['mse', 'friedman_mse','mae'],
                     'max_depth': [3, 5, 7],
                     'min_samples_split': [3, 5, 7],
                     'max_features': ["sqrt", "log2", None]}]

scores = ['r2', 'neg_mean_squared_error']

for score in scores:
    print("# Tuning hyperparameters for %s" % score)
    print("\n")
    # Seed the estimator: with max_features='sqrt'/'log2' each split considers
    # a random subset of features, so an unseeded tree makes the selected
    # "best" parameters change from run to run.
    clf = GridSearchCV(DTC(random_state=5), tuned_parameters, cv=5,
                       scoring=score)
    clf.fit(X_train, Y_train)
    print("Best parameters set found on the training set:")
    print(clf.best_params_)
    print("\n")

# Tuning hyperparameters for r2






Best parameters set found on the training set:
{'criterion': 'mae', 'max_depth': 7, 'max_features': 'log2', 'min_samples_split': 5}


# Tuning hyperparameters for neg_mean_squared_error


Best parameters set found on the training set:
{'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 7}




In [28]:

from sklearn.tree import DecisionTreeRegressor as DTC

# Decision-tree regressor configured with explicit hyperparameters
# and a fixed seed.
dtc = DTC(
    criterion='mae',
    max_depth=7,
    max_features='log2',
    min_samples_split=5,
    random_state=5,
)

# Train on the training split; fit() hands back the estimator.
dtc_fit = dtc.fit(X_train, Y_train)

# Score the fitted model on the data it was trained on.
train_score = dtc.score(X_train, Y_train)

# Report the training score rounded to four decimals.
train_score_rounded = round(train_score, 4)
print("score = " + str(train_score_rounded))

score = 0.6394


In [29]:
# Run the fitted model on the held-out test features.
predicted = dtc.predict(X_test)

# Score the model on the held-out test split.
test_score = dtc.score(X_test, Y_test)

# Report the test score rounded to four decimals.
test_score_rounded = round(test_score, 4)
print("score = " + str(test_score_rounded))

score = 0.3375
