In [0]:
import numpy as np
import pandas as pd
from datetime import datetime

In [0]:
import pandas as pd
import json
import operator

def is_json(myjson):
    """Return True if ``myjson`` can be parsed by ``json.loads``, else False.

    Parameters
    ----------
    myjson : object
        Candidate value. Anything ``json.loads`` rejects, whether because of
        its type (``None``, floats such as NaN, ...) or because the text is
        malformed, yields ``False``.

    Returns
    -------
    bool
    """
    try:
        json.loads(myjson)
    # TypeError: not str/bytes/bytearray; ValueError: malformed JSON text
    # (json.JSONDecodeError and UnicodeDecodeError are both subclasses);
    # RecursionError: pathologically deep nesting.
    # A bare ``except`` would also hide KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, RecursionError):
        return False
    return True

# Helper function: unpacks a JSON column into one-hot indicator columns so the data is easier to work with
def _extract_feature_ids(cell, json_id_column):
    """Return the ``json_id_column`` values found in one JSON cell.

    Cells that cannot be parsed, that do not hold a JSON list, and list
    entries that are not objects carrying ``json_id_column`` contribute
    nothing.
    """
    try:
        parsed = json.loads(cell)
    except (TypeError, ValueError, RecursionError):
        # some data is just not json (NaN, free text, ...). ignore
        return []
    if not isinstance(parsed, list):
        return []
    return [
        entry[json_id_column]
        for entry in parsed
        if isinstance(entry, dict) and json_id_column in entry
    ]


def encode_json_column(pandas_data_frame, json_column_index=0, json_id_column="id", encodinglimit = 1000, remove_non_encoded = 1):
    """One-hot encode a column holding JSON lists of objects.

    Every cell of the column at position ``json_column_index`` is expected to
    contain text such as ``[{"id": 18, "name": "Drama"}, ...]``. The most
    frequent identifiers become 0/1 indicator columns and the JSON column
    itself is removed.

    Parameters
    ----------
    pandas_data_frame : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    json_column_index : int
        Position (not label) of the JSON column.
    json_id_column : str
        Key read from every JSON object to identify the feature; its value is
        used as the label of the indicator column.
    encodinglimit : int
        Maximum number of distinct identifiers to encode. Identifiers are
        ranked by number of occurrences; ties keep the order in which the
        identifiers were first seen.
    remove_non_encoded : int
        Accepted for interface compatibility; it has no effect. The earlier
        implementation tested
        ``has_an_encoded_value == 0 & remove_non_encoded == 1``, which Python
        parses as the chained comparison
        ``has_an_encoded_value == (0 & remove_non_encoded) == 1`` and is
        therefore never true, and it also discarded the result of
        ``DataFrame.drop``, so no row was ever removed. That effective
        behaviour is kept on purpose: starting to drop rows silently would
        break positional alignment with target arrays that callers keep
        separately.
        TODO(review): implement real row filtering together with
        re-aligning those targets.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the same rows and index, without the JSON
        column, plus one integer 0/1 column per encoded identifier. New
        columns are appended in the order their identifiers first appear in
        the data. An identifier equal to an already existing column label
        overwrites that column in place.
    """
    # Remember the label now: the column is dropped by label at the very end.
    json_column_label = pandas_data_frame.columns[json_column_index]

    # Parse every cell exactly once; both passes below reuse the result.
    ids_per_row = [
        _extract_feature_ids(cell, json_id_column)
        for cell in pandas_data_frame.iloc[:, json_column_index]
    ]

    # First pass: count how often each identifier occurs (first sighting
    # counts as 1). Dict insertion order records first-seen order.
    occurrences = {}
    for row_ids in ids_per_row:
        for featureid in row_ids:
            occurrences[featureid] = occurrences.get(featureid, 0) + 1

    # Keep only the `encodinglimit` most frequent identifiers. sorted() is
    # stable, so equally frequent identifiers stay in first-seen order.
    ranked = sorted(occurrences.items(), key=operator.itemgetter(1), reverse=True)
    if encodinglimit < len(ranked):
        ranked = ranked[:encodinglimit]
    selected = {featureid for featureid, _ in ranked}

    # Second pass: build each indicator column as a plain list, addressed by
    # row position so the result does not depend on the frame's index labels.
    indicator_columns = {
        featureid: [0] * len(ids_per_row)
        for featureid in occurrences
        if featureid in selected
    }
    for position, row_ids in enumerate(ids_per_row):
        for featureid in row_ids:
            if featureid in indicator_columns:
                indicator_columns[featureid][position] = 1

    encoded = pandas_data_frame.copy()
    existing_labels = set(encoded.columns)
    appended = {}
    for featureid, values in indicator_columns.items():
        if featureid in existing_labels:
            # Same label as an existing column: overwrite it where it stands.
            encoded[featureid] = values
        else:
            appended[featureid] = values
    if appended:
        # One concat instead of hundreds of single-column inserts.
        encoded = pd.concat(
            [encoded, pd.DataFrame(appended, index=encoded.index)], axis=1
        )

    # drop the original json column (keyword form; pandas 2.x no longer
    # accepts the axis as a positional argument)
    return encoded.drop(columns=[json_column_label])

In [10]:
# Colab-only step: prompt for the two TMDB CSV files. `uploaded` is indexed by
# file name in the loading cell, where each value is wrapped in io.BytesIO.
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_credits.csv to tmdb_5000_credits.csv
Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv


In [0]:
import io

# Read both uploaded CSVs from their in-memory bytes.
movies_buffer = io.BytesIO(uploaded['tmdb_5000_movies.csv'])
credits_buffer = io.BytesIO(uploaded["tmdb_5000_credits.csv"])
dataset = pd.read_csv(movies_buffer)
dataset_credits = pd.read_csv(credits_buffer)

# Put the credits columns to the right of the movie columns. Rows are paired
# by index label, not by a movie key.
# NOTE(review): this presumes both files list the movies in the same order - confirm.
dataset = pd.concat([dataset, dataset_credits], axis=1)


In [22]:
# Show column names, non-null counts and dtypes of the combined frame; the
# positional indices used in the preprocessing cell refer to this ordering.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
budget                  4803 non-null int64
genres                  4803 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4803 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null float64
vote_count              4803 non-null 

In [0]:
# Replacing 0 budgets with the column mean - there are a lot of them, so this
# is better than removing the rows.
# NOTE(review): the mean is computed with the zero rows included, which pulls
# the imputed value down - confirm whether the mean of the non-zero budgets
# was intended.
dataset['budget'] = dataset['budget'].replace(0, dataset['budget'].mean())

X = dataset.iloc[:, :].values
y_revenue = dataset.iloc[:, 12].values  # 'revenue'
y_rating = dataset.iloc[:, 18].values   # 'vote_average'

# Picking independent variables. Positions inside X afterwards:
#   0 budget, 1 genres, 2 keywords, 3 production_companies, 4 release_date,
#   5 runtime, 6 spoken_languages, 7 status,
#   8 and 9: source columns 22 and 23 (cast and crew, going by the encoding
#   steps at the end of this cell).
X = X[:, [0, 1, 4, 9, 11, 13, 14, 15, 22, 23]]

# Removing zero REVENUES from the data - revenue is super important
# I could (and have) adjusted for inflation, but it made scant difference to model performance
# Boolean indexing returns copies, so the in-place edits below never touch
# the original frame.
has_revenue = y_revenue != 0
y_revenue = y_revenue[has_revenue]
y_rating = y_rating[has_revenue]
X = X[has_revenue]

# Adjusting revenue for inflation up to 2019 at an average yearly rate.
# do this only if using revenue (12 y index)
# NOTE(review): the comment here used to say 3.22%, but the factor 1.01322
# is 1.322% per year - confirm which of the two is intended.
avg_inflation = 1.01322
year_now = 2019
for row_idx in range(len(y_revenue)):
    try:
        film_year = int(X[row_idx, 4][0:4])
    except (TypeError, ValueError):
        # Missing (NaN) or malformed release date: leave the revenue
        # unadjusted and mark the date as unknown. Column 7 keeps its
        # original 'status' value for such rows.
        X[row_idx, 4] = 0
    else:
        # y_revenue is an integer array, so the adjusted value is truncated
        # to a whole number on assignment.
        y_revenue[row_idx] = y_revenue[row_idx] * (avg_inflation ** (year_now - film_year))
        # Column 7 ('status') is reused to hold the release year.
        X[row_idx, 7] = film_year

# converting film date to day of year
# the year itself is not part of this value (it was stored in column 7 above);
# using it as a feature opens up a whole new can of worms about ratings and
# revenues by year
for row_idx in range(len(y_revenue)):
    try:
        release_date = datetime.strptime(X[row_idx, 4], '%Y-%m-%d')
    except (TypeError, ValueError):
        # Already zeroed above, or not in YYYY-MM-DD form.
        X[row_idx, 4] = 0
    else:
        X[row_idx, 4] = release_date.timetuple().tm_yday

dataset = pd.DataFrame(X)

# Every encode_json_column call removes the JSON column it encoded, so all
# remaining columns to its right shift one position to the left.
# NOTE(review): indicator columns are named after the feature itself, so a
# name that occurs in more than one field (e.g. the same person in both cast
# and crew) ends up in one column, and the later call overwrites the earlier
# values.

# encoding genres.
# using name because "id" overlaps with "id" in the next encoding, and so on
dataset = encode_json_column(dataset, 1, "name")

# encoding keywords
# limiting to the 100 most frequent codes
# yes, it is column 1 now, since last column 1 was removed by previous encoding
dataset = encode_json_column(dataset, 1, "name", 100, 1)  # was 100

# encoding production companies.
# limiting to the 100 most frequent codes
dataset = encode_json_column(dataset, 1, "name", 100, 1)  # was 100

# encoding all spoken languages
dataset = encode_json_column(dataset, 3, "iso_639_1")

# encoding cast
# encoding 'just' top 500 cast
dataset = encode_json_column(dataset, 4, "name", 500, 1)  # was 500

# encoding crew
# encoding 'just' top 500 crew
dataset = encode_json_column(dataset, 4, "name", 500, 1)  # was 500

# X and the two targets are saved separately and matched by row position, so
# the encoding steps must not have changed the number of rows.
assert len(dataset) == len(y_revenue) == len(y_rating), "X and y are no longer aligned"

In [0]:
# Checkpoint: write the encoded features and both targets to CSV and hand each
# file to the browser, so the regressors can start from these files.
from google.colab import files

dataset_y_revenue = pd.DataFrame(y_revenue)
dataset_y_rating = pd.DataFrame(y_rating)

for checkpoint_frame, checkpoint_name in (
    (dataset, 'Encoded_X.csv'),
    (dataset_y_revenue, 'Encoded_y - revenue.csv'),
    (dataset_y_rating, 'Encoded_y - rating.csv'),
):
    checkpoint_frame.to_csv(checkpoint_name)
    files.download(checkpoint_name)