In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date
from sklearn.decomposition import PCA
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# "Sequential" model lets us to define a stack of neural network layers
from keras.models import Sequential
# import the "core" layers:
from keras.layers import Dense, Dropout, Activation, Flatten
# CNN
from keras.layers import Convolution2D, MaxPooling2D
# import some utilities to transform our data
from keras.utils import np_utils
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

Using TensorFlow backend.


## Importing both TESTING AND TRAINING IMDB DATA

In [2]:
pd.set_option('display.float_format', '{:,.2f}'.format)

IMDB_MOVIE_TEST_DATA = pd.read_csv('test.csv')
IMDB_MOVIE_TRAIN_DATA = pd.read_csv('train.csv')

## DISPLAYING TESTING DATA

In [3]:
IMDB_MOVIE_TEST_DATA.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,3001,"[{'id': 34055, 'name': 'Pokémon Collection', '...",0,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",http://www.pokemon.com/us/movies/movie-pokemon...,tt1226251,ja,ディアルガVSパルキアVSダークライ,Ash and friends (this time accompanied by newc...,3.85,...,"[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",7/14/07,90.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Somewhere Between Time & Space... A Legend Is ...,Pokémon: The Rise of Darkrai,"[{'id': 11451, 'name': 'pok√©mon'}, {'id': 115...","[{'cast_id': 3, 'character': 'Tonio', 'credit_...","[{'credit_id': '52fe44e7c3a368484e03d683', 'de..."
1,3002,,88000,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",,tt0051380,en,Attack of the 50 Foot Woman,When an abused wife grows to giant size becaus...,3.56,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5/19/58,65.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A titanic beauty spreads a macabre wave of hor...,Attack of the 50 Foot Woman,"[{'id': 9748, 'name': 'revenge'}, {'id': 9951,...","[{'cast_id': 2, 'character': 'Nancy Fowler Arc...","[{'credit_id': '55807805c3a3685b1300060b', 'de..."
2,3003,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0118556,en,Addicted to Love,Good-natured astronomer Sam is devastated when...,8.09,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5/23/97,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Comedy About Lost Loves And Last Laughs,Addicted to Love,"[{'id': 931, 'name': 'jealousy'}, {'id': 9673,...","[{'cast_id': 11, 'character': 'Maggie', 'credi...","[{'credit_id': '52fe4330c3a36847f8041367', 'de..."
3,3004,,6800000,"[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...",http://www.sonyclassics.com/incendies/,tt1255953,fr,Incendies,A mother's last wishes send twins Jeanne and S...,8.6,...,"[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",9/4/10,130.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,The search began at the opening of their mothe...,Incendies,"[{'id': 378, 'name': 'prison'}, {'id': 539, 'n...","[{'cast_id': 6, 'character': 'Nawal', 'credit_...","[{'credit_id': '56478092c3a36826140043af', 'de..."
4,3005,,2000000,"[{'id': 36, 'name': 'History'}, {'id': 99, 'na...",,tt0418753,en,Inside Deep Throat,"In 1972, a seemingly typical shoestring budget...",3.22,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2/11/05,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was filmed in 6 days for 25 thousand dollar...,Inside Deep Throat,"[{'id': 279, 'name': 'usa'}, {'id': 1228, 'nam...","[{'cast_id': 1, 'character': 'Narrator (voice)...","[{'credit_id': '52fe44ce9251416c75041967', 'de..."


## DISPLAYING TRAINING DATA

In [4]:
IMDB_MOVIE_TRAIN_DATA.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.58,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.25,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.3,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.17,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.15,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


##  Prepare Data

In [8]:
# This function will be called later to prepare our input data
def prepare_data(df):
    # a.) Use the `id` feature as the index column of the data frame
    df = df.set_index('id')

    # b.) Only use easy to process features
    #  Warning: huge information loss here, you should propably include more features in your production code.
    df = df[['budget', 'original_language' ,'popularity', 'runtime', 'status']]
    
    # c.) One-Hot-Encoding for all nominal data
    df = pd.get_dummies(df)
    
    # d.) The `runtime` feature is not filled in 2 of the rows. We replace those empty cells / NaN values with a 0.
    #  Warning: in production code, please use a better method to deal with missing cells like interpolation or additional `is_missing` feature columns.
    return df.fillna(0)


# 1.) Extract the target variable `revenue` and use the `id` column as index of that data frame
df_train_y = IMDB_MOVIE_TRAIN_DATA[['id','revenue']].set_index('id')

# 2.) Prepare the training and test data by using the function we defined above
df_train_x = prepare_data(IMDB_MOVIE_TRAIN_DATA)
df_test_x  = prepare_data(IMDB_MOVIE_TEST_DATA)

# 3.) Create columns in train/test dataframes if they only exist in one of them (can happen through one hot encoding / get_dummies)
#  Example: There are no status=`Post Production` entries in the training set, but there are some in the test set.
df_train_x, df_test_x = df_train_x.align(df_test_x, join='outer', axis=1, fill_value=0)

# 4.) Show the first rows of one of the prepared tables
df_train_x.head(2)

Unnamed: 0_level_0,budget,original_language_af,original_language_ar,original_language_bm,original_language_bn,original_language_ca,original_language_cn,original_language_cs,original_language_da,original_language_de,...,original_language_tr,original_language_ur,original_language_vi,original_language_xx,original_language_zh,popularity,runtime,status_Post Production,status_Released,status_Rumored
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,14000000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,6.58,93.0,0,1,0
2,40000000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,8.25,113.0,0,1,0


##  Predict Values (Linear Regression)

In [9]:
# 1.) Remove table meta data, column names etc. → Just use values for prediction.
train_x = df_train_x.values
train_y = df_train_y.values
test_x  = df_test_x.values

# 2.) Calculate the coefficients of the linear regression / 'Train'
reg = LinearRegression().fit(train_x, train_y)

# 3.) Apply the linear regression model on the prepared test data
test_y = reg.predict(test_x)

# 4.) Test accuracy on training and test set (in production you would propably use an additional validation set)
print('Accuracy on Training Dataset (should be close to 1):', reg.score(train_x, train_y))
print('Accuracy on Test Dataset (should be as high as possible): Check by submitting on kaggle')

Accuracy on Training Dataset (should be close to 1): 0.6151260563337928
Accuracy on Test Dataset (should be as high as possible): Check by submitting on kaggle


##  Convert Prediction to submittable CSV file

In [10]:
# 1.) Add the predicted values to the original test data
df_test = IMDB_MOVIE_TEST_DATA.assign(revenue=test_y)

# 2.) Extract a table of ids and their revenue predictions
df_test_y = df_test[['id','revenue']].set_index('id')

# 3.) save that table to a csv file. On Kaggle, the file will be visible in the 'output' tab if the kernel has been commited at least once.
df_test_y[['revenue']] = df_test_y[['revenue']].abs()
df_test_y.to_csv('submission_linreg.csv')

# 4.) output the head of our file her to check if it looks good :)
pd.read_csv('submission_linreg.csv').head()


Unnamed: 0,id,revenue
0,3001,2174278.26
1,3002,8976437.32
2,3003,7927845.64
3,3004,25903345.3
4,3005,1038570.77


## Decision Tree

In [22]:
pd.set_option('display.float_format', '{:,.2f}'.format)

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

movie = pd.concat([train, test], sort=False)

movie = movie.fillna(0)

In [23]:
def prepare_data(df):
    # a.) Use the `id` feature as the index column of the data frame
    df = df.set_index('id')

    # b.) Only use easy to process features
    #  Warning: huge information loss here, you should propably include more features in your production code.
    df = df[[
        'popularity',
        'runtime',
        'budget'
    ]]
    
    # c.) One-Hot-Encoding for all nominal data
    df = pd.get_dummies(df)
    
    # d.) The `runtime` feature is not filled in 2 of the rows. We replace those empty cells / NaN values with a 0.
    #  Warning: in production code, please use a better method to deal with missing cells like interpolation or additional `is_missing` feature columns.
    return df.fillna(0)

# 1.) Extract the target variable `revenue` and use the `id` column as index of that data frame
df_train_y = movie[['id','revenue']].set_index('id')

# 2.) Prepare the training and test data by using the function we defined above
df_train_x = prepare_data(movie)
df_test_x  = prepare_data(movie)

# 3.) Create columns in train/test dataframes if they only exist in one of them (can happen through one hot encoding / get_dummies)
#  Example: There are no status=`Post Production` entries in the training set, but there are some in the test set.
df_train_x, df_test_x = df_train_x.align(df_test_x, join='outer', axis=1, fill_value=0)

# 4.) Show the first rows of one of the prepared tables
df_train_x.head()

Unnamed: 0_level_0,popularity,runtime,budget
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6.58,93.0,14000000
2,8.25,113.0,40000000
3,64.3,105.0,3300000
4,3.17,122.0,1200000
5,1.15,118.0,0


In [24]:
# 1.) Remove table meta data, column names etc. → Just use values for prediction.
train_x = df_train_x.values
train_y = df_train_y.values
test_x = df_test_x.values

In [25]:
dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
test_y = dt.predict(test_x)

In [27]:
print('Accuracy on Training Dataset:', dt.score(train_x, train_y))

Accuracy on Training Dataset: 1.0


In [28]:
# 1.) Add the predicted values to the original test data
df_test = movie.assign(revenue=test_y)

# 2.) Extract a table of ids and their revenue predictions
df_test_y = df_test[['id','revenue']].set_index('id')

# 3.) save that table to a csv file. On Kaggle, the file will be visible in the 'output' tab if the kernel has been commited at least once.
df_test_y[['revenue']] = df_test_y[['revenue']].abs()
df_test_y.to_csv('submission_dt.csv')

# 4.) output the head of our file her to check if it looks good :)
dt_results = pd.read_csv('submission_dt.csv')

dt_results.head()

Unnamed: 0,id,revenue
0,1,12314651.0
1,2,95149435.0
2,3,13092000.0
3,4,16000000.0
4,5,3923970.0


## Random Forest

In [29]:
rf = RandomForestClassifier()
rf.fit(train_x, train_y)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
test_y = rf.predict(test_x)
print('Accuracy on Training Dataset:', rf.score(train_x, train_y))

Accuracy on Training Dataset: 0.9609353879426872


In [31]:
# 1.) Add the predicted values to the original test data
df_test = movie.assign(revenue=test_y)

# 2.) Extract a table of ids and their revenue predictions
df_test_y = df_test[['id','revenue']].set_index('id')

# 3.) save that table to a csv file. On Kaggle, the file will be visible in the 'output' tab if the kernel has been commited at least once.
df_test_y[['revenue']] = df_test_y[['revenue']].abs()
df_test_y.to_csv('submission_rf.csv')

# 4.) output the head of our file her to check if it looks good :)
rf_results = pd.read_csv('submission_rf.csv')

rf_results.head()

Unnamed: 0,id,revenue
0,1,12314651.0
1,2,95149435.0
2,3,13092000.0
3,4,16000000.0
4,5,3923970.0
