# TMDB Movie Data Exploration
The goal of the project is to derive insights on the TMDB movie datset and perform regression models to predict revenue of the movie. This model could be leveraged by production companies for making go/no-go screening decisions.

TMDB Movie Dataset available on Kaggle. Link: https://www.kaggle.com/tmdb/tmdb-movie-metadata

In [2]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
%matplotlib inline
plt.style.use('ggplot')
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mayura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import json
import ast
from urllib.request import urlopen
from PIL import Image
import time

In [None]:

import os
def text_to_dict(df):
    for column in dict_columns: 
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

## 1 Load TMDB datset

In [None]:
def load_tmdb_movies(path):
    df = pd.read_csv(path)
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

def load_tmdb_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [None]:
LOST_COLUMNS = ['actor_1_facebook_likes','actor_2_facebook_likes','actor_3_facebook_likes','aspect_ratio',
    'cast_total_facebook_likes','color','content_rating','director_facebook_likes', 'facenumber_in_poster',
    'movie_facebook_likes','movie_imdb_link','num_critic_for_reviews','num_user_for_reviews']

TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {'budget': 'budget','genres': 'genres','revenue': 'gross','title': 'movie_title',
    'runtime': 'duration','original_language': 'language', 'keywords': 'plot_keywords','vote_count': 'num_voted_users'}

IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}

## 1.1 Extracting Strings from columns - nested JSON
The Movie and credit data contain columns of nested JSON which need to be split into separate columns for accurate analysis.

In [None]:
def safe_access(container, index_values):
    # return missing value rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan

def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])

def pipe_flatten_names(keywords):
    return '|'.join([x['name'] for x in keywords])

def convert_to_original_format(movies, credits):
    tmdb_movies = movies.copy()
    tmdb_movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES, inplace=True)
    #tmdb_movies['year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['company'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director'] = credits['crew'].apply(get_director)
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [4]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math, nltk, warnings
from nltk.corpus import wordnet
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors

from wordcloud import WordCloud, STOPWORDS
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings('ignore')
PS = nltk.stem.PorterStemmer()


In [None]:

#credits = load_tmdb_credits("tmdb_5000_credits.csv")
movies = load_tmdb_movies("movies_metadata.csv")
df_movies = convert_to_original_format(movies, credits)
print('Shape:',df_movies.shape)

tab_info=pd.DataFrame(df_movies.dtypes).T.rename(index={0:'column type'})
tab_info=tab_info.append(pd.DataFrame(df_movies.isnull().sum()).T.rename(index={0:'null values'}))
tab_info=tab_info.append(pd.DataFrame(df_movies.isnull().sum()/df_movies.shape[0]*100).T.
                         rename(index={0:'null values (%)'}))
tab_info

## 1.2 Exploratory Analysis

In [None]:
df_movies.head(5)

###  1.2.1 Missing Values 

In [None]:
missing_df = df_movies.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['filling_factor'] = (df_movies.shape[0] 
                                - missing_df['missing_count']) / df_movies.shape[0] * 100
missing_df.sort_values('filling_factor').reset_index(drop = True)

In [None]:
def word(df, ref_col, liste):
    keyword_count = dict()
    for s in liste: keyword_count[s] = 0
    for liste_keywords in df[ref_col].str.split('|'):        
        if type(liste_keywords) == float and pd.isnull(liste_keywords): continue        
        for s in [s for s in liste_keywords if s in liste]: 
            if pd.notnull(s): keyword_count[s] += 1
                
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

### 1.2.2 Genre Extraction and Analysis per year

In [None]:
g_labels = set()
for s in df_movies['genres'].str.split('|').values:
    g_labels = g_labels.union(set(s))

In [None]:
occurences, dum = word(df_movies, 'genres', g_labels)
occurences[:5]

In [None]:
occurences = [x for x in occurences if x[0]]
occurences

### Keywords

In [5]:
df_movies = pd.read_csv("movies_metadata.csv")

In [6]:
df_movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
set_keywords = set()
for liste_keywords in df_movies['plot_keywords'].str.split('|').values:
    if isinstance(liste_keywords, float): continue  # only happen if liste_keywords = NaN
    set_keywords = set_keywords.union(liste_keywords)

set_keywords.remove('')

KeyError: 'plot_keywords'

In [None]:
keyword_occurences, dum = word(df_movies, 'plot_keywords', set_keywords)
keyword_occurences[:5]

In [None]:

def random_color(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    h = int(200.0 * tone / 100.0)
    s = int(200.0 * 255.0 / 100.0)
    l = int(200.0 * float(random_state.randint(20, 70)) / 100.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)

fig = plt.figure(1, figsize=(18,13))
ax1 = fig.add_subplot(2,1,1)

words = dict()
trunc_occurences = keyword_occurences[0:20]
for s in trunc_occurences:
    words[s[0]] = s[1]
tone = 55.0 # define the color of the words

wordcloud = WordCloud(width=1000,height=300, background_color='black', 
                      max_words=1628,relative_scaling=1,
                      color_func = random_color,
                      normalize_plurals=False)
wordcloud.generate_from_frequencies(words)
ax1.imshow(wordcloud, interpolation="bilinear")
ax1.axis('off')


In [None]:
df_movies['genres']

In [None]:
def parse(col):

    # pandas series to list
    entries = list(col)
    
    # handling "|" separator and removing duplicates
    collect = [] #this will contain all the unique genres
    
    for entry in entries:
        for _ in entry.split("|"):
            if _ not in collect:
                collect.append(_)
    return(collect)

In [None]:
parse(df_movies['genres'])

In [None]:
year = df_movies['year'].unique()

In [None]:
n_df = {key:[] for key in parse(df_movies['genres'])}


In [None]:
n_df['year'] = year


In [None]:
for y in year: # to get genre count for each year
    
    #subsetting for corresponding year
    y_df_movies = df_movies[df_movies['year'] == y]
    
    # converting pandas series to column
    genres_i1 = list(y_df_movies['genres'])  
    genres_f1 = [] # this will contain all the genres that we see for a given year(with repetition and "|" separator)
    
    for genre in genres_i1: # for splitting every entry in y_df[genres] with separator as "|" 
        for i in genre.split("|"):
            genres_f1.append(i)

    n_list = Counter(genres_f1) # occurrence of each genre in a year
    for genre in parse(df_movies['genres']): #this will create occurrence of each genre in a year
        if genre not in genres_f1:
            n_df[genre].append(0)
        else:
            n_df[genre].append(n_list[genre])

In [None]:
n_df = pd.DataFrame(n_df, index = n_df['year'])

In [None]:
n_df.columns
del(n_df['year'])

In [None]:

n_df.head()

What this dataframe shows?

Let's see first row. If there are n number of movies released in 1960, then entries corresponding to 2016 shows how many times
each genre appeared in 2016.
Let's see this numerically:

In [None]:
total_occ = n_df.loc[2016].sum()
total_occ

In [None]:
action_occ = n_df.loc[2016,'Action']
action_occ

In [None]:
df1 = {key:[] for key in parse(df_movies['genres'])}


In [None]:
df1['year'] = year


In [None]:
for index,row in n_df.iterrows():
    for genre in list(n_df.columns):
        df1[genre].append((100*row[genre]/sum(row)))

In [None]:
df1 = pd.DataFrame(df1)
df1.head(5)

In [None]:
cols = df1.columns.tolist()
cols = cols[-1:] + cols[:-1]
df1 = df1[cols]
df1.columns

### Top five Genre 

In [None]:
top = []
for _ in df1.columns[1:]:
    top.append(np.mean(df1[_]))
    top.sort(reverse = True)
top[:5]

In [None]:
import seaborn as sns
g = ['Drama','Comedy','Action']
for genre in g:
    sns.jointplot(x=df1['year'], y= df1[genre]);
    #plot = sns.regplot(y = new_df1[genre], x = new_df1['year'], lowess = True);
#plot.set_ylabel("");
#plot.axvline(x = 2016, color = 'black', alpha = 1.0);


## 1.2.3  Comparing Revenue with vote_count, popularity and runtime

GGPlot using rpy2 for better visual analysis.
Here the plot analysis shows that the highly popular movies with high revenue has duration of 1hr40 mins

In [None]:
import pandas as pd
import rpy2
from plotnine import *
# the base of rpy2 plotting is matplotlib, thus we need to declare
# it inline in order to see the plots in the notebook
%matplotlib inline
# we need to activate the automatic conversion for pandas

In [None]:

ggplot(aes(x='duration', y='gross', color='num_voted_users'), data=df_movies) +\
    geom_point() +\
    theme_bw() +\
    xlab("runtime") +\
    ylab("revenue") +\
    ggtitle("Revenue vs runtime")


In [None]:

ggplot(aes(x='popularity', y='gross', color='num_voted_users'), data=df_movies) +\
    geom_point() +\
    theme_bw() +\
    xlab("popularity") +\
    ylab("gross") +\
    ggtitle("Revenue vs popularity")

The plot analysis shows that the top movies that have high revenue have high popularity between the frequency of 500-750 and votes from 5000-10000 

In [None]:
ggplot(aes(x='budget', y='gross', color='num_voted_users'), data=df_movies) +\
    geom_point() +\
    theme_bw() +\
    xlab("budget") +\
    ylab("gross") +\
    ggtitle("Revenue vs Budget")

The plot analysis shows that the budget directly affects the revenue.
If the movies is released with high budget the revenue is accordingly high

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
#ax = sns.boxplot(x="open_gross", y="genre", data=df0, palette=sns.light_palette((210, 90, 60), input="husl"))
#ax = sns.boxplot(x="open_gross", y="genre", data=df0, palette="GnBu_d")
ax = sns.boxplot(x="status", y="gross", data=df_movies, palette="icefire")
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)
plt.show()

## 1.2.4 Caluclating movies with high profit and low profit

 the budget of high amd low profit movie is nearly 2 million.

In [None]:
df_movies['Profit'] = df_movies['gross'] - df_movies['budget']
X=df_movies['Profit']

In [None]:
def find_minmax(x):
    #use the function 'idmin' to find the index of lowest profit movie.
    min_index = df_movies[x].idxmin()
    #use the function 'idmax' to find the index of Highest profit movie.
    high_index = df_movies[x].idxmax()
    high = pd.DataFrame(df_movies.loc[high_index,:])
    low = pd.DataFrame(df_movies.loc[min_index,:])
    
    #print the movie with high and low profit
    print("Movie Which Has Highest "+ x + " : ",df_movies['original_title'][high_index])
    print("Movie Which Has Lowest "+ x + "  : ",df_movies['original_title'][min_index])
    return pd.concat([high,low],axis = 1)

#call the find_minmax function.
find_minmax('Profit')

### Top 20 Profitable Movies according to the year of release

In [None]:
#make a plot which contain top 10 movies which earn highest profit.
#sort the 'Profit' column in decending order and store it in the new dataframe,
info = pd.DataFrame(df_movies['Profit'].sort_values(ascending = False))
info['original_title'] = df_movies['original_title']
data = list(map(str,(info['original_title'])))
x = list(data[:20])
y = list(info['Profit'][:20])

#make a plot usinf pointplot for top 10 profitable movies.
ax = sns.barplot(x=y,y=x)

#setup the figure size
sns.set(rc={'figure.figsize':(10,5)})
#setup the title and labels of the plot.
ax.set_title("Top 20 Profitable Movies",fontsize = 15)
ax.set_xlabel("Profit",fontsize = 13)
sns.set_style("darkgrid")

In [None]:
ax = sns.regplot(x=df_movies['Profit'], y=df_movies['budget'],color='b')

#setup the title and the labels of the plot.
ax.set_title("Profit Vs Budget",fontsize=13)
ax.set_xlabel("Profit",fontsize=12)
ax.set_ylabel("Budget",fontsize=12)

#setup the figure size and style sheet of the plot.
sns.set(rc={'figure.figsize':(20,7)})
sns.set_style("whitegrid")


### 1.2.2.1 Average Movie Rating

In [None]:
ggplot(aes(x='vote_average'), data=df_movies) + \
    geom_histogram()

The Histogram plot shows that the average TMDBscore is between 5-7.5 and the overall average is 6.5.

## 1.2.5 Calculating movies with high and low budget

In [None]:
df_movies['budget'] = df_movies['budget'].replace(0,np.NAN)
find_minmax('budget')

In [None]:
info = pd.DataFrame(df_movies['budget'].sort_values(ascending = False))
info['original_title'] = df_movies['original_title']
data = list(map(str,(info['original_title'])))

#extract the top 10 budget movies data from the list and dataframe.
x = list(data[:20])
y = list(info['budget'][:20])

#plot the figure and setup the title and labels.
ax = sns.barplot(x=y,y=x)
sns.set(rc={'figure.figsize':(10,5)})
ax.set_title("Top 20 High Budget Movies",fontsize = 15)
ax.set_xlabel("Budget",fontsize = 13)
sns.set_style("darkgrid")

##  1.2.6 Analysis of how runtime affect popularity

In [None]:
info = pd.DataFrame(df_movies['duration'].sort_values(ascending = False))
info['popularity'] = df_movies['popularity']
data = list(map(int,(info['popularity'])))

#extract the top 10 movies with high revenue data from the list and dataframe.
x = list(data[:20])
y = list(info['duration'][:20])

#make the point plot and setup the title and labels.
ax = sns.pointplot(x=y,y=x)
sns.set(rc={'figure.figsize':(25,5)})
ax.set_title("Top 20 Popular Movies",fontsize = 15)
ax.set_xlabel("Runtime",fontsize = 13)
#sns.set_style("ticks")
sns.set_style("darkgrid")

The plot shows that popularity of movies that have duration of about 3 hours is high and as  the duration increases the popularity decreases

Thus a movie's popluarity also depends on the duration. 

## 1.2.7  Analysis of movie release per year

In [None]:
df_movies['release_date'] = pd.to_datetime(df_movies['release_date'], infer_datetime_format=True)
df_movies['release_day'] = df_movies['release_date'].apply(lambda t: t.day)
df_movies['release_weekday'] = df_movies['release_date'].apply(lambda t: t.weekday())
df_movies['release_month'] = df_movies['release_date'].apply(lambda t: t.month)

# Year was being interpreted as future dates in some cases so I had to adjust some values
df_movies['release_year'] = df_movies['release_date'].apply(lambda t: t.year if t.year < 2018 else t.year -100)

In [None]:
df_movies[['release_date','release_day','release_weekday','release_month','release_year']].head()

In [None]:
fig = sns.countplot(df_movies.release_year)
fig.set(ylabel='number of movies')
for index, label in enumerate(fig.xaxis.get_ticklabels()):
    if index % 8 != 0:
        label.set_visible(False)

The plot shows that our dataset contains movies released during the year between 2008-2016 has high number of release.

In [None]:
month_release = df_movies['release_date'].dt.month

#count the movies in each month using value_counts().
number_of_release = month_release.value_counts().sort_index()
months=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
number_of_release = pd.DataFrame(number_of_release)
number_of_release['month'] = months

#change the column name of the new dataframe 'number_of_release'
number_of_release.rename(columns = {'release_date':'number_of_release'},inplace=True)

#plot the bar graph using plot.
number_of_release.plot(x='month',kind='bar',fontsize = 11,figsize=(8,6))

#set the labels and titles of the plot.
plt.title('Months vs Number Of Movie Releases',fontsize = 15)
plt.xlabel('Month',fontsize = 13)
plt.ylabel('Number of movie releases',fontsize = 13)
sns.set_style("darkgrid")

Accordingly the number of movies released during the end of the year is high between 2008-2016.

### 1.2.7 ggplot analysis over popularity vs language

In [None]:
ggplot(df_movies, aes(x='popularity', fill='country')) + geom_histogram()

The plot explaines that Most of the movies released in English are highly popular 

## 2 Budget vs Revenue Analysis

In [None]:
df_movies['log_gross'] = np.log(df_movies['gross'])

In [None]:
median = df_movies["budget"].median()
df_movies["budget"].fillna(median, inplace=True) 

In [None]:
sns.distplot(df_movies['budget'])

In [None]:
df_movies['log_budget'] = np.log(df_movies['budget'])

In [None]:
ggplot(aes(x = 'budget', y = 'gross'), data = df_movies) +\
  geom_point(alpha = 0.9, position =  "jitter") +\
  geom_smooth(method = 'lm', color = 'red') +\
  ylab('gross') +\
  xlab('budget') +\
    ggtitle('Budget Vs Revenue')

In [None]:
ggplot(aes(x = 'log_gross', y = 'vote_average'), data = df_movies) +\
  geom_point(alpha = 0.9, position =  "jitter") +\
  geom_smooth(method = 'lm', color = 'red') +\
  ylab('tmdb score') +\
  xlab('gross') +\
   ggtitle("Revenue vs TMDB_score")

## 2.1 Desity Plot analysis on Budget 

In [None]:
ggplot(df_movies, aes(x='budget', color='status')) + \
    geom_density()


In [None]:
ggplot(aes(x = 'log_budget', y = 'vote_average'), data = df_movies) +\
  geom_point(alpha = 0.9, position =  "jitter") +\
  geom_smooth(method = 'lm', color = 'red') +\
  ylab('TMDB_score') +\
  xlab('budget') +\
   ggtitle('TMDB score vs Budget')


## 3. Correlation Analysis

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
corrmat = df_movies.dropna(axis=0, how='any').corr()

ax = sns.heatmap(corrmat, annot=True, fmt='.2f', annot_kws={'size': 10}, center=1)
plt.show()

A barplot on Movies released in cuntry vs their popularity. 

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
df_movies.info()

## Linear Model

In [None]:
data = pd.read_csv("tmdb_5000_movies.csv")
dict_columns = [ 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'keywords']
data = text_to_dict(data)

data['list_of_genres'] = list(data['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
data = data.drop(['genres'], axis=1)
#Production_companies
data['production_companies_names'] = list(data['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
data = data.drop(['production_companies'], axis=1)
#production_countries
data['production_countries_names'] = list(data['production_countries'].apply(lambda x: [i['iso_3166_1'] for i in x] if x != {} else []).values)
data = data.drop(['production_countries'], axis=1)
#spoken_languages
data['spoken_languages_codes'] = list(data['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
data = data.drop(['spoken_languages'], axis=1)
#Keywords
data['Keyword_names'] = (data['keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
data = data.drop(['keywords'], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#original_title
le.fit(list(data['original_title'].fillna('')))
data['original_title'] = le.transform(data['original_title'].fillna('').astype(str))
#production_companies_names
le.fit(list(data['production_companies_names'].fillna('').astype(str)))
data['production_companies_names'] = le.transform(data['production_companies_names'].fillna('').astype(str))
#production_countries_names
le.fit(list(data['production_countries_names'].fillna('').astype(str)))
data['production_countries_names'] = le.transform(data['production_countries_names'].fillna('').astype(str))
#spoken_languages_codes
le.fit(list(data['spoken_languages_codes'].fillna('').astype(str)))
data['spoken_languages_codes'] = le.transform(data['spoken_languages_codes'].fillna('').astype(str))
#Keyword_names
le.fit(list(data['Keyword_names'].fillna('').astype(str)))
data['Keyword_names'] = le.transform(data['Keyword_names'].fillna('').astype(str))
#Keyword_names
le.fit(list(data['list_of_genres'].fillna('').astype(str)))
data['list_of_genres'] = le.transform(data['list_of_genres'].fillna('').astype(str))

In [None]:
data = data[['budget','popularity','runtime','revenue','list_of_genres','production_countries_names','status']]

In [None]:
data.isna().sum()

In [None]:
median = data["runtime"].median()
data["runtime"].fillna(median, inplace=True) 

In [None]:
data=data.dropna()
data

In [None]:
one_hot = pd.get_dummies(data['status'])
data = data.drop('status', axis=1)
data = data.join(one_hot)

In [None]:
from sklearn.model_selection import train_test_split
training_set, validation_set = train_test_split(data, test_size = 0.2, random_state = 21)

#classifying the predictors and target variables as X and Y
X_train = training_set.iloc[:,[1,2,3,4,5,6,7,8]].values
Y_train = training_set.iloc[:,0].values
Y_train=Y_train.astype('int')
X_val = validation_set.iloc[:,[1,2,3,4,5,6,7,8]].values
Y_val = validation_set.iloc[:,0].values
Y_val=Y_val.astype('int')

In [None]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
X_tr_scale=scaler.fit_transform(X_train)
X_te_scale=scaler.fit_transform(X_val)

X_tr_scale[:,:5]

In [None]:
from sklearn import preprocessing
from sklearn import pipeline

pipeline_tr = pipeline.Pipeline([ ('scaler',StandardScaler())])

pipe=pipeline_tr.fit_transform(data)
pipe

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [None]:
def rmse(a, b):
    return np.sqrt(np.mean((a-b)**2))
pd.options.display.max_columns = None
np.random.seed(0)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

linear=LinearRegression()
linear.fit(X_tr_scale,Y_train)

In [None]:
y_pred=linear.predict(X_te_scale)

In [None]:
print(rmse(Y_val,y_pred))

In [None]:
print(linear.score(X_te_scale,Y_val))

In [None]:
predicteddf = pd.DataFrame({'Actual': Y_val.flatten(), 'Predicted': y_pred.flatten()})
predicteddf.sample(5)

In [None]:
plotresult = predicteddf.head(25)
plotresult.plot(kind='bar',figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_val, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_val, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_val, y_pred)))

In [None]:
fig, ax = plt.subplots()
ax.scatter(Y_val, y_pred)
ax.plot([Y_val.min(), Y_val.max()], [Y_val.min(),Y_val.max()], lw=1)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

### RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
param_grid = [{'n_estimators': [4, 5, 10, 20, 50]}]

rf = RandomForestRegressor()
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search_rf.fit(X_tr_scale, Y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
grid_search_rf.best_score_

In [None]:
y_pred_t=grid_search_rf.predict(X_te_scale)

In [None]:
print(rmse(Y_val,y_pred_t))

### LGBTM Regressor

In [None]:
from lightgbm import LGBMRegressor

In [None]:
def rmsle(y_true, y_pred):
    return 'rmsle', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

In [None]:
lr = LGBMRegressor(boosting_type='dart',num_leaves=20,max_depth=-1,min_data_in_leaf=20, learning_rate=0.2,n_estimators=500,subsample_for_bin=200000,
                   class_weight=None,min_split_gain=0.0,min_child_weight=0.001,subsample=0.1,subsample_freq=0,colsample_bytree=0.75,reg_alpha=0.0,reg_lambda=0.0,
                   random_state=101,n_jobs=-1)
lr.fit(X_tr_scale, Y_train,eval_set=[(X_te_scale, Y_val)],eval_metric=rmsle,verbose=False)
y_pred = lr.predict(X_te_scale, num_iteration=lr.best_iteration_)

In [None]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(Y_val, y_pred))
print('MSE:', metrics.mean_squared_error(Y_val, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(Y_val, y_pred)))
print('RMSLE:', rmsle(Y_val, y_pred))

In [None]:
fig, ax = plt.subplots()
ax.scatter(Y_val, y_pred)
ax.plot([Y_val.min(), Y_val.max()], [Y_val.min(), Y_val.max()], lw=1)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
print(lr.score(X_te_scale,Y_val))

### 4. Classification Model 

Data Cleaning and Processing

### DecisionTree

In [None]:
data = df_movies[['budget','gross','country','num_voted_users']]


In [None]:
one_hot = pd.get_dummies(data['country'])
data = data.drop('country', axis=1)
data = data.join(one_hot)

In [None]:
data.dtypes

In [None]:
data['gross']

Revenue is classified into 4 classes based on their distribution

In [None]:
def generate_label(df):
    conditions = [
        (df['gross'] <= 1000000),
        (df['gross'] > 1000000) & (df['gross'] <= 25000000),
        (df['gross'] > 25000000) & (df['gross'] <= 100000000),
        (df['gross'] > 100000000) & (df['gross'] <= 300000000)]
    choices=[0,1,2,3]
    df['label']=np.select(conditions, choices, default=4)
generate_label(data)

In [None]:
bin_classes = [0]*5
for label in data['label']:
    bin_classes[int(label)]+=1
print(bin_classes)
plt.bar(np.arange(5), bin_classes)
plt.xticks(np.arange(5), np.arange(1,10))
plt.xlabel('Class')
plt.ylabel('Number of movies')
plt.show()

In [None]:
data.isna().sum()

In [None]:
data=data.dropna()

In [None]:
data = data.drop(columns=['gross'])


In [None]:
data.to_csv("new_data.csv", index=False)

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("new_data.csv")
all_data = data.values
#all_data = np.delete(all_data, [0,3], axis=1)

### 4.1 PreProcessing

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
# normalize budget, release year, runtime, grosses
normalized_values = scaler.fit_transform(all_data[:,1:7])
# stack them together
all_data = np.hstack((all_data[:,:1], normalized_values, all_data[:,7:]))

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(all_data[:, :-1], all_data[:,-1], 
            test_size=0.2, shuffle=True, random_state=418)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [None]:
set(y_train)

In [None]:
bin_classes = [0]*5
for label in y_test:
    bin_classes[int(label)]+=1
print(bin_classes)
plt.bar(np.arange(5), bin_classes)
plt.xticks(np.arange(5), np.arange(1,10))
plt.xlabel('Class')
plt.ylabel('Number of movies')
plt.show()

### 4.2 DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV


clf = DecisionTreeClassifier(criterion='entropy',max_depth=3)
clf = clf.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

y_pred_t = clf.predict(x_test)
accuracy_score(y_test, y_pred_t)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test, y_pred_t))
print(classification_report(y_test, y_pred_t))

## 4.3 KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

numNeighbors = [ 5, 10, 20, 100]
testAcc = []
trainAcc = []

for k in numNeighbors:
    clf = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    clf.fit(x_train, y_train)
    knn_pred = clf.predict(x_test)
    knn_pred_train = clf.predict(x_train)
    #print(knn_pred)
    testAcc.append(accuracy_score(y_test, knn_pred))
    trainAcc.append(accuracy_score(y_train,knn_pred_train))
    print(confusion_matrix(y_test, knn_pred))
    print(classification_report(y_test, knn_pred))

plt.plot(numNeighbors, testAcc,'bv--',numNeighbors, trainAcc, 'ro--')
plt.legend(['Test Accuracy','Train Accuacy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

### 4.4 RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimators = [10, 20, 50]
testAcc = []
trainAcc = []

for k in estimators:
    clf = RandomForestClassifier(n_estimators=k)
    clf.fit(x_train, y_train)
    rand_pred = clf.predict(x_test)
    rand_pred_train = clf.predict(x_train)
    #print(rand_pred)
    testAcc.append(accuracy_score(y_test, rand_pred))
    trainAcc.append(accuracy_score(y_train,rand_pred_train))
    print(confusion_matrix(y_test, rand_pred))
    print(classification_report(y_test, rand_pred))

plt.plot(estimators, testAcc,'bv--',estimators, trainAcc, 'ro--')
plt.legend(['Test Accuracy','Train Accuacy'])
plt.xlabel('Estimators')
plt.ylabel('Accuracy')

### 4.5 MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

C = [5, 20, 32, 50, 100, 500]

LRtestAcc = []
LRtrainAcc = []

for param in C:
    classifier = MLPClassifier(hidden_layer_sizes=param, max_iter=50,activation = 'relu',solver='adam',random_state=1)
    classifier.fit(x_train,y_train)
    log_reg_pred_train = classifier.predict(x_train)
    log_reg_pred = classifier.predict(x_test)
    #print(log_reg_pred)
    LRtestAcc.append(accuracy_score(y_test, log_reg_pred))
    LRtrainAcc.append(accuracy_score(y_train,log_reg_pred_train))
    print(confusion_matrix(y_test, log_reg_pred))
    print(classification_report(y_test, log_reg_pred))

    

plt.plot(C, LRtestAcc,'bv--',C,LRtrainAcc,'ro--')
plt.legend(['Test Accuracy','Train Accuracy'])
plt.xlabel('C')
plt.xscale('log')
plt.ylabel('Accuracy')  

## 4.6 SVM

In [None]:
from sklearn.svm import SVC
C = [5,10,20]
 

SVMLtestAcc = []
SVMLtrainAcc = []



for param in C:
    clf = SVC(C=param,kernel='rbf',gamma='auto')
    clf.fit(x_train,y_train)
    svml_pred = clf.predict(x_test)
    svml_pred_train = clf.predict(x_train)
    #print(svml_pred)
    SVMLtestAcc.append(accuracy_score(y_test, svml_pred))
    SVMLtrainAcc.append(accuracy_score(y_train,svml_pred_train))
    print(confusion_matrix(y_test, svml_pred))
    print(classification_report(y_test, svml_pred))
plt.plot(C, SVMLtestAcc,'ro--', C,SVMLtrainAcc,'bv--')
plt.legend(['Test Accuracy','Train Accuracy'])
plt.xlabel('C')
plt.xscale('log')
plt.ylabel('Accuracy')  