In [None]:
import io, time, json
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import json
import sklearn
from tensorflow.keras.utils import to_categorical
import warnings; warnings.simplefilter('ignore')
%matplotlib inline



In [None]:
# Grabbing the nominees
def retrieve_html(url, timeout=30):
    """
    Download an Academy Awards Database results page and extract its Best Picture nominees.

    Args:
        url (string): address of the results page to request.
        timeout (float): seconds to wait for the server before giving up.

    Returns:
        result: dict, movie name as key, movie information as value. Each value
        holds 'film_title', 'statement', 'is_winner' (always 1 here) and
        'winner' (1 when the entry carries the star glyph, otherwise 0).

    Raises:
        requests.exceptions.RequestException: if the request fails, times out
        or the server answers with an HTTP error status.
    """
    # remember to use browser header here, or cannot retrieve full data from the website
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it into an empty dict.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    result = {}
    for item in soup.find_all('div', {'class':'result-subgroup subgroup-awardcategory-chron'}):
        try:
            award_title = item.find('div',{'class':'result-subgroup-title'}).find('a',{'class':'nominations-link'}).contents[0]
        except (AttributeError, IndexError):
            # Category block without the expected title link: nothing to read.
            continue
        if award_title != 'BEST PICTURE':
            continue

        for sub in item.find_all('div',{'class':'result-details awards-result-actingorsimilar'}):
            try:
                film_title = sub.find('div',{'class':'awards-result-film-title'}).find('a',{'class':'nominations-link'}).contents[0]
                statement = sub.find('div',{'class':'awards-result-nominationstatement'}).find('a',{'class':'nominations-link'}).contents[0]
            except (AttributeError, IndexError):
                # Skip only the malformed entry; the remaining nominees are still collected.
                continue

            has_star = sub.find('span',{'class':'glyphicon glyphicon-star'})
            result[film_title] = {
                'film_title': film_title,
                'statement': statement,
                # Set for every scraped entry, whether or not it carries the star.
                'is_winner': 1,
                'winner': 1 if has_star else 0,
            }

    return result

- The function above scrapes the data from The Official Academy Awards Database so that it can be appended to the current data set.
- If you want to see how the HTML of the Oscar website is laid out, right-click on the page in Chrome and select “Inspect”; this opens the Chrome developer tools, where you can find the element that holds the information.
- The nominee flag (stored under the key `is_winner` in the code above) is 1 for all movies, since the movies retrieved from the Oscar website are all nominees.
- The value in "winner" is 1 for all winners and 0 otherwise.


In [None]:
# The three queries differ only in the ceremony number, so the URL is built
# from one template instead of being pasted three times.
OSCARS_RESULTS_URL = ('http://awardsdatabase.oscars.org/Search/GetResults?query='
                      '%7B%22AwardShowFrom%22:{ceremony},%22Sort%22:%223-Award%20Category-Chron%22,%22Search%22:%22Basic%22%7D')

# data of year 2016 (ceremony 89)
best_2016 = retrieve_html(OSCARS_RESULTS_URL.format(ceremony=89))

# data of year 2017 (ceremony 90)
best_2017 = retrieve_html(OSCARS_RESULTS_URL.format(ceremony=90))

# data of year 2018 (ceremony 91)
best_2018 = retrieve_html(OSCARS_RESULTS_URL.format(ceremony=91))


The returned value is a Python dictionary keyed by film title, for example:
{'Black Panther': 
  {'film_title': 'Black Panther',
  'statement': 'Kevin Feige, Producer',
  'is_winner': 1,
  'winner': 0}}

In [None]:
# Load Data
# - movies.csv: all movies by year (the 2018 nominated movies come from a separate file below).
# - credits.csv: detailed information about the movies such as cast and crew.
# - 2016/2017/2018: there is no data for these years on Kaggle, so the movie
#   info was scraped with the TMDB API and is read from the two *_tmdb.csv files.
# - database.csv: the nominees by award category up to 2015; the 2016, 2017
#   and 2018 nominees are appended in a later cell.
import numpy as np
import pandas as pd
# Read the awards file: the first line of the file is skipped and the column
# names listed here are used instead.
df_all_awards = pd.read_csv('database.csv', skiprows = 1, names=['year','ceremony','award','is_winner','winner','movieName','filmInfo'])
df_all_movies = pd.read_csv('movies.csv')
df_all_credits = pd.read_csv('credits.csv')
df2018_all_movies = pd.read_csv('2018_best_picture_tmdb.csv')
df1617_movies = pd.read_csv('2016_2017_best_picture_tmdb.csv')

# Display the awards table as the cell output.
df_all_awards

In [None]:
# Turn the three scraped dictionaries into plain rows laid out like
# df_all_awards: year, ceremony, award, is_winner, winner, movieName, filmInfo.
scraped_by_ceremony = [
    ('2016', 89, best_2016),
    ('2017', 90, best_2017),
    ('2018', 91, best_2018),
]

data = [
    [year_label, ceremony_no, 'Best Picture',
     nominee['is_winner'], nominee['winner'], nominee['film_title'], nominee['statement']]
    for year_label, ceremony_no, nominees in scraped_by_ceremony
    for nominee in nominees.values()
]


In [None]:
# Append the scraped rows to df_all_awards.
# pass the column name to the columns!!! Or there will be a runtime error
scraped_awards = pd.DataFrame(data, columns=['year','ceremony','award','is_winner','winner','movieName','filmInfo'])
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job and
# also exists in the older releases.
df_all_awards = pd.concat([df_all_awards, scraped_awards], ignore_index=True)

# The scraped rows carry the year as text; store the whole column as integers.
df_all_awards['year'] = df_all_awards['year'].astype(int)
# Re-running this cell appends the scraped rows again; keeping only the last
# copy of each (movieName, year, winner, award) combination undoes that.
df_all_awards = df_all_awards.drop_duplicates(subset=['movieName','year','winner','award'], keep='last')
df_all_awards.head()

In [None]:
# Data Manipulation - Joining the dataframes
# Only movies that have credits info are kept: the credits rows are the left
# side of an index join on the movie id.

# Drop the credits' own 'title' column when present; errors='ignore' makes
# this a no-op when the column is already gone.
df_all_credits = df_all_credits.drop(columns=['title'], errors='ignore')

credits_by_id = df_all_credits.set_index('movie_id')
movies_by_id = df_all_movies.set_index('id')
df_all_credits_movies = credits_by_id.join(movies_by_id)

In [None]:
# Parse the 2018 release dates (YYYY-MM-DD text) into datetimes so that the
# minimum / maximum date can be computed later.
release_dates_2018 = pd.to_datetime(df2018_all_movies['release_date'], format='%Y-%m-%d')
df2018_all_movies = df2018_all_movies.assign(release_date=release_dates_2018)


In [None]:
# Parse the 2016/2017 release dates (YYYY-MM-DD text) into datetimes so that
# the minimum / maximum date can be computed later.
release_dates_1617 = pd.to_datetime(df1617_movies['release_date'], format='%Y-%m-%d')
df1617_movies = df1617_movies.assign(release_date=release_dates_1617)

In [None]:
# Append the scraped TMDB info (crew, cast, ...) for 2016, 2017 and 2018 to
# the joined credits/movies table.
# DataFrame.append was removed in pandas 2.0; one pd.concat call stacks the
# frames in the same order and keeps each frame's own index, as append did.
df_all_credits_movies = pd.concat([df_all_credits_movies, df2018_all_movies, df1617_movies])

In [None]:
# Parse release_date for the combined table as well: only the two TMDB frames
# were converted before they were appended.
df_all_credits_movies['release_date'] = pd.to_datetime(df_all_credits_movies['release_date'], format='%Y-%m-%d')


In [None]:
# Get the earliest and the latest release date in the dataset.
# Series.min()/max() skip missing dates (NaT). The Python built-ins compare
# element by element and every comparison against NaT is False, so a single
# missing date could make them return the wrong value.
# (The dates in the trailing comments were noted by the original author and
# may predate the 2016-2018 rows.)
min_year = df_all_credits_movies['release_date'].min() #1916-09-04
max_year = df_all_credits_movies['release_date'].max() #2017-02-03
print(min_year)
print(max_year)

In [None]:
# Derive the release year; rows without a release date get year 0 so the
# column can be stored as plain integers.
release_years = df_all_credits_movies['release_date'].dt.year
df_all_credits_movies['released_year'] = release_years.fillna(0.0).astype(int)

# Spot check of one awards entry (note the trailing space in the stored name).
df_all_awards[df_all_awards['movieName']=='Tom Jones ']

In [None]:
# Write the combined credits/movies table to disk for inspection.
# index=False leaves the frame's index out of the file.
# NOTE(review): the joined rows appear to be indexed by movie id, so that id
# would not be part of this dump. Confirm this is intended.
df_all_credits_movies.to_csv('df_credits_pictures_test.csv', index=False)

In [None]:
# Select only the rows of the Outstanding Picture / Best Picture award
# categories and store them in df_all_picture_awards.
picture_mask = df_all_awards['award'].isin(['Outstanding Picture','Best Picture'])
# .copy() makes the selection an independent frame, so the columns added to
# it in later cells are not written into a slice of df_all_awards.
df_all_picture_awards = df_all_awards.loc[picture_mask].copy().reset_index(drop=True)
#332 Rows (count noted by the original author)

# Get only movies that have been released
df_all_credits_movies = df_all_credits_movies.loc[df_all_credits_movies['status'].isin(['Released'])]

# Spot check of the 2016 rows. 'year' holds integers after the earlier cast,
# so it has to be compared with the number 2016; the string '2016' never
# matches. It is printed because only a cell's last expression is displayed.
print(df_all_picture_awards[df_all_picture_awards['year'] == 2016])
print("--------------")
print(df_all_picture_awards.columns)
print("--------------")
print(df_all_credits_movies.columns)



In [None]:
# Replace the missing values of the 'is_winner' column with 0.
# NOTE(review): this comment used to speak of filling a 'nominee' column with
# 1, while the code fills 'is_winner' with 0. The code's behaviour is kept
# here; confirm which of the two was intended.
#
# The column is reassigned rather than filled in place: an in-place fillna on
# a column selection may act on a temporary copy and leave the frame unchanged.
df_all_picture_awards = df_all_picture_awards.assign(
    is_winner=df_all_picture_awards['is_winner'].fillna(0)
)

In [None]:
# join df_all_credits_movies and df_all_picture_awards by movie name

# Both tables get a normalised 'movie_title' key: lower case, surrounding
# whitespace removed, punctuation dropped.
import re  # kept so that `re` stays available to the rest of the notebook

PUNCTUATION_PATTERN = r'[^\w\s]'


def _title_key(titles):
    """Return the normalised join key for a Series of titles.

    The vectorised .str methods leave missing titles missing, where a
    re.sub loop over the raw values raises TypeError on them.
    """
    return titles.str.lower().str.strip().str.replace(PUNCTUATION_PATTERN, '', regex=True)


# assign() returns new frames, so nothing is written into the slices these
# two tables were created from.
df_all_picture_awards = df_all_picture_awards.assign(
    movie_title=_title_key(df_all_picture_awards['movieName'])
)
df_all_credits_movies = df_all_credits_movies.assign(
    movie_title=_title_key(df_all_credits_movies['title'])
)

# check the dataframe, we can see the string in movie_title now is valid
#df_all_picture_awards.movie_title

In [None]:
# Merge the award rows onto the movies through the normalised title.
# how='right' keeps every row of df_all_credits_movies; the award columns
# stay empty for movies without a matching picture award.
df = (
    df_all_picture_awards
    .merge(df_all_credits_movies, on='movie_title', how='right')
    .reset_index(drop=True)
)
df

In [None]:
# Drop the award-table columns that are no longer needed (plus 'homepage').
df = df.drop(columns=['homepage', 'year', 'award', 'movieName', 'filmInfo', 'ceremony'])

# Movies without a matching award row came out of the right join with NaN in
# both flag columns. 'is_winner' has to be filled as well as 'winner': the
# following cells select rows with df['is_winner'] == 0, which NaN never
# satisfies, so those movies would otherwise silently drop out.
df['winner'] = df['winner'].fillna(0) #fill NA with 0
df['is_winner'] = df['is_winner'].fillna(0)

# Print master df to csv
# 4795 rows of movies with appended Best Picture data, that we had crew info for
# (row count noted by the original author)
df.to_csv('masterList_final.csv', index=False)


In [None]:
# Plan: under-sample the 0-labelled data. First take the subset of nominated
# movies, then sample from the non-nominated movies, and finally append the
# sample to the nominated subset.
# The bar chart shows how many rows carry each value of the 'is_winner' flag.
sns.countplot(x='is_winner', data = df)


In [None]:
# Number of rows per value of the 'is_winner' flag (trailing numbers are the
# counts noted by the original author).
flag_counts = {flag: len(df.loc[df['is_winner'] == flag]) for flag in (0, 1)}
print(flag_counts[0]) #4586
print(flag_counts[1]) #209

In [None]:

# Split the master table by the 'is_winner' flag and remove the flag from
# both halves; it comes back with the award columns merged in below.
df_subset_0 = df.loc[df['is_winner'] == 0].drop(columns=['is_winner'])
df_subset_1 = df.loc[df['is_winner'] == 1].drop(columns=['is_winner'])

# NOTE: no sampling happens here (the sample() call was commented out), so
# the two halves are simply stacked, 0-flagged rows first.
df_subset = pd.concat([df_subset_0, df_subset_1], ignore_index=True)

# Re-attach the award columns through the normalised title; every value that
# is still missing afterwards becomes 0.
df_new = (
    df_subset
    .merge(df_all_picture_awards, on='movie_title', how='left')
    .fillna(0)
)
df_new

In [None]:
# Number of rows in the merged table.
print(len(df_new))

In [None]:
# Drop 'winner_y', the copy of the 'winner' column that the merge with the
# awards table brought in (the null-value check is done in the next cell).
# Running this cell a second time fails because the column is already gone.
df_new = df_new.drop(['winner_y'], axis=1)

In [None]:
# Give the remaining merge-suffixed column its plain name back.
df_new = df_new.rename(columns={"winner_x": "winner"})
# Check the number of null values per column.
df_new.isnull().sum()

In [None]:
# Correlation analysis of the numeric movie attributes, drawn as an
# annotated heatmap.
numeric_features = ['budget','popularity','revenue','runtime','vote_average','vote_count']
feature_corr = df_new[numeric_features].corr()
g = sns.heatmap(feature_corr, cmap='RdYlGn', annot=True)

In [None]:
# Density curve of the budget, split by the 'is_winner' flag.
budget0 = df_new.loc[df_new['is_winner'] == 0, 'budget']
budget1 = df_new.loc[df_new['is_winner'] == 1, 'budget']

g = sns.kdeplot(budget0, legend = True, shade=True, color='r', label = 'non-nominated')
# Legend label fixed: it used to read 'nonminated'.
g = sns.kdeplot(budget1, legend = True, shade=True, color='b', label = 'nominated')

In [None]:
# Density curve of the revenue, split by the 'is_winner' flag.
revenue0 = df_new.loc[df_new['is_winner'] == 0, 'revenue']
revenue1 = df_new.loc[df_new['is_winner'] == 1, 'revenue']

g = sns.kdeplot(revenue0, legend = True, shade=True, color='r', label = 'non-nominated')
# Legend label fixed: it used to read 'nonminated'.
g = sns.kdeplot(revenue1, legend = True, shade=True, color='b', label = 'nominated')

In [None]:
# Density curve of the vote count, split by the 'is_winner' flag.
vc0 = df_new.loc[df_new['is_winner'] == 0, 'vote_count']
vc1 = df_new.loc[df_new['is_winner'] == 1, 'vote_count']

g = sns.kdeplot(vc0, legend = True, shade=True, color='r', label = 'non-nominated')
# Legend label fixed: it used to read 'nonminated'.
g = sns.kdeplot(vc1, legend = True, shade=True, color='b', label = 'nominated')

In [None]:
# Density curve of the vote average, split by the 'is_winner' flag.
va0 = df_new.loc[df_new['is_winner'] == 0, 'vote_average']
va1 = df_new.loc[df_new['is_winner'] == 1, 'vote_average']

g = sns.kdeplot(va0, legend = True, shade=True, color='r', label = 'non-nominated')
# Legend label fixed: it used to read 'nonminated'.
g = sns.kdeplot(va1, legend = True, shade=True, color='b', label = 'nominated')

In [None]:
# Density curve of the popularity, split by the 'is_winner' flag.
popularity0 = df_new.loc[df_new['is_winner'] == 0, 'popularity']
popularity1 = df_new.loc[df_new['is_winner'] == 1, 'popularity']

g = sns.kdeplot(popularity0, legend = True, shade=True, color='r', label = 'non-nominated')
# Legend label fixed: it used to read 'nonminated'.
g = sns.kdeplot(popularity1, legend = True, shade=True, color='b', label = 'nominated')

In [None]:
# Density curve of the runtime, split by the 'is_winner' flag.
runtime0 = df_new.loc[df_new['is_winner'] == 0, 'runtime']
runtime1 = df_new.loc[df_new['is_winner'] == 1, 'runtime']

g = sns.kdeplot(runtime0, legend = True, shade=True, color='r', label = 'non-nominated')
# Legend label fixed: it used to read 'nonminated'.
g = sns.kdeplot(runtime1, legend = True, shade=True, color='b', label = 'nominated')

In [None]:
# Genres: count how often each genre occurs among the rows flagged 0 and
# among the rows flagged 1 in 'is_winner'.
# NOTE(security): eval() executes whatever text is stored in the 'genres'
# cells; only run this on files you trust.
genres_name0 = {}
genres_name1 = {}
genres_set = set()
for genres_text, flag in zip(df_new['genres'], df_new['is_winner']):
    # Pick the tally that belongs to this row's flag (none for other values).
    if flag == 0:
        tally = genres_name0
    elif flag == 1:
        tally = genres_name1
    else:
        tally = None
    for entry in eval(genres_text):
        genre = entry['name']
        genres_set.add(genre)
        if tally is not None:
            tally[genre] = tally.get(genre, 0) + 1

# One count per genre, in the iteration order of genres_set (the next cell
# uses the same order for the column names).
genres_array0 = [genres_name0.get(genre, 0) for genre in genres_set]
genres_array1 = [genres_name1.get(genre, 0) for genre in genres_set]

# Turn the counts into shares of all genre tags within each group.
genres_all = [
    np.array(genres_array0)/sum(genres_array0),
    np.array(genres_array1)/sum(genres_array1),
]

In [None]:
# Two-row table of genre shares: row 0 holds the rows flagged 0 in
# 'is_winner', row 1 the rows flagged 1; one column per genre.
df_genres = pd.DataFrame(genres_all, columns=list(genres_set))
# Display the shares in a fixed column order. Selecting by name raises
# KeyError if one of these genres does not occur in the data.
df_genres[["Adventure",
"Fantasy",
"Animation",
"War",
"Music",
"Documentary",
"Foreign",
"Drama",
"Horror",
"Family",
"Crime",
"Mystery",
"Science Fiction",
"History",
"Action",
"Comedy",
"Romance",
"Western",
"Thriller"]]

In [None]:
# Stacked bar chart of the genre shares: row 0 of df_genres at the bottom,
# row 1 stacked on top of it.
genre_labels = df_genres.columns
N = len(genre_labels)
ind = np.arange(N)
width = 0.5

bottom_share = df_genres.loc[0].values
top_share = df_genres.loc[1].values

plt.figure(figsize=(20,10))
p1 = plt.bar(ind, bottom_share, width, color='#d62728')
p2 = plt.bar(ind, top_share, width, bottom=bottom_share)

plt.ylabel('percentage (#genres/#movies)')
plt.title('Percentage by genres and nominations')
plt.xticks(ind, genre_labels)
plt.legend((p1[0],p2[0]),('Non-nominees','nominees'))

In [None]:
# Feature Engineering
# Since many columns such as “crew”, “cast” contain information in json format, we need to extract useful information from the columns and then perform one hot encoding.
# We will transform our dataset into a all numeric matrix so that we can feed the data into our machine learning model.
# To look at the structure of column (eg.”cast”), we can use: df.loc[0,’cast’]
import json
def feature_engineering(column_name, df, json_name):
    """
    One-hot encode the values stored in a column of list-of-dict strings.

    Every cell of ``df[column_name]`` is a string holding a list of dicts.
    The value under ``json_name`` is collected from each dict; values that
    occur more than once in the whole column become indicator columns
    (1.0 when the row contains the value, 0.0 otherwise).

    Args:
        column_name: the column name in the dataframe that contains the
            list-of-dict strings to encode
        df: dataframe that perform feature engineering on
        json_name: key in each dict whose value is extracted

    Returns: new dataframe made of ``df`` followed by the indicator columns
    """
    # NOTE(security): eval() executes the cell text as Python code; only use
    # this on files you trust. Each cell is evaluated once and reused below.
    parsed_rows = [eval(item) for item in df[column_name]]

    # How often does every value occur across the whole column?
    name = {}
    for group in parsed_rows:
        for it in group:
            key = it[json_name]
            name[key] = name.get(key, 0) + 1

    # Values seen more than once get a column position, in first-seen order.
    final = {}
    for k, v in name.items():
        if v > 1:
            final[k] = len(final)

    np_item = np.zeros((len(df), len(final)))
    for row, group in enumerate(parsed_rows):
        for it in group:
            index = final.get(it[json_name])
            if index is not None:
                np_item[row][index] = 1

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign the indicators with any frame that is not
    # indexed 0..n-1 (extra rows filled with NaN).
    df_item = pd.DataFrame(np_item, columns=list(final.keys()), index=df.index)
    df_output = pd.concat([df, df_item], axis=1)

    return df_output

In [None]:
# One-hot encode the cast names, then drop the raw 'cast' column.
df2 = feature_engineering("cast", df_new, "name").drop(columns=['cast'])

In [None]:
# Director: one indicator column per director found in the 'crew' column.
# NOTE(security): eval() executes the text stored in the 'crew' cells; only
# run this on files you trust.
crew_name = {}
directors_per_movie = []
for crew_text in df2['crew']:
    directors = [member['name'] for member in eval(crew_text) if member['job'] == 'Director']
    directors_per_movie.append(directors)
    for director in directors:
        crew_name[director] = crew_name.get(director, 0) + 1

# Unlike the other encoded columns there is no minimum-occurrence filter:
# every director gets a column position, in first-seen order.
final_crew = {director: position for position, director in enumerate(crew_name)}

np_crew = np.zeros((len(df2), len(final_crew)))
for row, directors in enumerate(directors_per_movie):
    for director in directors:
        np_crew[row][final_crew[director]] = 1

df_crew = pd.DataFrame(np_crew, columns = list(final_crew.keys()))

# Side-by-side concat; the rows are matched through the index labels.
df3 = pd.concat([df2, df_crew], axis = 1)

In [None]:
# The raw 'crew' column has been encoded, so it can go.
df3 = df3.drop(columns=['crew'])

# Encode each remaining list-of-dict column and drop its raw text column.
# genres
df4 = feature_engineering("genres", df3, "name").drop(columns=['genres'])
# keywords
df5 = feature_engineering('keywords', df4, 'name').drop(columns=['keywords'])
# production_companies
df6 = feature_engineering('production_companies', df5, 'name').drop(columns=['production_companies'])
# production_countries
df7 = feature_engineering('production_countries', df6, 'name').drop(columns=['production_countries'])
# spoken_languages (encoded by language code instead of name)
df8 = feature_engineering('spoken_languages', df7, 'iso_639_1').drop(columns=['spoken_languages'])

# drop the columns not used
unused_columns = [
    "movie_title", "original_title", "overview", "tagline", 'title',
    'original_language', 'status', 'release_date', 'movieName', 'filmInfo', 'award',
]
df_clean = df8.drop(columns=unused_columns)


In [None]:
# drop_duplicates() returns a new frame; without the assignment the result
# was thrown away and df_clean kept its duplicate rows.
df_clean = df_clean.drop_duplicates()
df_clean

In [None]:
# Hold out the movies released in 2018; despite its name, df_2017 contains
# every other release year.
released_in_2018 = df_clean['released_year'] == 2018
df_2017 = df_clean.loc[~released_in_2018]
df_2018 = df_clean.loc[released_in_2018]
df_2018.head()

In [None]:
# Model Training
# Split into features and target: the target is 'winner', the features are
# all remaining columns (Index.difference returns them sorted by name).
train_feature_names = df_2017.columns.difference(['winner'])
test_feature_names = df_2018.columns.difference(['winner'])

X_train = df_2017[train_feature_names]
y_train = df_2017['winner']
X_test_2018 = df_2018[test_feature_names]
y_test_2018 = df_2018['winner']
print(X_train)


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit on the training set only, so the 2018 rows do not influence the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same transform to the training set and the test set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test_2018)
X_test_scaled

In the following code we pass 0.81 to PCA(), which keeps the smallest number of principal components that together explain at least 81% of the variance in the features.

In [None]:
from sklearn.decomposition import PCA

# Learn the projection from the scaled training data only, then apply it to
# both sets.
pca = PCA(0.81).fit(X_train_scaled)

X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
X_test_pca

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the PCA-projected training data.
# All parameters not specified are left at their defaults.
# For small datasets, 'liblinear' is a good choice of solver.
logisticRegr = LogisticRegression(solver = 'liblinear')
logisticRegr.fit(X_train_pca, y_train)

In [None]:
# Score of the fitted model on the training data (for a classifier, score()
# reports the mean accuracy).
logisticRegr.score(X_train_pca, y_train)

In [None]:
# Predict the 'winner' label for every 2018 row and keep the predictions
# next to the rows they belong to.
predicted = logisticRegr.predict(X_test_pca)
df_2018 = df_2018.assign(prediction=predicted)
print(predicted)


In [None]:
# Score of the fitted model on the held-out 2018 rows (mean accuracy).
logisticRegr.score(X_test_pca, y_test_2018)
# print(predicted)

In [None]:
from sklearn.metrics import confusion_matrix

# labels=[0, 1] pins the matrix to 2x2. Without it the matrix shrinks to 1x1
# when the 2018 rows and the predictions contain a single class, and the
# four-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(y_test_2018, predicted, labels=[0, 1]).ravel()

# Sensitivity (recall of the positive class); undefined when the test set
# holds no positive row.
sensitivity = tp/(tp+fn) if (tp+fn) else float('nan')
print("True Positives:",tp)
print("False Positives:",fp)
print("True Negatives:",tn)
print("False Negatives:",fn)
print(sensitivity) # 0.84375 was the value noted by the original author

In [None]:
# Number of predictions made, one per 2018 row.
print(len(predicted))

In [None]:
# Keep only the movie id, the true label and the predicted label of the 2018 rows.
df_p2018 = df_2018[['movie_id','winner','prediction']]
#df_p2018

In [None]:
# Look up the title of every predicted movie in df3 through the movie id.
# Both frames carry a 'winner' column, so the merge suffixes them; the copy
# from df_p2018 ('winner_x') gets its plain name back.
df_prediction = (
    df_p2018
    .merge(df3, on='movie_id', how='left')
    .rename(columns={"winner_x": "winner"})
)
df_prediction[['movie_title','winner','prediction']]

In [None]:
from sklearn.metrics import classification_report
# Per-class summary of the predictions for the 2018 rows.
print(classification_report(y_test_2018, predicted))