Importing the data


In [5]:
import pandas

def parse_file(fileName):
    movies_df = pandas.read_csv(fileName)
    return movies_df.to_dict('records')

movies = parse_file('https://raw.githubusercontent.com/fivethirtyeight/data/master/bechdel/movies.csv')

Exploring the data

In [6]:
type(movies)

list

In [7]:
len(movies)

1794

In [8]:
movies[0]

{'year': 2013,
 'imdb': 'tt1711425',
 'title': '21 &amp; Over',
 'test': 'notalk',
 'clean_test': 'notalk',
 'binary': 'FAIL',
 'budget': 13000000,
 'domgross': 25682380.0,
 'intgross': 42195766.0,
 'code': '2013FAIL',
 'budget_2013$': 13000000,
 'domgross_2013$': 25682380.0,
 'intgross_2013$': 42195766.0,
 'period code': 1.0,
 'decade code': 1.0}

In [9]:
# removing nans
import math
list(filter(lambda movie: math.isnan(movie['domgross_2013$']), movies))

[{'year': 2013,
  'imdb': 'tt2005374',
  'title': 'The Frozen Ground',
  'test': 'nowomen-disagree',
  'clean_test': 'nowomen',
  'binary': 'FAIL',
  'budget': 19200000,
  'domgross': nan,
  'intgross': nan,
  'code': '2013FAIL',
  'budget_2013$': 19200000,
  'domgross_2013$': nan,
  'intgross_2013$': nan,
  'period code': 1.0,
  'decade code': 1.0},
 {'year': 2011,
  'imdb': 'tt1422136',
  'title': 'A Lonely Place to Die',
  'test': 'ok',
  'clean_test': 'ok',
  'binary': 'PASS',
  'budget': 4000000,
  'domgross': nan,
  'intgross': 442550.0,
  'code': '2011PASS',
  'budget_2013$': 4142763,
  'domgross_2013$': nan,
  'intgross_2013$': 458345.0,
  'period code': 1.0,
  'decade code': 1.0},
 {'year': 2011,
  'imdb': 'tt1701990',
  'title': 'Detention',
  'test': 'ok',
  'clean_test': 'ok',
  'binary': 'PASS',
  'budget': 10000000,
  'domgross': nan,
  'intgross': nan,
  'code': '2011PASS',
  'budget_2013$': 10356908,
  'domgross_2013$': nan,
  'intgross_2013$': nan,
  'period code': 1.0

In [10]:
# function to remove all nans
def remove_movies_missing_data(movies):
    return [movie for movie in movies if not math.isnan(movie['domgross_2013$'])]

In [12]:
# check function is working as expected
parsed_movies = remove_movies_missing_data(movies) or []
len(parsed_movies)

1776

In [13]:
# another check if the function is working as expected
list(filter(lambda movie: math.isnan(movie['domgross_2013$']),parsed_movies))

[]

Changing the scales of the values

In [14]:
# too many digits
movies[0]['budget']

13000000

In [15]:
def scale_down_movie(movie):
    movie_copy = dict(movie)
    movie_copy.update({'budget':round(movie['budget']/1000000,2),'budget_2013$':round(movie['budget_2013$']/1000000,2),'domgross':round(movie['domgross']/1000000,2),'domgross_2013$':round(movie['domgross_2013$']/1000000,2),'intgross':round(movie['intgross']/1000000,2),'intgross_2013$':round(movie['intgross_2013$']/1000000,2)})
    return movie_copy  

In [16]:
# test function
scale_down_movie(movies[0])

{'year': 2013,
 'imdb': 'tt1711425',
 'title': '21 &amp; Over',
 'test': 'notalk',
 'clean_test': 'notalk',
 'binary': 'FAIL',
 'budget': 13.0,
 'domgross': 25.68,
 'intgross': 42.2,
 'code': '2013FAIL',
 'budget_2013$': 13.0,
 'domgross_2013$': 25.68,
 'intgross_2013$': 42.2,
 'period code': 1.0,
 'decade code': 1.0}

In [18]:
# apply function to all movies in dict
def scale_down_movies(movies):
    return list(map(lambda movie: scale_down_movie(movie),movies))

first_ten_movies = parsed_movies[0:10]
first_ten_scaled = scale_down_movies(first_ten_movies) or []
first_ten_scaled[-2:]

[{'year': 2013,
  'imdb': 'tt1814621',
  'title': 'Admission',
  'test': 'ok',
  'clean_test': 'ok',
  'binary': 'PASS',
  'budget': 13.0,
  'domgross': 18.01,
  'intgross': 18.01,
  'code': '2013PASS',
  'budget_2013$': 13.0,
  'domgross_2013$': 18.01,
  'intgross_2013$': 18.01,
  'period code': 1.0,
  'decade code': 1.0},
 {'year': 2013,
  'imdb': 'tt1815862',
  'title': 'After Earth',
  'test': 'notalk',
  'clean_test': 'notalk',
  'binary': 'FAIL',
  'budget': 130.0,
  'domgross': 60.52,
  'intgross': 244.37,
  'code': '2013FAIL',
  'budget_2013$': 130.0,
  'domgross_2013$': 60.52,
  'intgross_2013$': 244.37,
  'period code': 1.0,
  'decade code': 1.0}]

In [19]:
# sca;e all
scaled_movies = scale_down_movies(parsed_movies) or []

More exploring of the data

In [38]:
import plotly
from plotly.offline import iplot, init_notebook_mode
plotly.offline.init_notebook_mode(connected=True)

In [36]:
def trace_values(x_values, y_values, mode = 'markers', name="data", text = []):
    return {'x': x_values, 'y': y_values, 'mode': mode, 'name': name, 'text': text}

In [39]:
def plot(traces, layout = {}):
    if not isinstance(traces, list): raise TypeError('first argument must be a list.  Instead is', traces)
    plotly.offline.iplot({'data': traces, 'layout': layout})

In [40]:
# plotting the revenues data agaisnt the movie budget
budgets = list(map(lambda movie: movie['budget_2013$'], scaled_movies))
domestic_revenues = list(map(lambda movie: movie['domgross_2013$'], scaled_movies))
titles = list(map(lambda movie: movie['title'], scaled_movies))

revenues_per_budgets_trace = trace_values(budgets, domestic_revenues, text = titles)

In [41]:
plot([revenues_per_budgets_trace])

In [42]:
# find highest grossing movie
def highest_domestic_gross(movies):
    return max(movies, key=lambda movie: movie['domgross_2013$'])

max_movie = highest_domestic_gross(scaled_movies) or {'title': 'some non movie'}
max_movie['title']

'Star Wars'

In [45]:
# function to build layout and then plot
def build_layout(x_range = None, y_range = None, options = {}):
    layout = {}
    if isinstance(x_range, list): layout.update({'xaxis': {'range': x_range}})
    if isinstance(y_range, list): layout.update({'yaxis': {'range': y_range}})
    layout.update(options)
    return layout

revenues_per_budgets_trace = trace_values(budgets, domestic_revenues, text = titles)
revenues_layout = build_layout(x_range = [0, 300], y_range = [0, 1000])
plot([revenues_per_budgets_trace], revenues_layout)

Building models - model one

In [47]:
# model one will be r(x) = 1.5 * budget + 10
def model_one(budget):
    return 1.5*budget + 10

In [54]:
model_one_revenues = list(map(lambda budget: model_one(budget),budgets))
model_one_revenues_trace = trace_values(budgets, model_one_revenues, mode='lines', name = 'model one')

In [55]:
# set up model plot
def m_b_trace(m, b, x_values, mode = 'line', name = 'line function'):
    values = m_b_data(m, b, x_values)
    values.update({'mode': mode, 'name': name})
    return values

In [56]:
# plot model one 
plot([revenues_per_budgets_trace, model_one_revenues_trace], revenues_layout)

In [59]:
# find error of model one
def error_for_model_one(movie):
    expected = model_one(movie['budget_2013$'])
    return movie['domgross_2013$'] - expected

In [60]:
american_hustle = {'binary': 'PASS', 'budget': 40.0, 'budget_2013$': 40.0, 'clean_test': 'ok',
         'code': '2013PASS', 'decade code': 1.0, 'domgross': 148.43, 'domgross_2013$': 148.43, 'imdb': 'tt1800241',
         'intgross': 249.48, 'intgross_2013$': 249.48, 'period code': 1.0, 'test': 'ok-disagree',
         'title': 'American Hustle', 'year': 2013}
error_for_model_one(american_hustle)

78.43

In [62]:
# calculate RSS
def rss_model_one(movies):
    return round(sum(list(map(lambda movie: error_for_model_one(movie)**2, movies))), 2)

rss_model_one(scaled_movies)

23234357.68

Building models - model two

In [64]:
# model two looks at the effect of years of movie revenue
years = list(map(lambda movie: movie['year'],movies))
years_and_revenues = trace_values(years, domestic_revenues, text = titles)
years_layout = build_layout(y_range = [0, 550])
plot([years_and_revenues], years_layout)

In [67]:
# prediction is that a revenue is 1.5m for every year after 1965 plus 1.1 times the movie budget
def model_two(budget, year):
    return 1.1*budget + 1.5*(year - 1965)

model_two(25, 1997)

75.5

In [72]:
budgets = list(map(lambda movie: movie['budget_2013$'], scaled_movies))
domestic_revenues = list(map(lambda movie: movie['domgross_2013$'], scaled_movies))
titles = list(map(lambda movie: movie['title'], scaled_movies))

model_two_estimated_revenues = list(map(lambda movie: model_two(movie['budget_2013$'], movie['year']),scaled_movies))
model_two_estimated_trace = trace_values(budgets, model_two_estimated_revenues, mode='markers', name = 'model_two')

plot([revenues_per_budgets_trace, model_one_revenues_trace, model_two_estimated_trace], revenues_layout)

In [74]:
# find squared errors and RSS
def squared_error_model_two(movie):
    actual = movie['domgross_2013$']
    expected = model_two(movie['budget_2013$'],movie['year'])
    return (actual - expected)**2

def rss_model_two(movies):
    squared_errors = list(map(lambda movie: squared_error_model_two(movie),movies))
    return round(sum(squared_errors),2)

In [77]:
# comparing the RSS of both models, model one was more accurate
rss_model_two(scaled_movies)

25364329.23

In [78]:
rss_model_one(scaled_movies)

23234357.68

Creating and improving a regression line

In [80]:
# function to build a regression line
def build_regression_line(x_values, y_values):
    sorted_values = sorted_points(x_values, y_values)
    highest = sorted_values[-1]
    lowest = sorted_values[0]
    m = slope(x_values, y_values)
    b = y_intercept(x_values, y_values, m)
    return {'m': m, 'b': b}

budgets = list(map(lambda movie: movie['budget_2013$'], scaled_movies)) or [1, 2]
domestic_revenues = list(map(lambda movie: movie['domgross_2013$'], scaled_movies)) or [1, 2]

In [83]:
# initial line to test
initial_regression_line = {'b': 0.5, 'm': 1.79}

def expected_revenue_per_budget(budget):
    return round(budget*initial_regression_line['m'] + initial_regression_line['b'],1)

budget = american_hustle['budget_2013$']
expected_revenue_per_budget(budget)

72.1

In [85]:
# plot initial line to see accuracy
budgets = list(map(lambda movie: movie['budget_2013$'], scaled_movies))
estimated_revenues = list(map(lambda budget: expected_revenue_per_budget(budget), budgets))

initial_regression_trace = trace_values(budgets, estimated_revenues, mode = 'lines', name = 'initial regression trace')
plot([revenues_per_budgets_trace, initial_regression_trace], revenues_layout)

In [86]:
# find error
def regression_revenue_error(m, b, movie):
    expected = (m*movie['budget_2013$'] + b)
    actual = movie['domgross_2013$']
    return round(actual - expected,2)

regression_revenue_error(initial_regression_line['m'], initial_regression_line['b'], american_hustle)

76.33

In [97]:
# functions
def y_actual(x, x_values, y_values):
    combined_values = list(zip(x_values, y_values))
    point_at_x = list(filter(lambda point: point[0] == x,combined_values))[0]
    return point_at_x[1]

def error(x_values, y_values, m, b, x):
    expected = (m*x + b)
    return (y_actual(x, x_values, y_values) - expected)

def squared_error(x_values, y_values, m, b, x):
    return error(x_values, y_values, m, b, x)**2

def squared_errors(x_values, y_values, m, b):
    return list(map(lambda x: squared_error(x_values, y_values, m, b, x), x_values))

def residual_sum_squares(x_values, y_values, m, b):
    return round(sum(squared_errors(x_values, y_values, m, b)), 2)

In [98]:
residual_sum_squares(budgets, domestic_revenues, initial_regression_line['m'], initial_regression_line['b'])

24179823.79

In [102]:
# plot cost curve from 1.0 to 1.9, showing that lowest error is between 1.3 and 1.4
large_m_range = list(range(10, 20))
m_range = list(map(lambda m_value: m_value/10,large_m_range))

cost_values = list(map(lambda m_value: round(residual_sum_squares(budgets, domestic_revenues, m_value, initial_regression_line['b']), 2),m_range))

rss_trace = trace_values(x_values=m_range, y_values=cost_values, mode = 'lines')
plot([rss_trace])

In [103]:
# since error is minimised at ~1.3, test RSS with this value and it does decline
residual_sum_squares(budgets, domestic_revenues, 1.3, initial_regression_line['b'])

22066076.55

Changing multiple variables

In [105]:
# function to return gradient of b
def b_gradient(m, b, movies):
    n = len(movies)
    errors = list(map(lambda movie: regression_revenue_error(m, b, movie), movies))
    return round(-1 * sum(errors)/n, 2)

b_gradient(1.79, 0.50, scaled_movies)

5.37

In [107]:
# function to return gradient of m
def m_gradient(m, b, movies):
    n = len(movies)
    errors_times_x = list(map(lambda movie: regression_revenue_error(m, b, movie)*movie['budget_2013$'], movies))
    return round(-1 * sum(errors_times_x)/n, 2)

m_gradient(1.79, 0.50, scaled_movies)

2520.59

In [109]:
# step gradient along the cost curve
def step_gradient(b_current, m_current, movies, learning_rate):
    b_change = b_gradient(m_current, b_current, movies)
    m_change = m_gradient(m_current, b_current, movies) 
    new_b = round(b_current - (learning_rate * b_change), 2)
    new_m = round(m_current - (learning_rate * m_change), 2)
    return {'b': new_b, 'm': new_m}

step_gradient(initial_regression_line['b'], initial_regression_line['m'], scaled_movies, .0001)

{'b': 0.5, 'm': 1.54}

In [112]:
# function given a set of iterations
def generate_steps(m, b, number_of_steps, movies, learning_rate):
    iterations = []
    for i in range(number_of_steps):
        iteration = step_gradient(b, m, movies, learning_rate)
        b = iteration['b']
        m = iteration['m']
        iterations.append(iteration)
    return iterations

iterations = generate_steps(initial_regression_line['m'], initial_regression_line['b'], 100, scaled_movies, .0001) or [{'m': 'uncomment generate_steps method', 'b': 'uncomment generate_steps method '}]

In [115]:
# how this changes over time and plot 
def to_line(m, b):
    initial_x = 0
    ending_x = 500
    initial_y = m*initial_x + b
    ending_y = m*ending_x + b
    return {'data': [{'x': [initial_x, ending_x], 'y': [initial_y, ending_y]}]}

frames = list(map(lambda iteration: to_line(iteration['m'], iteration['b']), iterations))

budgets = list(map(lambda movie: movie['budget_2013$'], scaled_movies))
domestic_revenues = list(map(lambda movie: movie['domgross_2013$'], scaled_movies))

figure = {'data': [{'x': [0], 'y': [0]}, {'x': budgets, 'y': domestic_revenues, 'mode': 'markers'}],
          'layout': {'title': 'Regression Line',
                     'updatemenus': [{'type': 'buttons',
                                      'buttons': [{'label': 'Play',
                                                   'method': 'animate',
                                                   'args': [None]}]}]
                    },
          'frames': frames}
iplot(figure)