In [33]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import datetime
import statsmodels.api as sm

  from pandas.core import datetools


In [18]:
def load_sentiment_scores_from_file(file):
    """Read per-movie sentiment scores from a whitespace-delimited text file.

    Each non-blank line is split on whitespace: the last token is parsed as
    the float score, and the remaining tokens, re-joined with single spaces,
    form the movie title used as the dictionary key.

    Args:
        file: Path of the score file to read.

    Returns:
        dict mapping movie title (str) to sentiment score (float). When a
        title appears on several lines, the last occurrence wins.

    Raises:
        ValueError: If the last token of a non-blank line is not a valid float.
    """
    movie_scores_dict = {}

    with open(file, 'r') as input_file:
        for line in input_file:
            line_list = line.split()
            # Blank or whitespace-only lines (e.g. a stray empty line at the
            # end of the file) carry no score; skip them rather than failing
            # with IndexError on line_list[-1].
            if not line_list:
                continue
            movie_scores_dict[' '.join(line_list[:-1])] = float(line_list[-1])

    return movie_scores_dict

In [45]:
def make_df(movie_file, scores_file='intermediates/movie_log'):
    """Join per-movie sentiment scores with movie metadata from a CSV.

    Args:
        movie_file: Path of the movie metadata CSV. It must contain the
            columns 'title', 'revenue', 'budget', 'vote_average' and
            'vote_count'.
        scores_file: Path of the sentiment score file read by
            load_sentiment_scores_from_file. Defaults to the location the
            notebook has always used.

    Returns:
        DataFrame with the columns 'movie', 'sentiment score', 'budget',
        'vote_average', 'vote_count' and 'revenue', containing only movies
        whose title appears in both inputs (inner join on exact title match).
    """
    scores_dict = load_sentiment_scores_from_file(scores_file)
    df = pd.DataFrame(list(scores_dict.items()), columns=['movie', 'sentiment score'])

    # No date parsing here: 'release_date' is not among the columns kept
    # below, so converting it would be wasted work.
    df_movies = pd.read_csv(movie_file)
    df_movies = df_movies[['title', 'revenue', 'budget', 'vote_average', 'vote_count']]

    merged = df.merge(df_movies, left_on='movie', right_on='title', how='inner')
    # Drop the duplicate 'title' key and put the regression target last.
    merged = merged[['movie', 'sentiment score', 'budget', 'vote_average', 'vote_count', 'revenue']]
    return merged

In [46]:
def run_regression(x_list, y_str, df):
    """Fit an ordinary least squares regression and return its summary.

    Args:
        x_list: List of column names in ``df`` to use as explanatory variables.
        y_str: Name of the column in ``df`` to use as the response variable.
        df: DataFrame holding both the explanatory and response columns.

    Returns:
        The summary object produced by the fitted statsmodels OLS results
        (``model.summary()``).
    """
    X = df[x_list]
    y = df[y_str]
    # add_constant appends the constant column so the model has an intercept.
    X = sm.add_constant(X)
    # Argument order for OLS is (response, design matrix).
    model = sm.OLS(y, X).fit()
    return model.summary()

In [47]:
# Regress revenue on sentiment score, budget and vote_average for the movies
# built by make_df from the TMDB CSV; the returned summary is this cell's output.
run_regression(['sentiment score', 'budget', 'vote_average'], 'revenue', make_df('tmdb/tmdb_5000_movies.csv'))

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.26
Model:,OLS,Adj. R-squared:,0.213
Method:,Least Squares,F-statistic:,5.518
Date:,"Wed, 01 May 2019",Prob (F-statistic):,0.00249
Time:,22:23:06,Log-Likelihood:,-1082.3
No. Observations:,51,AIC:,2173.0
Df Residuals:,47,BIC:,2180.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.812e+09,6.39e+08,-2.835,0.007,-3.1e+09,-5.27e+08
sentiment score,-1.514e+09,1.42e+09,-1.066,0.292,-4.37e+09,1.34e+09
budget,1.8908,1.366,1.384,0.173,-0.857,4.639
vote_average,3.502e+08,9.93e+07,3.528,0.001,1.5e+08,5.5e+08

0,1,2,3
Omnibus:,32.212,Durbin-Watson:,1.597
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.806
Skew:,1.762,Prob(JB):,1.72e-18
Kurtosis:,8.107,Cond. No.,5380000000.0
