In [53]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import datetime
import statsmodels.api as sm

In [54]:
def load_sentiment_scores_from_file(file):
    """Read a movie sentiment log into a ``{title: score}`` dictionary.

    Each non-blank line is split on whitespace. The last token is parsed as
    the float sentiment score; all preceding tokens, re-joined with single
    spaces, form the movie title.

    Parameters
    ----------
    file : path-like
        Path of the text file to read.

    Returns
    -------
    dict
        Maps movie title (str) to sentiment score (float). If a title occurs
        on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If the last token of a non-blank line cannot be parsed as a float.
    """
    movie_scores_dict = {}

    with open(file, 'r') as input_file:
        for line in input_file:
            # split() with no arguments already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            line_list = line.split()
            if not line_list:
                # Blank / whitespace-only line (e.g. a trailing newline at
                # end of file): nothing to parse, and indexing [-1] on the
                # empty token list would raise IndexError.
                continue
            movie_scores_dict[' '.join(line_list[:-1])] = float(line_list[-1])

    return movie_scores_dict

In [55]:
def make_df(movie_file, scores_file='intermediates/movie_log'):
    """Build the regression table by joining sentiment scores to movie metadata.

    Parameters
    ----------
    movie_file : path-like
        CSV file containing at least the columns ``title``, ``revenue``,
        ``budget``, ``vote_average`` and ``vote_count``.
    scores_file : path-like, optional
        Sentiment log read by ``load_sentiment_scores_from_file``. Defaults to
        ``'intermediates/movie_log'``, the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per title match, with columns ``movie``, ``sentiment score``,
        ``budget``, ``vote_average``, ``vote_count`` and ``revenue``. The join
        is an inner join, so only movies present in both inputs are kept.
    """
    scores_dict = load_sentiment_scores_from_file(scores_file)
    df = pd.DataFrame(list(scores_dict.items()), columns=['movie', 'sentiment score'])

    # Read only the columns that are actually used downstream; in particular
    # there is no point parsing a date column that is discarded immediately.
    df_movies = pd.read_csv(
        movie_file,
        usecols=['title', 'revenue', 'budget', 'vote_average', 'vote_count'],
    )

    # NOTE(review): the join key is the raw title string, so a title that
    # appears more than once in the CSV produces more than one output row
    # for that movie -- confirm this is intended.
    merged = df.merge(df_movies, left_on='movie', right_on='title', how='inner')
    return merged[['movie', 'sentiment score', 'budget', 'vote_average', 'vote_count', 'revenue']]

In [56]:
def run_regression(x_list, y_str, df):
    """Fit an ordinary-least-squares model and return its summary.

    Parameters
    ----------
    x_list : list of str
        Names of the columns of ``df`` used as explanatory variables.
    y_str : str
        Name of the column of ``df`` used as the dependent variable.
    df : pandas.DataFrame
        Table holding both the explanatory and the dependent columns.

    Returns
    -------
    The object returned by ``model.summary()`` for the fitted model.
    """
    X = df[x_list]
    y = df[y_str]
    # Add an intercept column so the fit includes a constant term.
    X = sm.add_constant(X)
    # Argument order is sm.OLS(output, input).
    model = sm.OLS(y, X).fit()
    return model.summary()

In [57]:
# Regress revenue on the sentiment score plus the budget and vote columns;
# the returned summary is the cell's displayed output.
run_regression(
    x_list=['sentiment score', 'budget', 'vote_average', 'vote_count'],
    y_str='revenue',
    df=make_df('tmdb/tmdb_5000_movies.csv'),
)

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.503
Model:,OLS,Adj. R-squared:,0.482
Method:,Least Squares,F-statistic:,24.27
Date:,"Wed, 01 May 2019",Prob (F-statistic):,6.83e-14
Time:,22:41:30,Log-Likelihood:,-2107.3
No. Observations:,101,AIC:,4225.0
Df Residuals:,96,BIC:,4238.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.519e+07,3.08e+08,-0.244,0.808,-6.86e+08,5.36e+08
sentiment score,7.372e+07,3.92e+08,0.188,0.851,-7.05e+08,8.52e+08
budget,2.2304,0.634,3.519,0.001,0.972,3.489
vote_average,-1.47e+07,4.93e+07,-0.298,0.766,-1.13e+08,8.32e+07
vote_count,8.855e+04,1.42e+04,6.215,0.000,6.03e+04,1.17e+05

0,1,2,3
Omnibus:,47.535,Durbin-Watson:,1.61
Prob(Omnibus):,0.0,Jarque-Bera (JB):,164.685
Skew:,1.587,Prob(JB):,1.73e-36
Kurtosis:,8.39,Cond. No.,2710000000.0
