In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import datetime
import statsmodels.api as sm

In [2]:
def load_sentiment_scores_from_file(file):
    """Load per-movie sentiment scores from a whitespace-delimited text file.

    Each non-blank line is read as ``<movie title> <score>``: the last
    whitespace-separated token is converted to ``float`` and all preceding
    tokens are joined with single spaces to form the title.

    Parameters
    ----------
    file : str or path-like
        Path of the score file to read.

    Returns
    -------
    dict
        Mapping of movie title (str) to sentiment score (float). When a title
        appears on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If the last token of a non-blank line cannot be parsed as a float.
    """
    movie_scores_dict = {}

    with open(file, 'r') as input_file:
        for line in input_file:
            line_list = line.split()
            # Blank or whitespace-only lines (e.g. a trailing newline at EOF)
            # carry no data; indexing [-1] on them would raise IndexError.
            if not line_list:
                continue
            movie_scores_dict[' '.join(line_list[:-1])] = float(line_list[-1])

    return movie_scores_dict

In [3]:
def make_df(movie_file, scores_file='intermediates/movie_log'):
    """Build the regression table by joining sentiment scores to movie metadata.

    Parameters
    ----------
    movie_file : str or path-like
        CSV file containing at least the columns ``title``, ``revenue``,
        ``budget``, ``vote_average`` and ``vote_count``.
    scores_file : str or path-like, optional
        Sentiment score file passed to ``load_sentiment_scores_from_file``.
        Defaults to ``'intermediates/movie_log'``, the path this function
        previously had hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per (score, movie) pair whose ``movie`` name equals a CSV
        ``title`` exactly (inner join), with columns ``movie``,
        ``sentiment score``, ``budget``, ``vote_average``, ``vote_count``
        and ``revenue``.
    """
    scores_dict = load_sentiment_scores_from_file(scores_file)
    df = pd.DataFrame(list(scores_dict.items()), columns=['movie', 'sentiment score'])

    # No date parsing here: release_date is discarded by the column selection
    # just below, so parsing it only cost time and made the read fail when the
    # column was absent.
    df_movies = pd.read_csv(movie_file)
    df_movies = df_movies[['title', 'revenue', 'budget', 'vote_average', 'vote_count']]

    merged = df.merge(df_movies, left_on='movie', right_on='title', how='inner')
    # Drop the redundant 'title' key and put the response variable last.
    return merged[['movie', 'sentiment score', 'budget', 'vote_average', 'vote_count', 'revenue']]

In [4]:
def run_regression(x_list, y_str, df):
    """Fit an ordinary least squares model and return its summary.

    Parameters
    ----------
    x_list : list of str
        Names of the columns of ``df`` used as explanatory variables.
    y_str : str
        Name of the column of ``df`` used as the response variable.
    df : pandas.DataFrame
        Table containing every column named in ``x_list`` and ``y_str``.

    Returns
    -------
    The object returned by the fitted model's ``summary()`` method.
    """
    # add_constant supplies the intercept column (reported as 'const').
    X = sm.add_constant(df[x_list])
    y = df[y_str]
    # Note the argument order: sm.OLS(response, design matrix).
    model = sm.OLS(y, X).fit()
    # The in-sample predictions the original computed were never used, so the
    # extra model.predict(X) call has been removed.
    return model.summary()

In [6]:
# Regress revenue on the sentiment score, budget and the two vote columns,
# using the table merged from the movie CSV; the summary is the cell output.
run_regression(
    x_list=['sentiment score', 'budget', 'vote_average', 'vote_count'],
    y_str='revenue',
    df=make_df(movie_file='tmdb/tmdb_5000_movies.csv'),
)

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.646
Model:,OLS,Adj. R-squared:,0.643
Method:,Least Squares,F-statistic:,226.6
Date:,"Thu, 02 May 2019",Prob (F-statistic):,1.4600000000000002e-110
Time:,00:34:14,Log-Likelihood:,-10256.0
No. Observations:,502,AIC:,20520.0
Df Residuals:,497,BIC:,20540.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.033e+08,8.04e+07,-1.285,0.199,-2.61e+08,5.46e+07
sentiment score,1.026e+08,1.23e+08,0.832,0.406,-1.4e+08,3.45e+08
budget,1.9744,0.171,11.570,0.000,1.639,2.310
vote_average,5.429e+06,1.33e+07,0.409,0.682,-2.06e+07,3.15e+07
vote_count,7.232e+04,5321.050,13.592,0.000,6.19e+04,8.28e+04

0,1,2,3
Omnibus:,271.595,Durbin-Watson:,1.779
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3343.11
Skew:,2.073,Prob(JB):,0.0
Kurtosis:,14.943,Cond. No.,1790000000.0
