In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import datetime
import statsmodels.api as sm

In [2]:
def load_sentiment_scores_from_file(file):
    """Load per-movie sentiment scores from a whitespace-delimited text file.

    Every non-blank line is split on whitespace; the final token is parsed
    as the score and all preceding tokens, re-joined with single spaces,
    form the movie title (so titles may contain spaces).

    Parameters
    ----------
    file : str or path-like
        Path of the score file to read.

    Returns
    -------
    dict
        Maps each title to its score as a ``float``. When a title occurs on
        several lines, the value from the last such line is kept.

    Raises
    ------
    ValueError
        If the final token of a non-blank line cannot be parsed as a float.
    """
    movie_scores_dict = {}

    with open(file, 'r') as input_file:
        for line in input_file:
            tokens = line.split()
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) carry no record; skip them rather than fail
            # when indexing the last token.
            if not tokens:
                continue
            *title_tokens, score_token = tokens
            movie_scores_dict[' '.join(title_tokens)] = float(score_token)

    return movie_scores_dict

In [3]:
def make_df(movie_file, scores_file='intermediates/movie_log'):
    """Join sentiment scores with movie metadata into one regression table.

    Parameters
    ----------
    movie_file : str or path-like
        CSV file that provides at least the columns ``title``, ``revenue``,
        ``budget``, ``vote_average`` and ``vote_count``.
    scores_file : str or path-like, optional
        Sentiment score file passed to ``load_sentiment_scores_from_file``.
        Defaults to ``'intermediates/movie_log'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie``, ``sentiment score``, ``budget``, ``vote_average``,
        ``vote_count`` and ``revenue``. The join is an inner merge on the
        title, so movies present in only one of the two inputs are dropped.
    """
    scores_dict = load_sentiment_scores_from_file(scores_file)
    df = pd.DataFrame(list(scores_dict.items()), columns=['movie', 'sentiment score'])

    # Only the columns selected below are used downstream, so no date
    # parsing is requested when reading the CSV.
    df_movies = pd.read_csv(movie_file)
    df_movies = df_movies[['title', 'revenue', 'budget', 'vote_average', 'vote_count']]

    merged = df.merge(df_movies, left_on='movie', right_on='title', how='inner')
    # Drop the redundant 'title' key and put the dependent variable last.
    merged = merged[['movie', 'sentiment score', 'budget', 'vote_average', 'vote_count', 'revenue']]
    return merged

In [4]:
def run_regression(x_list, y_str, df):
    """Fit an ordinary-least-squares model with an intercept and summarize it.

    Parameters
    ----------
    x_list : list of str
        Names of the columns of ``df`` used as explanatory variables.
    y_str : str
        Name of the column of ``df`` used as the dependent variable.
    df : pandas.DataFrame
        Table holding both the explanatory and dependent columns.

    Returns
    -------
    The object returned by ``model.summary()`` for the fitted model.
    """
    X = df[x_list]
    y = df[y_str]
    # Prepend a constant column so the model estimates an intercept term.
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()  # sm.OLS(output, input)
    return model.summary()

In [8]:
# Regress revenue on the sentiment score together with budget and vote statistics.
run_regression(
    x_list=['sentiment score', 'budget', 'vote_average', 'vote_count'],
    y_str='revenue',
    df=make_df('tmdb/tmdb_5000_movies.csv'),
)

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.645
Model:,OLS,Adj. R-squared:,0.642
Method:,Least Squares,F-statistic:,225.4
Date:,"Thu, 02 May 2019",Prob (F-statistic):,4.4000000000000005e-110
Time:,18:54:14,Log-Likelihood:,-10236.0
No. Observations:,501,AIC:,20480.0
Df Residuals:,496,BIC:,20500.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.587e+07,8.15e+07,-1.176,0.240,-2.56e+08,6.43e+07
sentiment score,-1.795e+07,7.52e+07,-0.239,0.811,-1.66e+08,1.3e+08
budget,1.9870,0.171,11.618,0.000,1.651,2.323
vote_average,7.224e+06,1.33e+07,0.543,0.587,-1.89e+07,3.34e+07
vote_count,7.197e+04,5324.830,13.516,0.000,6.15e+04,8.24e+04

0,1,2,3
Omnibus:,272.812,Durbin-Watson:,1.779
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3435.745
Skew:,2.082,Prob(JB):,0.0
Kurtosis:,15.135,Cond. No.,1240000000.0
