In [1]:
%load_ext autoreload
%autoreload 2

In [49]:
import pandas as pd
import numpy as np
import streamlit as st
import pickle

from sklearn.model_selection import train_test_split
from python_files.data import GetData
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression


In [5]:
data = GetData().get_data()['AllDataMerged_updated']
data.head()

Unnamed: 0,id,budget,genres,imdb_id,original_language,production_companies,production_countries,release_date,revenue,runtime,...,actor5_name,actor5_gender,actor_number,director_name,director_gender,director_number,producer_name,producer_number,screeplay_name,editor_name
0,5,4000000,Crime|Comedy,tt0113101,en,Miramax Films,United States of America,25-12-1995,4300000,98.0,...,Marisa Tomei,1,24,Allison Anders,1.0,4,Lawrence Bender,1,none,Margaret Goodspeed
1,9,4000000,Drama,tt0425473,de,none,Germany,02-09-2004,4257354,15.0,...,none,0,2,Marc Meyer,0.0,2,Marc Meyer,1,none,Marc Meyer
2,11,11000000,Adventure|Action|Science Fiction,tt0076759,en,Lucasfilm,United States of America,25-05-1977,775398007,121.0,...,Alec Guinness,2,106,George Lucas,2.0,1,Gary Kurtz,2,none,Marcia Lucas
3,12,94000000,Animation|Family,tt0266543,en,Pixar Animation Studios,United States of America,30-05-2003,940335536,100.0,...,Brad Garrett,2,24,Andrew Stanton,2.0,1,Graham Walters,1,Andrew Stanton,David Ian Salter
4,13,55000000,Comedy|Drama|Romance,tt0109830,en,Paramount Pictures,United States of America,06-07-1994,677945399,142.0,...,Sally Field,1,67,Robert Zemeckis,2.0,1,Wendy Finerman,3,Eric Roth,Arthur Schmidt


In [6]:
df = data.copy()

In [7]:
df.shape

(8939, 35)

In [8]:
# Apply log feature to budget and revnue

df['budget_log'] = np.log(df['budget'])
df['revenue_log'] = np.log(df['revenue'])

In [16]:
# Define X and Y

X = df[['budget_log','runtime','production_companies_number','production_countries_number',
       'spoken_languages_number','director_number','producer_number','actor_number']]
y = df['revenue_log']

In [38]:
y.head()

0    15.274126
1    15.264158
2    20.468887
3    20.661747
4    20.334577
Name: revenue_log, dtype: float64

In [17]:
# Split data
X_tain, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [18]:
X_test.shape

(2682, 8)

In [19]:
# Split test data
X_test, X_pred, y_test, y_pred = train_test_split(X_test, y_test, test_size=0.05, random_state=1)

In [20]:
X_pred.shape

(135, 8)

In [27]:
# Build pipeline

pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('robust_scaler', RobustScaler())])
pipeline

In [28]:
# Fit and train

X_train_transformed = pipeline.fit_transform(X_tain)
X_test_trainsformed = pipeline.transform(X_test)

In [33]:
pd.DataFrame(X_test_trainsformed)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.692635,0.038462,-0.5,0.0,0.0,0.0,-0.5,-0.307692
1,0.548900,-0.346154,-1.0,0.0,0.0,0.0,0.0,-0.692308
2,-0.494087,-3.884615,0.0,0.0,0.0,0.0,-0.5,0.615385
3,-0.457807,-0.384615,-1.0,-1.0,0.0,0.0,-0.5,-0.076923
4,-0.771879,-0.269231,-0.5,0.0,0.0,0.0,-0.5,-0.615385
...,...,...,...,...,...,...,...,...
2542,-2.657457,-0.307692,-1.0,0.0,0.0,0.0,0.5,-0.307692
2543,0.700893,1.192308,0.5,0.0,0.0,0.0,2.0,1.769231
2544,0.346317,-0.423077,-1.0,0.0,1.0,0.0,-0.5,-0.615385
2545,0.804124,-0.230769,-0.5,2.0,0.0,0.0,-0.5,-0.461538


In [35]:
# Instantiate and train model

lin_model = LinearRegression()
lin_model.fit(X_train_transformed,y_train)

In [36]:
# Score the model
lin_model.score(X_test_trainsformed,y_test)

0.42099205379310556

In [45]:
# Predict model
y_pred_log = lin_model.predict(X_pred)
pd.DataFrame(y_pred_log).head()



Unnamed: 0,0
0,57.80471
1,49.607404
2,54.975272
3,60.437755
4,51.363009


In [44]:
# Convert log value to normal
y_pred = np.exp(y_pred_log)
pd.DataFrame(y_pred).head()

Unnamed: 0,0
0,1.271354e+25
1,3.501239e+21
2,7.506843e+23
3,1.769227e+26
4,2.026149e+22


In [53]:
X_pred.columns

Index(['budget_log', 'runtime', 'production_companies_number',
       'production_countries_number', 'spoken_languages_number',
       'director_number', 'producer_number', 'actor_number'],
      dtype='object')

In [50]:
# Save model
filename = 'xiaohu_predict_model.sav'
pickle.dump(lin_model, open(filename, 'wb'))

In [54]:
X_pred.drop(columns='budget_log').to_csv('X_pred.csv')

In [61]:
pd.DataFrame(X_pred)['budget_log']

3406    17.727534
8464    15.250595
1711    17.216708
1571    16.648724
1664    18.064006
          ...    
858     18.651792
433     16.523561
1510    17.147715
2217    16.380460
3649    17.622173
Name: budget_log, Length: 135, dtype: float64