# Worldwide Gross Models

## Import libraries

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

## Getting the data

In [12]:
df_movies = pd.read_csv('../data/processed/pixar_movies_complete.csv')
df_movies

Unnamed: 0,Year Released,Movie,Length,RT Score,IMDB Score,Metacritic Score,Opening Weekend,Worldwide Gross,Domestic Gross,International Gross,Domestic %,International %,Production Budget,Oscars Nominated,Oscars Won
0,1995,Toy Story,81,100,8.3,92,29.14,362.0,191.8,170.2,52.98,47.02,30,3.0,0.0
1,1998,A Bug's Life,96,92,7.2,77,33.26,363.4,162.8,200.6,44.8,55.2,45,1.0,0.0
2,1999,Toy Story 2,92,100,7.9,88,57.39,485.0,245.9,239.2,50.7,49.32,90,1.0,0.0
3,2001,"Monsters, Inc.",90,96,8.1,78,62.58,528.8,255.9,272.9,48.39,51.61,115,3.0,1.0
4,2003,Finding Nemo,104,99,8.2,90,70.25,895.6,339.7,555.9,37.93,62.07,94,4.0,1.0
5,2004,The Incredibles,115,97,8.0,90,70.47,631.4,261.4,370.0,41.4,58.6,92,4.0,2.0
6,2006,Cars,116,74,7.2,73,60.12,462.0,244.1,217.9,52.84,47.16,70,2.0,0.0
7,2007,Ratatouille,111,96,8.0,96,47.0,623.7,206.4,417.3,33.09,66.91,150,5.0,1.0
8,2008,WALL-E,97,96,8.4,94,63.1,521.3,223.8,297.5,42.93,57.07,180,6.0,1.0
9,2009,Up,96,98,8.3,88,68.11,731.3,293.0,438.3,40.07,59.93,175,5.0,2.0


## Droping columns

In [13]:
df_movies.drop(['Domestic Gross', 'International Gross', 'Domestic %', 'International %', 'Oscars Nominated', 'Oscars Won', 'Movie'], axis=1, inplace=True)
df_movies

Unnamed: 0,Year Released,Length,RT Score,IMDB Score,Metacritic Score,Opening Weekend,Worldwide Gross,Production Budget
0,1995,81,100,8.3,92,29.14,362.0,30
1,1998,96,92,7.2,77,33.26,363.4,45
2,1999,92,100,7.9,88,57.39,485.0,90
3,2001,90,96,8.1,78,62.58,528.8,115
4,2003,104,99,8.2,90,70.25,895.6,94
5,2004,115,97,8.0,90,70.47,631.4,92
6,2006,116,74,7.2,73,60.12,462.0,70
7,2007,111,96,8.0,96,47.0,623.7,150
8,2008,97,96,8.4,94,63.1,521.3,180
9,2009,96,98,8.3,88,68.11,731.3,175


## Model with Raw DataFrame

In [14]:
X = df_movies.drop('Worldwide Gross', axis=1).values
y = df_movies['Worldwide Gross'].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [15]:
sc_x = StandardScaler().fit(X)
sc_y = StandardScaler().fit(y)

X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)

In [16]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [17]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('r2: ', r2)
print('mse: ', mse)

r2:  0.7674652809664627
mse:  0.15881664427556952
