# Casusopdracht 


## Inhoudsopgave



## Inleiding

In [46]:
import pandas as pd
import numpy as np
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from itertools import combinations
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score


## 1. Modeling

In [47]:
# Load the movie dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("vraag_1_movies.csv")

### 1.1 Baseline model

In [48]:
def baseline(X, target=None):
    """Predict the mean of a reference target for every row of ``X``.

    Parameters
    ----------
    X : sized
        Only ``len(X)`` is used; it fixes the number of predictions returned.
    target : array-like, optional
        Values whose mean becomes the constant prediction. Defaults to the
        module-level ``df["gross"]``, which is the original behaviour. Pass the
        training target here so the baseline does not see evaluation rows.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(X)`` filled with the mean of ``target``.
    """
    if target is None:
        # Backward-compatible default: mean over the whole dataset's gross column.
        target = df["gross"]
    return np.full(len(X), fill_value=np.mean(target))

# RMSE of the constant-mean baseline, scored against the full gross column.
root_mean_squared_error(y_true=df["gross"], y_pred=baseline(df["gross"]))

67968859.53127213

In [49]:
def procmse(X, x):
    """Return the absolute percentage error of each prediction.

    Despite the name this is not a squared-error metric: element ``k`` of the
    result is ``|X[k] - x[k]| / |X[k]|``, i.e. the error expressed as a
    fraction of the true value.

    Parameters
    ----------
    X : array-like
        True values.
    x : array-like
        Predicted values; paired with ``X`` by position, not by pandas index.

    Returns
    -------
    list of float
        One relative error per element. A true value of 0 yields ``inf``
        (``nan`` when the prediction is 0 as well).

    Raises
    ------
    ValueError
        If ``X`` and ``x`` do not have the same shape.
    """
    # np.asarray drops any pandas index, so pairing stays positional as with zip.
    actual = np.asarray(X, dtype=float)
    predicted = np.asarray(x, dtype=float)
    if actual.shape != predicted.shape:
        # zip() would silently truncate to the shorter input; fail loudly instead.
        raise ValueError(
            f"shape mismatch: true values {actual.shape} vs predictions {predicted.shape}"
        )
    # Divide by |actual| so a negative true value cannot flip the sign of the error.
    relative_error = np.abs(actual - predicted) / np.abs(actual)
    return relative_error.tolist()


# Quartiles and 99th percentile of the baseline's relative errors on the full dataset.
baseline_rel_errors = procmse(df["gross"], baseline(df["gross"]))
np.percentile(baseline_rel_errors, [25, 50, 75, 99])

array([4.25415462e-01, 9.08541204e-01, 8.10970390e+00, 5.81224484e+03])

In [50]:
# Mean relative error of the baseline over the full dataset.
baseline_rel_errors = procmse(df["gross"], baseline(df["gross"]))
np.mean(baseline_rel_errors)

353.35401507347353

Bij 50% van de voorspellingen wijkt het baselinemodel minder dan 91% af van de werkelijke waarde (de mediaan van de relatieve fout is 0,91).

### 1.2 Data splitsen

In [51]:

# Separate the target column from the feature columns.
y = df["gross"]
X = df.drop(columns="gross")

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


### 1.3 Model keuze

We werken hier met een continue targetvariabele en met kwantitatieve featurevariabelen; om deze reden gebruiken we een lineair regressiemodel.

In [52]:

# Linear regression estimator from scikit-learn, created with its default settings.
model = LinearRegression()


### 1.4 Model trainen

In [53]:
# Fit on the training split and predict the held-out rows in one chained call
# (fit returns the estimator itself).
y_pred = model.fit(X_train, y_train).predict(X_test)


### 1.5 Evaluatie

In [54]:
# Score the test-set predictions once, then report both metrics.
test_rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
test_r2 = r2_score(y_test, y_pred)
print(f"mrse: {test_rmse}")
print(f"r2 score at {test_r2}")

mrse: 54495229.205467544
r2 score at 0.45713382109378764


In [55]:
# Quartiles and 99th percentile of the model's relative errors on the test split.
model_rel_errors = procmse(y_test, y_pred)
np.percentile(model_rel_errors, [25, 50, 75, 99])

array([3.61672493e-01, 6.82578554e-01, 3.18012988e+00, 1.83514380e+03])

In [56]:
# Mean relative error of the model on the test split.
model_rel_errors = procmse(y_test, y_pred)
np.mean(model_rel_errors)

90.21623269630646

### 1.6 Finetuning

In [57]:
# Display the frame to recall the available columns before the feature-subset search.
df

Unnamed: 0,gross,imdb_score,num_critic_for_reviews,num_voted_users,num_user_for_reviews,movie_facebook_likes
0,760505847.0,7.9,723.0,886204,3054.0,33000
1,309404152.0,7.1,302.0,471220,1238.0,0
2,200074175.0,6.8,602.0,275868,994.0,85000
3,448130642.0,8.5,813.0,1144337,2701.0,164000
4,73058679.0,6.6,462.0,212204,738.0,24000
...,...,...,...,...,...,...
4116,70071.0,6.3,35.0,589,35.0,74
4117,2040920.0,6.9,56.0,52055,130.0,0
4118,4584.0,6.4,14.0,1338,14.0,413
4119,10443.0,6.3,14.0,1255,9.0,660


In [58]:


# Enumerate every non-empty subset of the feature columns.
columns = X.columns
all_combinations = [
    subset
    for size in range(1, len(columns) + 1)
    for subset in combinations(columns, size)
]

# The row assignment of the split depends only on the number of rows and the
# seed, so split once and select columns per subset inside the loop instead of
# re-splitting on every iteration.
# NOTE(review): test_size is 0.20 here but 0.25 in section 1.2, so these scores
# are not measured on the same rows as the scores in section 1.5.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    df[list(columns)], df['gross'], test_size=0.20, random_state=42
)

# Infinite sentinels: the first evaluated subset always replaces them, and the
# result names below exist even if no subset is evaluated.
best_r2 = -np.inf
best_mrse = np.inf
best_r2_combo = None
best_mrse_combo = None
best_model = None

# NOTE(review): subsets are ranked on the test split itself, so the winning
# scores are optimistic; consider ranking on a validation split or with
# cross-validation and keeping the test split for the final check only.
for combo in all_combinations:
    combo = list(combo)
    X_train, X_test = X_train_all[combo], X_test_all[combo]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Compute each metric once per subset.
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    if rmse < best_mrse:
        best_mrse = rmse
        best_mrse_combo = combo
        best_model = model  # fitted model of the lowest-RMSE subset
    if r2 > best_r2:
        best_r2 = r2
        best_r2_combo = combo

print(f"{best_mrse_combo} has the best mrse at {best_mrse}")
print(f"{best_r2_combo} has the best r2 score at {best_r2}")

['imdb_score', 'num_voted_users', 'num_user_for_reviews'] has the best mrse at 53186030.24448845
['imdb_score', 'num_voted_users', 'num_user_for_reviews'] has the best r2 score at 0.471732387731122
