In [None]:
import warnings
# Blanket-ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings emitted
# by the libraries used further down — consider narrowing it.
warnings.simplefilter('ignore')

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from config import *
import pandas as pd

# Read the raw dataset into a DataFrame. DATA_FILE is not defined in this
# notebook; presumably it is provided by the star-import of `config` above —
# TODO confirm.
data = pd.read_csv(DATA_FILE)
# Bare expression so the notebook renders the loaded frame as the cell output.
data

In [None]:
# Keep only the columns used from here on; 'gross_worldwide' is the column
# later separated out as the regression target.
df = data[
    [
        'year',
        'age_restriction',
        'duration',
        'imdb_rating',
        'imdb_count_scores',
        'genres',
        'users_count_reviews',
        'metascore',
        'budget',
        'gross_worldwide',
    ]
]
df

In [None]:
# Discard every row that has a missing value in any of the selected columns.
df = df.dropna(axis=0, how='any')
df

In [None]:
# Cast the release year to an integer dtype (the other columns are untouched).
df = df.assign(year=df["year"].astype(int))

In [None]:
import re

def time_handle(x: str):
    """Convert a duration string such as ``"2h 15m"`` into total minutes.

    The hour and minute components are each optional, so ``"2h"`` and
    ``"45m"`` are parsed as well as the combined form.

    Args:
        x: Duration text containing an ``<int>h`` and/or ``<int>m`` component.

    Returns:
        The total number of minutes as an ``int``, or ``None`` when neither an
        hour nor a minute component can be found.
    """
    # Exact "<H>h <M>m" form first, so previously accepted inputs parse
    # exactly as they did before.
    combined = re.search(r"(\d+)h (\d+)m", x)
    if combined:
        return int(combined.group(1)) * 60 + int(combined.group(2))

    # Fallback: hours-only ("2h"), minutes-only ("45m") or unspaced ("2h15m").
    hours = re.search(r"(\d+)\s*h", x)
    minutes = re.search(r"(\d+)\s*m", x)
    if hours is None and minutes is None:
        return None

    total = 0
    if hours is not None:
        total += int(hours.group(1)) * 60
    if minutes is not None:
        total += int(minutes.group(1))
    return total
    
def number_handle(x: str):
    """Convert an abbreviated count such as ``"1.2M"`` or ``"523K"`` to an int.

    Args:
        x: Count text: a number followed by ``M`` (millions) or ``K``
            (thousands), or a plain integer. Thousands commas are ignored.

    Returns:
        The count as an ``int``.

    Raises:
        ValueError: If ``x`` has no ``M``/``K`` suffix and is not a valid integer.
    """
    text = x.replace(',', '').strip()

    # The number may carry a decimal part ("1.2M"). Matching only "\d+" before
    # the suffix would capture just the digits after the dot.
    for suffix, multiplier in (("M", 1_000_000), ("K", 1_000)):
        match = re.search(r"(\d+(?:\.\d+)?)" + suffix, text)
        if match:
            # round() rather than int() so float noise cannot truncate the
            # result one below the intended value.
            return round(float(match.group(1)) * multiplier)

    return int(text)

def budget_handle(x: str):
    """Return the first run of digits in ``x`` as an int, ignoring commas.

    Args:
        x: Amount text, e.g. ``"$93,000,000 (estimated)"``.

    Returns:
        The first number found once all commas have been removed.

    Raises:
        AttributeError: If ``x`` contains no digits at all (the regex finds
            nothing and the match object is ``None``).
    """
    without_commas = "".join(x.split(','))
    first_number = re.search(r"(\d+)", without_commas)
    return int(first_number.group(1))


# Map each raw text column through its parser (defined earlier in this cell).
converters = {
    'duration': time_handle,
    'imdb_count_scores': number_handle,
    'users_count_reviews': number_handle,
    'budget': budget_handle,
    'gross_worldwide': budget_handle,
}
for column, convert in converters.items():
    df[column] = df[column].map(convert)

# 'genres' holds text: strip the first and last two characters, then split on
# "', '" to obtain a list of genre names per row.
df['genres'] = df['genres'].map(lambda raw: raw[2:-2].split("', '"))
df

In [None]:
# Print the column dtypes and non-null counts to check the conversions above.
df.info()

In [None]:
# Continuous feature columns that get standardised; the target
# 'gross_worldwide' is deliberately not part of this list.
cont_col = [
    'year',
    'duration',
    'imdb_rating',
    'imdb_count_scores',
    'users_count_reviews',
    'metascore',
    'budget',
]

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the continuous columns, overwriting them in df.
# NOTE(review): the scaler is fitted on the full frame, before the
# train/validation/test split performed further down, so statistics of the
# held-out rows leak into the training features.
scaler = StandardScaler()
df[cont_col] = scaler.fit_transform(df[cont_col])
df

In [None]:
# Renumber the rows 0..n-1 after the earlier dropna. drop=True discards the
# old index directly instead of materialising it as an 'index' column and
# deleting that column afterwards.
df = df.reset_index(drop=True)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# pop() removes the list-valued 'genres' column from df and hands it to the
# binarizer, which produces one 0/1 column per distinct genre.
genres_one_hot = mlb.fit_transform(df.pop('genres'))

# Give the indicator frame df's own index: concat(axis=1) aligns on index
# labels, so a default 0..n-1 index would misalign rows (and introduce NaNs)
# whenever df's index is not exactly 0..n-1.
genres_frame = pd.DataFrame(genres_one_hot, columns=mlb.classes_, index=df.index)
df = pd.concat([df, genres_frame], axis=1)
df

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'age_restriction' as a dense array, one column per category.
onehotencoder = OneHotEncoder(categories='auto', sparse_output=False)
# Double brackets select a single-column DataFrame, i.e. the 2-D input the
# encoder expects, without a manual numpy reshape.
df_new = onehotencoder.fit_transform(df[['age_restriction']])
cat = onehotencoder.categories_

# Reuse df's index so the column-wise concat pairs each encoded row with its
# source row even when df's index is not 0..n-1.
age_frame = pd.DataFrame(df_new, columns=cat[0], index=df.index)
df = pd.concat([df.drop(columns=['age_restriction']), age_frame], axis=1)

df

In [None]:
# Replace missing durations (rows where time_handle returned None) with the
# mean of the available ones.
# NOTE(review): 'duration' was already standardised by the scaling cell, so
# this mean is expected to be ~0 — confirm that is the intended fill value.
duration_mean = df['duration'].dropna().mean()
df['duration'] = df['duration'].fillna(duration_mean)

In [None]:
# Separate the regression target from the feature matrix.
target_column = 'gross_worldwide'
Y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the partitions, and every error curve computed from them, are
# identical on each Restart & Run All.
SPLIT_SEED = 42

# First hold out 20% of the rows as the test set ...
XX, X_test, YY, Y_test = train_test_split(
    X, Y, test_size=0.2, train_size=0.8, random_state=SPLIT_SEED
)
# ... then split the remaining 80% again 80/20 into train and validation.
X_train, X_val, Y_train, Y_val = train_test_split(
    XX, YY, test_size=0.2, train_size=0.8, random_state=SPLIT_SEED
)

In [None]:
from sklearn.metrics import mean_squared_error as mse

from sklearn.neighbors import KNeighborsRegressor

# Sweep n_neighbors over 1, 6, ..., 46 and record the RMSE on each partition.
coeffs = list(range(1, 50, 5))
err_train = []
err_val = []
err_test = []
for c in coeffs:
    knn_regressor = KNeighborsRegressor(n_neighbors=c)
    knn_regressor.fit(X_train, Y_train)
    train_predict = knn_regressor.predict(X_train)
    val_predict = knn_regressor.predict(X_val)
    test_predict = knn_regressor.predict(X_test)

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed
    # in recent scikit-learn releases. Arguments follow the documented
    # (y_true, y_pred) order.
    err_train.append(np.sqrt(mse(Y_train, train_predict)))
    err_val.append(np.sqrt(mse(Y_val, val_predict)))
    err_test.append(np.sqrt(mse(Y_test, test_predict)))

In [None]:
# Error curves of the k-nearest-neighbours sweep; labelled so the figure can
# be read on its own.
fig, ax = plt.subplots()
ax.plot(coeffs, err_train, label='train data')
ax.plot(coeffs, err_val, label='valid data')
ax.plot(coeffs, err_test, label='test data')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('RMSE')
ax.set_title('KNeighborsRegressor: RMSE vs n_neighbors')
ax.legend()
plt.show()

In [None]:
from sklearn.linear_model import Lasso

# Sweep the Lasso penalty alpha = c / 10 over 0.1 ... 4.9 and record the RMSE.
# The grid starts at 1 rather than 0: alpha=0 is plain least squares, which
# scikit-learn advises against fitting through Lasso for numerical reasons.
coeffs = list(range(1, 50))
err_train = []
err_val = []
err_test = []
for c in coeffs:
    lasso = Lasso(alpha=c / 10)
    lasso.fit(X_train, Y_train)
    train_predict = lasso.predict(X_train)
    val_predict = lasso.predict(X_val)
    test_predict = lasso.predict(X_test)

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed
    # in recent scikit-learn releases. Arguments follow the documented
    # (y_true, y_pred) order.
    err_train.append(np.sqrt(mse(Y_train, train_predict)))
    err_val.append(np.sqrt(mse(Y_val, val_predict)))
    err_test.append(np.sqrt(mse(Y_test, test_predict)))

In [None]:
# Error curves of the Lasso sweep. The loop fits with alpha = c / 10, so the
# x axis is rescaled once to show the actual alpha values.
alphas = np.array(coeffs) / 10

fig, ax = plt.subplots()
ax.plot(alphas, err_train, label='train data')
ax.plot(alphas, err_val, label='valid data')
ax.plot(alphas, err_test, label='test data')
ax.set_xlabel('alpha')
ax.set_ylabel('RMSE')
ax.set_title('Lasso: RMSE vs alpha')
ax.legend()
plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Sweep the maximum tree depth over 1..49 and record the RMSE per partition.
coeffs = list(range(1, 50))
err_train = []
err_val = []
err_test = []
for c in coeffs:
    # Fixed random_state: tree construction involves randomness, so without a
    # seed the curves can change between runs.
    tree = DecisionTreeRegressor(max_depth=c, random_state=42)
    tree.fit(X_train, Y_train)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed
    # in recent scikit-learn releases. Arguments follow the documented
    # (y_true, y_pred) order.
    err_train.append(np.sqrt(mse(Y_train, tree.predict(X_train))))
    err_val.append(np.sqrt(mse(Y_val, tree.predict(X_val))))
    err_test.append(np.sqrt(mse(Y_test, tree.predict(X_test))))

In [None]:
# Error curves of the decision-tree sweep; labelled so the figure can be read
# on its own.
fig, ax = plt.subplots()
ax.plot(coeffs, err_train, label='train data')
ax.plot(coeffs, err_val, label='valid data')
ax.plot(coeffs, err_test, label='test data')
ax.set_xlabel('max_depth')
ax.set_ylabel('RMSE')
ax.set_title('DecisionTreeRegressor: RMSE vs max_depth')
ax.legend()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Sweep the number of trees in the forest over 1..49 and record the RMSE.
coeffs = list(range(1, 50))
err_train = []
err_val = []
err_test = []
for c in coeffs:
    # Fixed random_state: bootstrap sampling makes the forest stochastic, so
    # without a seed the curves change on every run.
    forest = RandomForestRegressor(n_estimators=c, random_state=42)
    forest.fit(X_train, Y_train)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed
    # in recent scikit-learn releases. Arguments follow the documented
    # (y_true, y_pred) order.
    err_train.append(np.sqrt(mse(Y_train, forest.predict(X_train))))
    err_val.append(np.sqrt(mse(Y_val, forest.predict(X_val))))
    err_test.append(np.sqrt(mse(Y_test, forest.predict(X_test))))

In [None]:
# Error curves of the random-forest sweep; labelled so the figure can be read
# on its own.
fig, ax = plt.subplots()
ax.plot(coeffs, err_train, label='train data')
ax.plot(coeffs, err_val, label='valid data')
ax.plot(coeffs, err_test, label='test data')
ax.set_xlabel('n_estimators')
ax.set_ylabel('RMSE')
ax.set_title('RandomForestRegressor: RMSE vs n_estimators')
ax.legend()
plt.show()