In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from sklearn.preprocessing import PowerTransformer

In [2]:
train_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6716 entries, 0 to 6715
Data columns (total 62 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   popularity               6716 non-null   float64
 1   budget                   6716 non-null   float64
 2   revenue                  6716 non-null   float64
 3   runtime                  6716 non-null   float64
 4   vote_average             6716 non-null   float64
 5   vote_count               6716 non-null   float64
 6   profit_margin            6716 non-null   float64
 7   overview: life           6716 non-null   int64  
 8   overview: new            6716 non-null   int64  
 9   overview: young          6716 non-null   int64  
 10  overview: world          6716 non-null   int64  
 11  overview: man            6716 non-null   int64  
 12  overview: family         6716 non-null   int64  
 13  overview: love           6716 non-null   int64  
 14  overview: woman         

In [4]:
X_train = train_df.drop(columns='profit_margin')
y_train = train_df['profit_margin']
X_validation = validation_df.drop(columns='profit_margin')
y_validation = validation_df['profit_margin']
X_test = test_df.drop(columns='profit_margin')
y_test = test_df['profit_margin']

In [5]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

In [6]:
y_validation_pred = knn.predict(X_validation)

In [7]:
mean_squared_error(y_validation, y_validation_pred, squared=False)

0.687925768617416

In [8]:
mean_absolute_error(y_validation, y_validation_pred)

0.5452157531642169

In [9]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_validation_pca = pca.fit_transform(X_validation)
X_test_pca = pca.fit_transform(X_test)

In [10]:
# 85-95% of the data
# < 10 components
# other: polynomial transform, cyclical transform if data/time, t-sne, nmf
knn_pca = KNeighborsRegressor()
knn_pca.fit(X_train_pca, y_train)
y_validation_pca_pred = knn_pca.predict(X_validation_pca)

In [11]:
mean_squared_error(y_validation, y_validation_pca_pred, squared=False)

0.8955627826665833

In [12]:
mean_absolute_error(y_validation, y_validation_pca_pred)

0.7447418089087172

In [13]:
poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train)
X_validation_poly = poly.fit_transform(X_validation)
X_test_poly = poly.fit_transform(X_test)

In [14]:
knn_poly = KNeighborsRegressor()
knn_poly.fit(X_train_poly, y_train)
y_validation_poly_pred = knn_poly.predict(X_validation_poly)

In [15]:
mean_squared_error(y_validation, y_validation_poly_pred, squared=False)

0.7027693634506673

In [16]:
mean_absolute_error(y_validation, y_validation_poly_pred)

0.5611119388420843

In [17]:
nmf = NMF(n_components=2)
X_train_nmf = nmf.fit_transform(X_train)
X_validation_nmf = nmf.fit_transform(X_validation)
X_test_nmf = nmf.fit_transform(X_test)

In [18]:
knn_nmf = KNeighborsRegressor()
knn_nmf.fit(X_train_nmf, y_train)
y_validation_nmf_pred = knn_nmf.predict(X_validation_nmf)

In [19]:
mean_squared_error(y_validation, y_validation_nmf_pred, squared=False)

0.9295720490454715

In [20]:
mean_absolute_error(y_validation, y_validation_nmf_pred)

0.7296505313870757

In [21]:
tsne = TSNE()
X_train_tsne = tsne.fit_transform(X_train)
X_validation_tsne = tsne.fit_transform(X_validation)
X_test_tsne = tsne.fit_transform(X_test)

In [22]:
knn_tsne = KNeighborsRegressor()
knn_tsne.fit(X_train_tsne, y_train)
y_validation_tsne_pred = knn_tsne.predict(X_validation_tsne)

In [23]:
mean_squared_error(y_validation, y_validation_tsne_pred, squared=False)

0.9494226522066399

In [24]:
mean_absolute_error(y_validation, y_validation_tsne_pred)

0.7913151697750694

In [25]:
pt = PowerTransformer()
X_train_pt = pt.fit_transform(X_train)
X_validation_pt = pt.fit_transform(X_validation)
X_test_pt = pt.fit_transform(X_test)

In [26]:
knn_pt = KNeighborsRegressor()
knn_pt.fit(X_train_pt, y_train)
y_validation_pt_pred = knn_pt.predict(X_validation_pt)

In [27]:
mean_squared_error(y_validation, y_validation_pt_pred, squared=False)

0.7058374776107377

In [28]:
mean_absolute_error(y_validation, y_validation_pt_pred)

0.573939941618395