In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_validate

In [41]:
train_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6058 entries, 0 to 6057
Data columns (total 73 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   popularity               6058 non-null   float64
 1   budget                   6058 non-null   float64
 2   runtime                  6058 non-null   float64
 3   vote_average             6058 non-null   float64
 4   vote_count               6058 non-null   float64
 5   profit_margin            6058 non-null   float64
 6   overview: life           6058 non-null   int64  
 7   overview: new            6058 non-null   int64  
 8   overview: young          6058 non-null   int64  
 9   overview: man            6058 non-null   int64  
 10  overview: world          6058 non-null   int64  
 11  overview: family         6058 non-null   int64  
 12  overview: love           6058 non-null   int64  
 13  overview: woman          6058 non-null   int64  
 14  overview: story         

In [43]:
X_train = train_df.drop(columns='profit_margin')
y_train = train_df['profit_margin']
X_validation = validation_df.drop(columns='profit_margin')
y_validation = validation_df['profit_margin']
X_test = test_df.drop(columns='profit_margin')
y_test = test_df['profit_margin']

In [132]:
knn = KNeighborsRegressor()
y_validation_pred = knn.fit(X_train, y_train).predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_pred)
print(rmse)
print(mae)

0.7297008670625598
0.5862147672013993


In [133]:
y_test_pred = knn.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pred)
print(rmse)
print(mae)

0.74848615637268
0.6205282207673316


In [77]:
errors = []

for k in range(1, 11):
    knn = KNeighborsRegressor(n_neighbors=k)
    y_validation_pred = knn.fit(X_train, y_train).predict(X_validation)
    rmse = mean_squared_error(y_validation, y_validation_pred, squared=False)
    mae = mean_absolute_error(y_validation, y_validation_pred)
    errors.append([k, rmse, mae])

errors_df = pd.DataFrame(errors, columns=['k', 'rmse', 'mae'])
errors_df.set_index('k', inplace=True)
# errors_df.plot.bar(x='k')
# plt.show()
errors_df

Unnamed: 0_level_0,rmse,mae
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.982311,0.770256
2,0.815751,0.637314
3,0.76211,0.600762
4,0.743401,0.592645
5,0.729701,0.586215
6,0.72623,0.58396
7,0.719044,0.581447
8,0.715052,0.57935
9,0.713323,0.585036
10,0.711666,0.58603


In [45]:
fig = px.bar(errors_df, x='k', y=['rmse', 'mae'], barmode='group')
fig.update_layout(yaxis_title='error')
fig.show()

In [107]:
X_full_train = pd.concat([X_train, X_validation])
y_full_train = pd.concat([y_train, y_validation])
knn = KNeighborsRegressor() # using default n_neighbors = 5
knn_scores = cross_validate(knn, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
knn_scores['test_r2'].mean()

-0.0007576566474793766

In [131]:
errors_pca = []

for components in range(1, 11):
    pca = PCA(n_components=components)
    X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
    X_validation_pca = pd.DataFrame(pca.fit_transform(X_validation))
    X_test_pca = pd.DataFrame(pca.fit_transform(X_test))
    knn_pca = KNeighborsRegressor()
    knn_pca.fit(X_train_pca, y_train)
    y_validation_pca_pred = knn_pca.predict(X_validation_pca)
    rmse = mean_squared_error(y_validation, y_validation_pca_pred, squared=False)
    mae = mean_absolute_error(y_validation, y_validation_pca_pred)
    errors_pca.append([components, rmse, mae])

errors_pca_df = pd.DataFrame(errors_pca, columns=['components', 'rmse', 'mae'])
errors_pca_df.set_index('components', inplace=True)
# errors_df.plot.bar(x='k')
# plt.show()
errors_pca_df

Unnamed: 0_level_0,rmse,mae
components,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.846638,0.708632
2,0.816696,0.680411
3,0.813204,0.671662
4,0.846679,0.702631
5,0.852512,0.710436
6,0.841202,0.687216
7,0.827995,0.688087
8,0.844047,0.703035
9,0.829185,0.685159
10,0.833208,0.687813


In [134]:
pca = PCA(n_components=3)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
X_validation_pca = pd.DataFrame(pca.fit_transform(X_validation))
X_test_pca = pd.DataFrame(pca.fit_transform(X_test))

In [135]:
knn_pca = KNeighborsRegressor()
knn_pca.fit(X_train_pca, y_train)
y_validation_pca_pred = knn_pca.predict(X_validation_pca)

In [136]:
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    X_train_pca,
    labels=labels,
    dimensions=range(2)
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [137]:
sum = 0
for i, var in enumerate(pca.explained_variance_ratio_ * 100):
    print(i, var)
    sum += var
print(sum)

0 8.039474459567622
1 7.344917223416755
2 4.987494308437186
20.37188599142156


In [138]:
mean_squared_error(y_validation, y_validation_pca_pred, squared=False)

0.8137998394488192

In [139]:
mean_absolute_error(y_validation, y_validation_pca_pred)

0.6719269989109765

In [140]:
knn_pca_scores = cross_validate(knn, pd.concat([X_train_pca, X_validation_pca]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
knn_pca_scores['test_r2'].mean()

-0.17631850461446255

In [141]:
y_test_pca_pred = knn_pca.predict(X_test_pca)
rmse = mean_squared_error(y_test, y_test_pca_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pca_pred)
print(rmse)
print(mae)

0.8660872351848007
0.7093603283419359


In [116]:
poly = PolynomialFeatures()
X_train_poly = pd.DataFrame(poly.fit_transform(X_train))
X_validation_poly = pd.DataFrame(poly.fit_transform(X_validation))
X_test_poly = pd.DataFrame(poly.fit_transform(X_test))

In [53]:
knn_poly = KNeighborsRegressor()
knn_poly.fit(X_train_poly, y_train)
y_validation_poly_pred = knn_poly.predict(X_validation_poly)

In [54]:
mean_squared_error(y_validation, y_validation_poly_pred, squared=False)

0.7517339572147149

In [55]:
mean_absolute_error(y_validation, y_validation_poly_pred)

0.6107020927695729

In [117]:
knn_poly_scores = cross_validate(knn, pd.concat([X_train_poly, X_validation_poly]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
knn_poly_scores['test_r2'].mean()

-0.05303269738774391

In [142]:
y_test_poly_pred = knn_poly.predict(X_test_poly)
rmse = mean_squared_error(y_test, y_test_poly_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_poly_pred)
print(rmse)
print(mae)

0.7663638165652618
0.6364074364308808


In [118]:
nmf = NMF(n_components=2)
X_train_nmf = pd.DataFrame(nmf.fit_transform(X_train))
X_validation_nmf = pd.DataFrame(nmf.fit_transform(X_validation))
X_test_nmf = pd.DataFrame(nmf.fit_transform(X_test))


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.



In [57]:
knn_nmf = KNeighborsRegressor()
knn_nmf.fit(X_train_nmf, y_train)
y_validation_nmf_pred = knn_nmf.predict(X_validation_nmf)

In [58]:
mean_squared_error(y_validation, y_validation_nmf_pred, squared=False)

0.808973698343397

In [59]:
mean_absolute_error(y_validation, y_validation_nmf_pred)

0.6396932334177678

In [119]:
knn_nmf_scores = cross_validate(knn, pd.concat([X_train_nmf, X_validation_nmf]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
knn_nmf_scores['test_r2'].mean()

-0.17320121906556482

In [143]:
y_test_nmf_pred = knn_nmf.predict(X_test_nmf)
rmse = mean_squared_error(y_test, y_test_nmf_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_nmf_pred)
print(rmse)
print(mae)

0.8404353441160558
0.6669936526513788


In [120]:
tsne = TSNE()
X_train_tsne = pd.DataFrame(tsne.fit_transform(X_train))
X_validation_tsne = pd.DataFrame(tsne.fit_transform(X_validation))
X_test_tsne = pd.DataFrame(tsne.fit_transform(X_test))

In [61]:
knn_tsne = KNeighborsRegressor()
knn_tsne.fit(X_train_tsne, y_train)
y_validation_tsne_pred = knn_tsne.predict(X_validation_tsne)

In [62]:
mean_squared_error(y_validation, y_validation_tsne_pred, squared=False)

0.8421009977167205

In [63]:
mean_absolute_error(y_validation, y_validation_tsne_pred)

0.7035778404384397

In [121]:
knn_tsne_scores = cross_validate(knn, pd.concat([X_train_tsne, X_validation_tsne]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
knn_tsne_scores['test_r2'].mean()

-0.15653012056881288

In [144]:
y_test_tsne_pred = knn_tsne.predict(X_test_tsne)
rmse = mean_squared_error(y_test, y_test_tsne_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_tsne_pred)
print(rmse)
print(mae)

0.8585734625345905
0.7160793518458234


In [122]:
pt = PowerTransformer()
X_train_pt = pd.DataFrame(pt.fit_transform(X_train))
X_validation_pt = pd.DataFrame(pt.fit_transform(X_validation))
X_test_pt = pd.DataFrame(pt.fit_transform(X_test))

In [65]:
knn_pt = KNeighborsRegressor()
knn_pt.fit(X_train_pt, y_train)
y_validation_pt_pred = knn_pt.predict(X_validation_pt)

In [66]:
mean_squared_error(y_validation, y_validation_pt_pred, squared=False)

0.7315331878457773

In [67]:
mean_absolute_error(y_validation, y_validation_pt_pred)

0.5971074082290373

In [123]:
knn_pt_scores = cross_validate(knn, pd.concat([X_train_pt, X_validation_pt]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
knn_pt_scores['test_r2'].mean()

0.002775580779031217

In [145]:
y_test_pt_pred = knn_pt.predict(X_test_pt)
rmse = mean_squared_error(y_test, y_test_pt_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pt_pred)
print(rmse)
print(mae)

0.7607432479492361
0.6231975570306222
