## Libraries

In [1]:
# Matplotlib forms basis for visualization in Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import plotly.express as px
import xgboost as xgb

import seaborn as sns

sns.set()

# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'

In [2]:
from fast_ml.model_development import train_valid_test_split

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
#from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import requests
from bs4 import BeautifulSoup
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
#!pip install fast-ml

## Take a first look at the data

In [5]:
cin = pd.read_csv('cin_prep.csv')

In [6]:
cin.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,war_symb_title,point_symb_title,movie_description,genres_split
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,237000000.0,2009.0,936.0,7.9,1.78,33000,4,0,A paraplegic Marine dispatched to the moon Pan...,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,300000000.0,2007.0,5000.0,7.1,2.35,0,8,0,"Captain Barbossa, Will Turner and Elizabeth Sw...","['Action', 'Adventure', 'Fantasy']"
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,245000000.0,2015.0,393.0,6.8,2.35,85000,1,0,A cryptic message from James Bond's past sends...,"['Action', 'Adventure', 'Thriller']"
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,250000000.0,2012.0,23000.0,8.5,2.35,164000,3,0,"Eight years after the Joker's reign of chaos, ...","['Action', 'Thriller']"
4,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,263700000.0,2012.0,632.0,6.6,2.35,24000,3,0,"Transported to Barsoom, a Civil War vet discov...","['Action', 'Adventure', 'Sci-Fi']"


In [7]:
cin.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'war_symb_title',
       'point_symb_title', 'movie_description', 'genres_split'],
      dtype='object')

# Let's drop columns that we won't use now

In [8]:
df = cin

In [9]:
df = df.drop(['aspect_ratio'], axis=1)

In [10]:
df = df.drop(['genres_split'], axis=1)

In [11]:
df = df.drop(['point_symb_title'], axis=1)

In [12]:
df = df.drop(['movie_imdb_link'], axis=1)

In [13]:
df = df.drop(['war_symb_title'], axis=1)

In [14]:
df = df.drop(['movie_description'], axis=1)

In [15]:
df = df.drop(['facenumber_in_poster'], axis=1)

In [16]:
df = df.drop(['actor_2_name'], axis=1)

In [17]:
df = df.drop(['movie_title'], axis=1)

In [18]:
df = df.drop(['actor_3_name'], axis=1)

In [19]:
df = df.drop(['plot_keywords'], axis=1)

# Let's split our data in train, test, val sets

In [20]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(df, target = 'gross', 
                                                                            method='sorted', sort_by_col='gross',
                                                                            train_size=0.8, valid_size=0.1, 
                                                                            test_size=0.1)

print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(3293, 20)
(3293,)
(411, 20)
(411,)
(413, 20)
(413,)


(None, None)

## Now we will work only with train set. Let's do encoding

In [21]:
X_train.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,genres,actor_1_name,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,movie_facebook_likes
3039,Color,Ekachai Uekrongtham,66.0,96.0,3.0,305.0,2000.0,Action|Crime|Thriller,Michael Jai White,5228,2802,38,English,Thailand,R,9000000.0,2014.0,368.0,5.7,0
3903,Color,Frank Whaley,9.0,96.0,436.0,4.0,474.0,Comedy|Drama,Lynn Cohen,480,920,21,English,USA,R,1500000.0,2001.0,436.0,5.4,47
3902,Color,Brian Trenchard-Smith,8.0,88.0,53.0,176.0,563.0,Action|Drama,David Keith,783,1284,12,English,Germany,R,1000000.0,2006.0,288.0,4.1,42
3429,Color,Robert Sarkies,43.0,100.0,0.0,3.0,109.0,Crime|Drama,William Kircher,2776,122,22,English,New Zealand,R,98058050.0,2006.0,10.0,7.3,285
3213,Color,Ian Fitzgibbon,54.0,88.0,11.0,415.0,1000.0,Action|Comedy|Crime|Drama|Romance|Thriller,Jim Broadbent,5133,2728,31,English,Ireland,R,41033820.0,2009.0,418.0,6.4,663


# cat val encoding

## one hot encoding

In [22]:
data = X_train

### color

In [23]:
encoded_colors = pd.get_dummies(data['color'], prefix='color')
data = pd.concat([data, encoded_colors], axis=1)
data.drop('color', axis=1, inplace=True)

### genres_split

In [24]:
data['genres'] = data['genres'].fillna('')
unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres.split('|'))
genre_columns = []
for genre in unique_genres:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
    genre_columns.append(genre)
data['genre_OOV'] = data['genres'].apply(lambda x: 1 if not any(genre in x.split('|') for genre in genre_columns) else 0)
data.drop('genres', axis=1, inplace=True)

### content_rating

In [25]:
unique_ratings = data['content_rating'].unique()
rating_columns = []
for rating in unique_ratings:
    data[f"rating_{rating}"] = data['content_rating'].apply(lambda x: 1 if x == rating else 0)
    rating_columns.append(f"rating_{rating}")
data['content_rating_OOV'] = data['content_rating'].apply(lambda x: 1 if x not in unique_ratings else 0)
#data.drop('content_rating', axis=1, inplace=True)

## Let's separate countries and languages in 2 categories

### country

In [26]:
data['USA'] = data['country'].apply(lambda x: 1 if x == 'USA' else 0)
data['not_USA'] = data['country'].apply(lambda x: 1 if x != 'USA' else 0)
data.drop('country', axis=1, inplace=True)

### language

In [27]:
data['English'] = data['language'].apply(lambda x: 1 if x == 'English' else 0)
data['not_English'] = data['language'].apply(lambda x: 1 if x != 'English' else 0)
data.drop('language', axis=1, inplace=True)

## Bucketing of director_name by budget

In [28]:
director_avg_budget = data.groupby('director_name')['budget'].mean()
num_buckets = 5
bucket_labels = [f'bucket {i+1}' for i in range(num_buckets)]
data['director_bucket'] = pd.qcut(data['director_name'].map(director_avg_budget), 
                                  q=num_buckets, labels=bucket_labels)

In [29]:
encoded_director = pd.get_dummies(data['director_bucket'], prefix='Director')
data = pd.concat([data, encoded_director], axis=1)

In [30]:
data.drop('director_bucket', axis=1, inplace=True)
data.drop('director_name', axis=1, inplace=True)

## Bucketing of actor_1_name by actor_1_fb_likes

In [31]:
actor_1_avg_likes = data.groupby('actor_1_name')['actor_1_facebook_likes'].mean()
num_buckets = 5
bucket_labels = [f'bucket {i+1}' for i in range(num_buckets)]
data['actor_1_bucket'] = pd.qcut(data['actor_1_name'].map(actor_1_avg_likes),
                                 q=num_buckets, labels=bucket_labels)

In [32]:
encoded_actor_1 = pd.get_dummies(data['actor_1_bucket'], prefix='Actor1')
data = pd.concat([data, encoded_actor_1], axis=1)

In [33]:
data.drop('actor_1_bucket', axis=1, inplace=True)
data.drop('actor_1_name', axis=1, inplace=True)

## Now let's prepare X_test

In [34]:
encoded_colors = pd.get_dummies(X_test['color'], prefix='color')
X_test = pd.concat([X_test, encoded_colors], axis=1)
X_test.drop('color', axis=1, inplace=True)

In [35]:
X_test['genres'] = X_test['genres'].fillna('')
for genre in genre_columns:
    X_test[genre] = X_test['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
X_test['genre_OOV'] = X_test['genres'].apply(lambda x: 1 if not any(genre in x.split('|') for 
                                                                    genre in genre_columns) else 0)
X_test.drop('genres', axis=1, inplace=True)

In [36]:
unique_ratings = data['content_rating'].unique()
for rating in rating_columns:
    if rating in X_test.columns:
        X_test[rating] = X_test['content_rating'].apply(lambda x: 1 if x == rating.split('_')[1] else 0)
X_test['content_rating_OOV'] = X_test['content_rating'].apply(lambda x: 1 if x not in unique_ratings else 0)
X_test.drop('content_rating', axis=1, inplace=True)

In [37]:
X_test['USA'] = X_test['country'].apply(lambda x: 1 if x == 'USA' else 0)
X_test['not_USA'] = X_test['country'].apply(lambda x: 1 if x != 'USA' else 0)
X_test.drop('country', axis=1, inplace=True)

In [38]:
X_test['English'] = X_test['language'].apply(lambda x: 1 if x == 'English' else 0)
X_test['not_English'] = X_test['language'].apply(lambda x: 1 if x != 'English' else 0)
X_test.drop('language', axis=1, inplace=True)

In [39]:
actor_1_avg_likes = X_test.groupby('actor_1_name')['actor_1_facebook_likes'].mean()
num_buckets = 5
bucket_labels = [f'bucket {i+1}' for i in range(num_buckets)]
X_test['actor_1_bucket'] = pd.qcut(X_test['actor_1_name'].map(actor_1_avg_likes),
                                 q=num_buckets, labels=bucket_labels)

In [40]:
encoded_actor_1 = pd.get_dummies(X_test['actor_1_bucket'], prefix='Actor1')
X_test = pd.concat([X_test, encoded_actor_1], axis=1)

In [41]:
director_avg_budget = X_test.groupby('director_name')['budget'].mean()
num_buckets = 5
bucket_labels = [f'bucket {i+1}' for i in range(num_buckets)]
X_test['director_bucket'] = pd.qcut(X_test['director_name'].map(director_avg_budget), 
                                  q=num_buckets, labels=bucket_labels)

In [42]:
encoded_director = pd.get_dummies(X_test['director_bucket'], prefix='Director')
X_test = pd.concat([X_test, encoded_director], axis=1)

In [43]:
X_test.drop('director_bucket', axis=1, inplace=True)
X_test.drop('director_name', axis=1, inplace=True)

In [44]:
X_test.drop('actor_1_bucket', axis=1, inplace=True)
X_test.drop('actor_1_name', axis=1, inplace=True)

In [45]:
data.drop('content_rating', axis=1, inplace=True)

In [46]:
def remove_unknown_tokens(data, known_tokens, oov_token):
    data_encoded = data.copy()
    unknown_tokens = set(data.columns) - set(known_tokens)
    missing_tokens = set(known_tokens) - set(data.columns)
    data_encoded.loc[:, unknown_tokens] = oov_token
    data_encoded.loc[:, missing_tokens] = 0
    return data_encoded[known_tokens]

known_tokens = list(data.columns)

oov_token = -1

data_encoded = remove_unknown_tokens(data, known_tokens, oov_token)
X_test_encoded = remove_unknown_tokens(X_test, known_tokens, oov_token)

print("X_train before encoding: \n", data.columns)
print("\n X_test before encoding: \n", X_test.columns)
print("\n X_train after encoding: \n", data_encoded.columns)
#print("\n X_test after encoding: \n", X_test_encoded.columns)

X_train before encoding: 
 Index(['num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_1_facebook_likes', 'num_voted_users',
       'cast_total_facebook_likes', 'num_user_for_reviews', 'budget',
       'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'movie_facebook_likes', 'color_Black and White', 'color_Color',
       'Mystery', 'Animation', 'Western', 'Music', 'News', 'Thriller',
       'Horror', 'Biography', 'Fantasy', 'Romance', 'Film-Noir', 'Musical',
       'Short', 'Drama', 'Comedy', 'History', 'Crime', 'Sci-Fi', 'Documentary',
       'Family', 'Sport', 'War', 'Adventure', 'Action', 'genre_OOV',
       'rating_R', 'rating_PG-13', 'rating_Not Rated', 'rating_PG',
       'rating_Approved', 'rating_G', 'rating_NC-17', 'content_rating_OOV',
       'USA', 'not_USA', 'English', 'not_English', 'Director_bucket 1',
       'Director_bucket 2', 'Director_bucket 3', 'Director_bucket 4',
       'Director_bucket 5', 'Actor1_buc

 drop out -> 
 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes', 
 'num_voted_users',  'num_user_for_reviews'

### Let's drop out some columns

In [47]:
train_data_1 = data_encoded

In [48]:
test_data_1 = X_test_encoded

In [49]:
train_data_1.drop('actor_3_facebook_likes', axis=1, inplace=True)

In [50]:
test_data_1.drop('actor_3_facebook_likes', axis=1, inplace=True)

In [51]:
train_data_1.drop('actor_2_facebook_likes', axis=1, inplace=True)

In [52]:
test_data_1.drop('actor_2_facebook_likes', axis=1, inplace=True)

In [53]:
train_data_1.drop('actor_1_facebook_likes', axis=1, inplace=True)

In [54]:
test_data_1.drop('actor_1_facebook_likes', axis=1, inplace=True)

In [55]:
train_data_1.drop('num_voted_users', axis=1, inplace=True)

In [56]:
test_data_1.drop('num_voted_users', axis=1, inplace=True)

In [57]:
train_data_1.drop('num_user_for_reviews', axis=1, inplace=True)

In [58]:
test_data_1.drop('num_user_for_reviews', axis=1, inplace=True)

## Building a model

### RF Regressor

In [59]:
regression_model = RandomForestRegressor(n_estimators=50) 
regression_model.fit(train_data_1, y_train)

In [60]:
y_pred = regression_model.predict(test_data_1)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 3.5300072887351972e+16
Mean Absolute Error (MAE): 165351396.13128328
Mean Absolute Percentage Error (MAPE): 0.7609329808413399
R-squared Score: -3.27557515882469


### XGB Regressor

In [79]:
regression_model = xgb.XGBRFRegressor(learning_rate=0.9, subsample=0.8, colsample_bynode=0.8, reg_lambda=1e-9)
regression_model.fit(train_data_1, y_train)

In [80]:
y_pred = regression_model.predict(test_data_1)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 3.787098727140208e+16
Mean Absolute Error (MAE): 173145427.06779662
Mean Absolute Percentage Error (MAPE): 0.8032742568531692
R-squared Score: -3.5869665180150045


## TRied to make it better, but tired of waiting

param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],  # [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

regression_model = RandomForestRegressor()

grid_search = GridSearchCV(regression_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(data_encoded, y_train)

best_regression_model = grid_search.best_estimator_

best_regression_model.fit(data_encoded, y_train)

mse_scores = cross_val_score(best_regression_model, data_encoded, 
                             y_train, scoring='neg_mean_squared_error', cv=5)
mse_mean = -mse_scores.mean()

y_pred = best_regression_model.predict(X_test_encoded)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("R-squared Score:", r2)

param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bynode': [0.8, 0.9, 1.0],
    'reg_lambda': [1e-5, 1e-4, 1e-3]
}

regression_model = xgb.XGBRFRegressor()

grid_search = GridSearchCV(regression_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(data_encoded, y_train)

best_regression_model = grid_search.best_estimator_

best_regression_model.fit(data_encoded, y_train)

mse_scores = cross_val_score(best_regression_model, data_encoded, y_train, scoring='neg_mean_squared_error', cv=5)
mse_mean = -mse_scores.mean()

y_pred = best_regression_model.predict(X_test_encoded)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 5.066125921522722e+16
Mean Absolute Error (MAE): 205987429.9751816
Mean Absolute Percentage Error (MAPE): 0.9770687195402262
R-squared Score: -5.136135245573781

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bynode': [0.8, 0.9, 1.0],
    'reg_lambda': [1e-5, 1e-4, 1e-3]
}

regression_model = xgb.XGBRFRegressor()

grid_search = GridSearchCV(regression_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(data_encoded, y_train)

best_regression_model = grid_search.best_estimator_

best_regression_model.fit(data_encoded, y_train)

mse_scores = cross_val_score(best_regression_model, data_encoded, y_train, scoring='neg_mean_squared_error', cv=5)
mse_mean = -mse_scores.mean()

y_pred = best_regression_model.predict(X_test_encoded)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 5.06617276369964e+16
Mean Absolute Error (MAE): 205990957.03177965
Mean Absolute Percentage Error (MAPE): 0.9770913413548851
R-squared Score: -5.136191981220946

param_grid = {
    'n_estimators': [100, 300, 1000],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bynode': [0.8, 0.9, 1.0],
    'reg_lambda': [1e-5, 1e-4, 1e-3]
}

regression_model = xgb.XGBRFRegressor()

grid_search = GridSearchCV(regression_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(data_encoded, y_train)

best_regression_model = grid_search.best_estimator_

best_regression_model.fit(data_encoded, y_train)

mse_scores = cross_val_score(best_regression_model, data_encoded, y_train, scoring='neg_mean_squared_error', cv=5)
mse_mean = -mse_scores.mean()

y_pred = best_regression_model.predict(X_test_encoded)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 5.066095895413197e+16
Mean Absolute Error (MAE): 205987068.51846248
Mean Absolute Percentage Error (MAPE): 0.9770667590316227
R-squared Score: -5.136098877691934