import stuff (a lot of stuff) & read in data

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
import datetime as dt
pd.options.mode.chained_assignment = None
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
pd.options.display.max_rows = 1000
pd.options.display.max_columns= 1000
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('../datasets/ks-projects-201801.csv')

In [3]:
df = data.copy()

# data cleaning / transforming

In [19]:
df['titleLength'] = df['name'].apply(lambda x: len(str(x)))

In [20]:
df.drop(columns=['category','backers','ID','name','state','pledged','usd pledged','goal'], inplace=True)

In [21]:
df

Unnamed: 0,main_category,currency,deadline,launched,country,usd_pledged_real,usd_goal_real,titleLength
0,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,GB,0.0,1533.95,31
1,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,US,2421.0,30000.00,45
2,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,US,220.0,45000.00,14
3,Music,USD,2012-04-16,2012-03-17 03:24:11,US,1.0,5000.00,49
4,Film & Video,USD,2015-08-29,2015-07-04 08:35:03,US,1283.0,19500.00,58
...,...,...,...,...,...,...,...,...
378656,Film & Video,USD,2014-10-17,2014-09-17 02:35:30,US,25.0,50000.00,49
378657,Film & Video,USD,2011-07-19,2011-06-22 03:35:14,US,155.0,1500.00,9
378658,Film & Video,USD,2010-08-16,2010-07-01 19:40:30,US,20.0,15000.00,71
378659,Technology,USD,2016-02-13,2016-01-13 18:13:53,US,200.0,15000.00,24


In [11]:
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])
df = df[df['launched']>'2000-01-01']

In [12]:
df = data[(data['usd_pledged_real']<20000) & (data['usd_pledged_real']>0)]
df = df[(df['usd_goal_real']<20000) & (df['usd_goal_real']>0)]

In [13]:
df['logPledged'] = np.log(df['usd_pledged_real'])
df['logGoal'] = np.log(df['usd_goal_real'])

df.drop(columns=['usd_goal_real','usd_pledged_real'], inplace=True)

In [14]:
df['launchMonth'] = df['launched'].dt.month
df['launchDay'] = df['launched'].dt.dayofweek
df['launchHour'] = df['launched'].dt.hour

AttributeError: Can only use .dt accessor with datetimelike values

In [95]:
df['duration'] = (df['deadline']-df['launched'])/dt.timedelta(minutes=1)
df.drop(columns=['deadline','launched'],inplace=True)

In [96]:
X = df.drop(columns=['logPledged'])
y = df['logPledged']

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [98]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['launchMonth','launchDay','launchHour'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).columns) + ['launchMonth','launchDay','launchHour']

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', ce.OneHotEncoder(), categorical_features)])

In [30]:
from sklearn.model_selection import GridSearchCV

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor())])
param_grid = { 
    'regressor__max_depth' : [5,10,20],
    'regressor__min_samples_leaf': [10,50,100]
}

CV = GridSearchCV(rf, param_grid, n_jobs=1,verbose=2)
                  
CV.fit(X, y)  
print(CV.best_params_)    
print(CV.best_score_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] regressor__max_depth=5, regressor__min_samples_leaf=10 ..........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=10, total=   8.8s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=10 ..........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.8s remaining:    0.0s


[CV]  regressor__max_depth=5, regressor__min_samples_leaf=10, total=  15.0s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=10 ..........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=10, total=   9.3s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=50 ..........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=50, total=   9.9s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=50 ..........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=50, total=  10.0s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=50 ..........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=50, total=  10.3s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=100 .........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=100, total=   9.9s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=100 .........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=100, total=  12.9s
[CV] regressor__max_depth=5, regressor__min_samples_leaf=100 .........




[CV]  regressor__max_depth=5, regressor__min_samples_leaf=100, total=  15.8s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=10 .........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=10, total=  29.3s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=10 .........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=10, total=  27.5s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=10 .........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=10, total=  33.8s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50 .........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, total=  27.0s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50 .........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, total=  16.7s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50 .........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, total=  20.3s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=100 ........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=100, total=  20.9s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=100 ........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=100, total=  16.3s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=100 ........




[CV]  regressor__max_depth=10, regressor__min_samples_leaf=100, total=  18.9s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=10 .........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=10, total=  27.8s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=10 .........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=10, total=  33.6s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=10 .........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=10, total=  28.3s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=50 .........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=50, total=  25.3s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=50 .........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=50, total=  28.1s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=50 .........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=50, total=  28.2s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=100 ........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=100, total=  33.1s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=100 ........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=100, total=  26.1s
[CV] regressor__max_depth=20, regressor__min_samples_leaf=100 ........




[CV]  regressor__max_depth=20, regressor__min_samples_leaf=100, total=  18.9s


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  9.4min finished


{'regressor__max_depth': 20, 'regressor__min_samples_leaf': 100}
0.1440855187293938


In [112]:
y

3        0.00000
4        7.15696
6        7.09423
11       9.44936
15       6.49828
           ...  
378654   5.03695
378657   5.04343
378658   2.99573
378659   5.29832
378660   6.26149
Name: logPledged, Length: 245957, dtype: float64

In [14]:
regressors = [
        LogisitcRegression()
        DecisionTreeClassifier(max_depth=20, min_samples_leaf=50),
        RandomForestClassifier(max_depth=20, min_samples_leaf=50),
        MLPClassifier()
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    print("model score: %.3f" % pipe.score(X_test, y_test))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
model score: 0.069
DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=50,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
model score: 0.111




RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=50, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
model score: 0.146
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
model score: 0.143


In [15]:
numeric_features

Index(['titleLength', 'logGoal', 'duration'], dtype='object')

In [16]:
categorical_features 

['main_category',
 'currency',
 'country',
 'launchMonth',
 'launchDay',
 'launchHour']

In [17]:
regressors = [
        MLPRegressor(hidden_layer_sizes=(50,10,50), verbose=True),
        MLPRegressor(hidden_layer_sizes=(25,10,25), verbose=True)
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    print("model score: %.3f" % pipe.score(X_test, y_test))

Iteration 1, loss = 2.96807231
Iteration 2, loss = 2.56446634
Iteration 3, loss = 2.54674861
Iteration 4, loss = 2.53151186
Iteration 5, loss = 2.52504591
Iteration 6, loss = 2.51269126
Iteration 7, loss = 2.50581863
Iteration 8, loss = 2.50135791
Iteration 9, loss = 2.49812243
Iteration 10, loss = 2.49409654
Iteration 11, loss = 2.49038793
Iteration 12, loss = 2.48806069
Iteration 13, loss = 2.48409364
Iteration 14, loss = 2.48237889
Iteration 15, loss = 2.48184141
Iteration 16, loss = 2.47797948
Iteration 17, loss = 2.47605688
Iteration 18, loss = 2.47451411
Iteration 19, loss = 2.47149246
Iteration 20, loss = 2.47140902
Iteration 21, loss = 2.47170700
Iteration 22, loss = 2.46855527
Iteration 23, loss = 2.46677962
Iteration 24, loss = 2.46493043
Iteration 25, loss = 2.46474464
Iteration 26, loss = 2.46251633
Iteration 27, loss = 2.46024923
Iteration 28, loss = 2.45951591
Iteration 29, loss = 2.45856873
Iteration 30, loss = 2.45685183
Iteration 31, loss = 2.45501231
Iteration 32, los

Iteration 40, loss = 2.47285577
Iteration 41, loss = 2.47085144
Iteration 42, loss = 2.46874094
Iteration 43, loss = 2.47078063
Iteration 44, loss = 2.46760094
Iteration 45, loss = 2.46758327
Iteration 46, loss = 2.46549562
Iteration 47, loss = 2.46384665
Iteration 48, loss = 2.46323882
Iteration 49, loss = 2.46243582
Iteration 50, loss = 2.46113888
Iteration 51, loss = 2.45971028
Iteration 52, loss = 2.45825170
Iteration 53, loss = 2.45689490
Iteration 54, loss = 2.45752918
Iteration 55, loss = 2.45628993
Iteration 56, loss = 2.45527935
Iteration 57, loss = 2.45412296
Iteration 58, loss = 2.45367511
Iteration 59, loss = 2.45169243
Iteration 60, loss = 2.45024347
Iteration 61, loss = 2.44779608
Iteration 62, loss = 2.44829101
Iteration 63, loss = 2.44838606
Iteration 64, loss = 2.44637185
Iteration 65, loss = 2.44669668
Iteration 66, loss = 2.44338316
Iteration 67, loss = 2.44328457
Iteration 68, loss = 2.44212423
Iteration 69, loss = 2.44246755
Iteration 70, loss = 2.44030493
Iteratio



model score: 0.148


# testing geo/currency

In [99]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['launchMonth','launchDay','launchHour'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).drop(['currency'],axis=1)) + ['launchMonth','launchDay','launchHour']

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', ce.OneHotEncoder(), categorical_features)])

In [100]:
categorical_features

['main_category', 'country', 'launchMonth', 'launchDay', 'launchHour']

In [102]:
regressors = [
        RandomForestRegressor(max_depth=20, min_samples_leaf=50, n_estimators=20),
        MLPRegressor(verbose=True)
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    print("model score: %.3f" % pipe.score(X_test, y_test))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=50, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
model score: 0.150
Iteration 1, loss = 2.80873759
Iteration 2, loss = 2.45623672
Iteration 3, loss = 2.43627204
Iteration 4, loss = 2.42530815
Iteration 5, loss = 2.42091731
Iteration 6, loss = 2.40867663
Iteration 7, loss = 2.40531594
Iteration 8, loss = 2.40372776
Iteration 9, loss = 2.39997618
Iteration 10, loss = 2.39708740
Iteration 11, loss = 2.39461412
Iteration 12, loss = 2.39218906
Iteration 13, loss = 2.39374626
Iteration 14, loss = 2.39000960
Iteration 15, loss = 2.38742010
Iteration 16, loss = 2.38634258
Iteration 17, lo



model score: 0.142


# another gridsearch

In [33]:
from sklearn.model_selection import GridSearchCV

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor())])
param_grid = { 
    'regressor__max_depth' : [10,20],
    'regressor__min_samples_leaf': [50,100],
    'regressor__n_estimators':[10,20]
}

CV = GridSearchCV(rf, param_grid, n_jobs=1,verbose=2)
                  
CV.fit(X, y)  
print(CV.best_params_)    
print(CV.best_score_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=10 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=10, total=  11.4s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.5s remaining:    0.0s


[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=10, total=  10.9s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=10 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=10, total=  10.9s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=20 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=20, total=  21.0s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=20 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=20, total=  20.8s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=20 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=50, regressor__n_estimators=20, total=  21.7s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=100, regressor__n_estimators=10 
[CV]  regressor__max_dep

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  7.5min finished


{'regressor__max_depth': 20, 'regressor__min_samples_leaf': 50, 'regressor__n_estimators': 20}
0.1469055615840664


# final evaluation

In [54]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor(max_depth=20, min_samples_leaf=50, n_estimators=20))])
pipe.fit(X_train, y_train)   

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  Index(['titleLength', 'logGoal', 'duration'], dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(cols=None,
                                                                drop_invariant=False,
                                                                handle_missing='value',
                                              

In [55]:
y_pred = pipe.predict(X_test)

In [56]:
mae = metrics.mean_absolute_error(y_test,y_pred)
mse = metrics.mean_squared_error(y_test,y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
r2 = metrics.r2_score(y_test, y_pred)

In [57]:
print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  1.795077554986695
mse:  4.982071590726378
rmse:  2.2320554631832916
r2:  0.14888980633058158


In [62]:
df = pd.DataFrame(data=list(zip(list(y_test), list(y_pred))),columns=['actual','predicted'])
df['goal'] = list(X_test['logGoal'])

In [64]:
#unlog the values
for col in df.columns:
    df[col] = df[col].apply(lambda x: 10**x)

In [65]:
df

Unnamed: 0,actual,predicted,goal
0,4.851432e+09,1.849973e+07,2.466423e+10
1,2.804166e+07,5.158715e+05,2.081312e+06
2,8.218860e+08,1.949611e+06,7.973090e+08
3,6.282213e+07,6.097497e+05,6.603562e+10
4,4.879189e+08,1.948956e+07,4.097353e+08
...,...,...,...
59793,2.694816e+05,5.214472e+06,1.726996e+08
59794,1.572847e+07,1.557755e+07,3.289979e+08
59795,1.684897e+08,2.197475e+06,1.104713e+10
59796,2.151388e+05,1.452609e+06,7.185583e+06


In [66]:
df['predictedPercent'] = df['predicted']/df['goal']

In [67]:
df['actualPercent'] = df['actual']/df['goal']

In [75]:
# so things stop showing up in scientific notation
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [76]:
df.head(100)

Unnamed: 0,actual,predicted,goal,predictedPercent,actualPercent
0,4851432072.5831,18499733.91952,24664230324.55185,0.00075,0.1967
1,28041660.44956,515871.50677,2081312.46,0.24786,13.47307
2,821886022.2489,1949611.2723,797309020.00954,0.00245,1.03082
3,62822125.89978,609749.67432,66035623752.49438,1e-05,0.00095
4,487918884.4092,19489562.92835,409735290.38049,0.04757,1.19081
5,282661748.62642,6949468.82761,66687741.70735,0.10421,4.23859
6,68857129.82449,17763200.75069,66687741.70735,0.26636,1.03253
7,717473710.75353,21755274.62277,196811123.3065,0.11054,3.64549
8,173511964.9036,33513123.5608,144716565.64648,0.23158,1.19898
9,399566882.67252,1730794.04102,8007326781.02461,0.00022,0.0499


In [82]:
df.sort_values(by="actualPercent", ascending=False)

Unnamed: 0,actual,predicted,goal,predictedPercent,actualPercent
53244,5114386817.14458,53110.58859,0.93227,56969.22638,5485961803.60920
31464,4656057677.27076,53433.49888,1.00000,53433.49888,4656057677.27076
7137,2219730826.20441,100811.93095,1.00000,100811.93095,2219730826.20441
36886,40287.48771,348470.56416,0.00002,14039003569.28536,1623081665.62811
46169,428322704.56447,61470.55846,1.00000,61470.55846,428322704.56447
...,...,...,...,...,...
2841,4.93341,4987.61405,2634394093298117120.00000,0.00000,0.00000
27568,1.27163,29006.77916,2002313035480175616.00000,0.00000,0.00000
6574,1.00000,26678.58322,2634394093298117120.00000,0.00000,0.00000
57780,1.00000,3337210.92225,2634394093298117120.00000,0.00000,0.00000


In [77]:
mae = metrics.mean_absolute_error(df['actualPercent'],df['predictedPercent'])
mse = metrics.mean_squared_error(df['actualPercent'],df['predictedPercent'])
rmse = np.sqrt(metrics.mean_squared_error(df['actualPercent'],df['predictedPercent']))
r2 = metrics.r2_score(df['actualPercent'], df['predictedPercent'])

In [78]:
print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  444113.25714209676
mse:  3532685261192755.0
rmse:  59436396.77161423
r2:  -2.53700867884389
