In [0]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

In [0]:
# RMSLE in natural log 
def my_loss(pred, actual):
  assert len(pred) == len(actual)
  error2 = [(math.log(pred[i] + 1) - math.log(actual[i] - 1))**2 for i in range(len(pred))]
  result = np.sqrt(np.sum(error2) / len(pred))
  return result 

## Load the data

In [0]:
# Import the all dataset
tr = pd.read_csv('train_2.csv')

In [47]:
tr.shape

(3000, 353)

In [0]:
y = tr.revenue_log 
X = tr.drop('revenue_log', axis = 1)

In [49]:
X.shape

(3000, 352)

In [50]:
te = pd.read_csv('test_2.csv')
te.shape

(4398, 352)

In [51]:
X.shape

(3000, 352)

## Additional preprocessing

In [0]:
# Additional preprocessing (if neccesssary)
# Dimension Reduction 



In [0]:
# Split into train and Valid set
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size = .2, random_state = 36)

In [53]:
print("The size of the train set ", X_tr.shape)
print("The size of the validation set ", X_val.shape)
print("The size of the test set ", te.shape)

The size of the train set  (2400, 352)
The size of the validation set  (600, 352)
The size of the test set  (4398, 352)


In [0]:
del tr, X, y, 

In [55]:
X_tr.columns

Index(['runtime_h', 'runtime_m', 'runtime_log', 'budget_log', 'popularity_log',
       'status', 'is_collection', 'is_homepage', 'n_genres', 'n_prod_comp',
       ...
       'crew_Camera', 'crew_Costume & Make-Up', 'crew_Crew', 'crew_Directing',
       'crew_Editing', 'crew_Lighting', 'crew_Production', 'crew_Sound',
       'crew_Visual Effects', 'crew_Writing'],
      dtype='object', length=352)

### 1. Elastic-Net

In [0]:
X_tr.fillna(-1, inplace = True)
X_val.fillna(-1, inplace = True)
te.fillna(-1, inplace = True)

In [0]:
from sklearn.linear_model import ElasticNet 
model_el = ElasticNet(alpha = .03, l1_ratio = .05, max_iter = 2000)
model_el.fit(X_tr, y_tr)

ElasticNet(alpha=0.03, copy_X=True, fit_intercept=True, l1_ratio=0.05,
      max_iter=2000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [0]:
pred_el = model_el.predict(X_val)

In [0]:
np.sqrt(mean_squared_error(y_val, pred_el))

2.1266072116373356

In [0]:
# 1st try: 2.1347
# 2nd try: 2.1266

In [0]:
my_loss(pred_el, y)

### 2. Xgboost

In [267]:
import xgboost as xgb
model_xg = xgb.XGBRegressor(objective = 'reg:linear',
                            n_estimators = 10000, 
                            max_depth = 7, 
                            learning_rate = 0.005,
                            early_stopping_rounds = 500,
                            gamma = 1.5, 
                            subsample = 0.8,   
                            colsample_bytree = 0.7,
                            colsample_bylevel = 0.9, 
                            silent = True)

model_xg.fit(X_tr, y_tr)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.7, early_stopping_rounds=500, gamma=1.5,
       importance_type='gain', learning_rate=0.005, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=10000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [268]:
pred_xg = model_xg.predict(X_val)
np.sqrt(mean_squared_error(y_val, pred_xg))

1.8978116369411362

In [0]:
# 1st try: 1.9202
# 2nd try: 1.9128
# 3nd try: 1.8978

In [0]:
my_loss(pred_xg, y_val)

### 3. Catboost

In [219]:
!pip install catboost



In [0]:
from catboost import CatBoostRegressor

In [248]:
model_cb = CatBoostRegressor(iterations = 10000,
                             depth = 7,   
                             eval_metric = 'RMSE',
                             learning_rate = .01,   
                             colsample_bylevel = 0.7,  
                             bagging_temperature = 0.2,   
                             early_stopping_rounds = 300,   
                             logging_level = 'Silent')

model_cb.fit(X_tr, y_tr, eval_set = (X_val, y_val), use_best_model = True)

<catboost.core.CatBoostRegressor at 0x7f74a0d90710>

In [249]:
pred_cb = model_cb.predict(X_val)
np.sqrt(mean_squared_error(y_val, pred_cb))

1.8753539178200316

In [0]:
# 1st try: 1.9218
# 2nd try: 1.8779
# 3rd try: 1.8753

### 4. Lightgbm

In [0]:
!pip install lightgbm --install-option=--gpu

In [0]:
# Installation 
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
!mkdir build
!cmake -DUSE_GPU=1
!make -j$(nproc)
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
!sudo python setup.py install --precompile

In [0]:
import lightgbm as lgb

In [206]:
tr_data = lgb.Dataset(X_tr, label = y_tr)
val_data = lgb.Dataset(X_val, label = y_val)

params = {'objective' : 'regression',
          'num_iterations' : 10000, 
          'max_depth' : 9,
          'num_leaves' : 100,
          'learning_rate': 0.005,
          'metric' : 'rmse',
          'min_data_in_leaf' : 100,
          'colsample_bytree': 0.9,
          'subsample_freq': 1,
          'lambda_l1' : 0.1,
          'lambda_l2' : 0.3,
          'subsample' : 0.8, 
          'verbose' : -1}

hist = {}
model_lg = lgb.train(params, tr_data, 
                     valid_sets = [val_data],
                     verbose_eval = -1, 
                     early_stopping_rounds = 500,
                     callbacks = [lgb.record_evaluation(hist)])



Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[6921]	valid_0's rmse: 1.91857


In [207]:
pred_lg = model_lg.predict(X_val, num_iteration = model_lg.best_iteration)
np.sqrt(mean_squared_error(y_val, pred_lg))

1.918566125430695

In [0]:
# 1st try: 
# 2nd try: 1.9138

### 5. Keras

In [0]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

In [77]:
print(X_tr.shape[1])
print(X_tr.shape[1] /2)

352
176.0


In [115]:
# initialize the model
model_ke = Sequential()

# create hidden layers
model_ke.add(Dense(input_dim = 352, output_dim = 126, 
                activation = 'relu'))
model_ke.add(Dense(output_dim = 64, activation = 'relu'))
model_ke.add(Dense(output_dim = 16, activation = 'relu'))
model_ke.add(Dense(output_dim = 1)) 

# compile: stochastic gradient descent
model_ke.compile(optimizer = 'adam', 
              loss = 'mse', 
              metrics = ['mean_squared_logarithmic_error'])

  """
  
  import sys
  


In [0]:
# early stopper 
early_stopper = EarlyStopping(patience = 5)

r = model_ke.fit(X_tr, y_tr, 
                 batch_size = 500, 
                 nb_epoch = 1000,
                 callbacks = [early_stopper])

In [120]:
# prediction 
pred_ke = model_ke.predict(X_val)
np.sqrt(mean_squared_error(y_val, pred_ke))

16.320093485738738

In [0]:
# 1st try : 2.9460
# 2nd try : 

In [0]:
# plot the error
plt.plot(r.history['loss'], label = 'train loss')
plt.plot(r.history['val_loss'], label = 'test loss')
plt.legend()
plt.show()

plt.plot(r.history['mean_squared_logarithmic_error'], label = 'train mse')
plt.plot(r.history['val_mean_squared_logarithmic_error'], label = 'test mse')
plt.legend()
plt.show()

## Evaluation

In [0]:
df_pred = pd.DataFrame()

In [0]:
df_pred['el'] = model_el.predict(te)

In [0]:
df_pred['ke'] = model_ke.predict(te)

In [0]:
df_pred['xg'] = model_xg.predict(te)
df_pred['cb'] = model_cb.predict(te)
df_pred['lg'] = model_lg.predict(te)

In [0]:
#df_pred['final'] = np.expm1(.2*df_pred.el + .5*df_pred.cb + .3*df_pred.xg)
df_pred['final'] = np.expm1(.4*df_pred.cb + .4*df_pred.xg + .3*df_pred.lg)

In [0]:
df_pred.head()

Unnamed: 0,xg,cb,final,el
0,16.05217,15.547183,5807170.0,14.926807
1,13.68719,14.855867,1817424.0,14.394201
2,16.181671,15.821494,6957021.0,14.950067
3,15.788505,15.791989,7713922.0,16.129956
4,13.33971,13.354093,846637.8,14.850349


### Submission

In [273]:
sub = pd.read_csv('sample_submission.csv')
sub.shape

(4398, 2)

In [0]:
sub['revenue'] = df_pred.final

In [0]:
from google.colab import files

sub.to_csv('sub.csv', index = False)
files.download('sub.csv')

In [0]:
!kaggle competitions submit -c tmdb-box-office-prediction -f sub.csv -m "Message"