In [1]:
import pandas as pd
pd.set_option('max_columns', None)
import numpy as np
import ast
import gc
import time
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV

In [41]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [42]:
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords',
                'cast', 'crew']

In [43]:
def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x))
    return df

In [44]:
train = text_to_dict(train)
test = text_to_dict(test)

In [45]:
# Collection

train['collection_name'] = train['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else 0)
train['has_collection'] = train['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)

test['collection_name'] = test['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else 0)
test['has_collection'] = test['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)

train = train.drop(['belongs_to_collection'], axis=1)
test = test.drop(['belongs_to_collection'], axis=1)

gc.collect()

750

In [46]:
# Genres

list_of_genres = list(train['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

train['num_genres'] = train['genres'].apply(lambda x: len(x) if x != {} else 0)
train['all_genres'] = train['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_genres = [m[0] for m in Counter([i for j in list_of_genres for i in j]).most_common(10)]
for g in top_genres:
    train['genre_' + g] = train['all_genres'].apply(lambda x: 1 if g in x else 0)
    
test['num_genres'] = test['genres'].apply(lambda x: len(x) if x != {} else 0)
test['all_genres'] = test['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
for g in top_genres:
    test['genre_' + g] = test['all_genres'].apply(lambda x: 1 if g in x else 0)    
    
train = train.drop(['genres'], axis=1)
test = test.drop(['genres'], axis=1)

gc.collect()

182

In [47]:
# Homepage

train['homepage'] = train['homepage'].fillna(0)
train['homepage'] = train['homepage'].apply(lambda x: 0 if x==0 else 1)

test['homepage'] = test['homepage'].fillna(0)
test['homepage'] = test['homepage'].apply(lambda x: 0 if x==0 else 1)

In [48]:
# Production companies

list_of_companies = list(train['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

train['num_companies'] = train['production_companies'].apply(lambda x: len(x) if x != {} else 0)
train['all_production_companies'] = train['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_companies = [m[0] for m in Counter([i for j in list_of_companies for i in j]).most_common(20)]
for g in top_companies:
    train['production_company_' + g] = train['all_production_companies'].apply(lambda x: 1 if g in x else 0)
    
test['num_companies'] = test['production_companies'].apply(lambda x: len(x) if x != {} else 0)
test['all_production_companies'] = test['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
for g in top_companies:
    test['production_company_' + g] = test['all_production_companies'].apply(lambda x: 1 if g in x else 0)
    
train = train.drop(['production_companies', 'all_production_companies'], axis=1)
test = test.drop(['production_companies', 'all_production_companies'], axis=1)

gc.collect()

322

In [49]:
# Production Countries

list_of_countries = list(train['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

train['num_countries'] = train['production_countries'].apply(lambda x: len(x) if x != {} else 0)
train['all_countries'] = train['production_countries'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_countries = [m[0] for m in Counter([i for j in list_of_countries for i in j]).most_common(25)]
for g in top_countries:
    train['production_country_' + g] = train['all_countries'].apply(lambda x: 1 if g in x else 0)
    
test['num_countries'] = test['production_countries'].apply(lambda x: len(x) if x != {} else 0)
test['all_countries'] = test['production_countries'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
for g in top_countries:
    test['production_country_' + g] = test['all_countries'].apply(lambda x: 1 if g in x else 0)

train = train.drop(['production_countries', 'all_countries'], axis=1)
test = test.drop(['production_countries', 'all_countries'], axis=1)

gc.collect()

392

In [50]:
# languages

list_of_languages = list(train['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

train['num_languages'] = train['spoken_languages'].apply(lambda x: len(x) if x != {} else 0)
train['all_languages'] = train['spoken_languages'].apply(lambda x: ' '.join(sorted([i['iso_639_1'] for i in x])) if x != {} else {})
top_languages = [m[0] for m in Counter([i for j in list_of_languages for i in j]).most_common(20)]
for g in top_languages:
    train['language_' + g] = train['all_languages'].apply(lambda x: 1 if g in x else 0)
    
test['num_languages'] = test['spoken_languages'].apply(lambda x: len(x) if x != {} else 0)
test['all_languages'] = test['spoken_languages'].apply(lambda x: ' '.join(sorted([i['iso_639_1'] for i in x])) if x != {} else {})
for g in top_languages:
    test['language_' + g] = test['all_languages'].apply(lambda x: 1 if g in x else 0)

train = train.drop(['spoken_languages', 'all_languages'], axis=1)
test = test.drop(['spoken_languages', 'all_languages'], axis=1)

gc.collect()

322

In [51]:
# Keywords

list_of_keywords = list(train['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

train['num_Keywords'] = train['Keywords'].apply(lambda x: len(x) if x != {} else 0)
train['all_Keywords'] = train['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_keywords = [m[0] for m in Counter([i for j in list_of_keywords for i in j]).most_common(20)]
for g in top_keywords:
    train['keyword_' + g] = train['all_Keywords'].apply(lambda x: 1 if g in x else 0)
    
test['num_Keywords'] = test['Keywords'].apply(lambda x: len(x) if x != {} else 0)
test['all_Keywords'] = test['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
for g in top_keywords:
    test['keyword_' + g] = test['all_Keywords'].apply(lambda x: 1 if g in x else 0)
    
train = train.drop(['Keywords', 'all_Keywords'], axis=1)
test = test.drop(['Keywords', 'all_Keywords'], axis=1)

gc.collect()

322

In [52]:
# Cast

list_of_cast_names = list(train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
list_of_cast_gender = list(train['cast'].apply(lambda x: [i['gender'] for i in x] if x != {} else []).values)
list_of_cast_characters = list(train['cast'].apply(lambda x: [i['character'] for i in x] if x != {} else []).values)

train['num_cast'] = train['cast'].apply(lambda x: len(x) if x != {} else 0)
top_cast_names = [m[0] for m in Counter([i for j in list_of_cast_names for i in j]).most_common(15)]
for g in top_cast_names:
    train['cast_name_' + g] = train['cast'].apply(lambda x: 1 if g in x else 0)
    
train['gender_0'] = train['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
train['gender_1'] = train['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
train['gender_2'] = train['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))

top_cast_characters = [m[0] for m in Counter([i for j in list_of_cast_characters for i in j]).most_common(10)]
for g in top_cast_characters:
    train['cast_character_' + g] = train['cast'].apply(lambda x: 1 if g in x else 0)
    
test['num_cast'] = test['cast'].apply(lambda x: len(x) if x != {} else 0)
for g in top_cast_names:
    test['cast_name_' + g] = test['cast'].apply(lambda x: 1 if g in x else 0)
    
test['gender_0'] = test['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
test['gender_1'] = test['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
test['gender_2'] = test['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))

for g in top_cast_characters:
    test['cast_character_' + g] = test['cast'].apply(lambda x: 1 if g in x else 0)

train = train.drop(['cast'], axis=1)
test = test.drop(['cast'], axis=1)

gc.collect()

420

In [53]:
# Crew

list_of_crew_names = list(train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
list_of_crew_jobs = list(train['crew'].apply(lambda x: [i['job'] for i in x] if x != {} else []).values)
list_of_crew_genders = list(train['crew'].apply(lambda x: [i['gender'] for i in x] if x != {} else []).values)
list_of_crew_departments = list(train['crew'].apply(lambda x: [i['department'] for i in x] if x != {} else []).values)

train['num_crew'] = train['crew'].apply(lambda x: len(x) if x != {} else 0)
top_crew_names = [m[0] for m in Counter([i for j in list_of_crew_names for i in j]).most_common(15)]
for g in top_crew_names:
    train['crew_name_' + g] = train['crew'].apply(lambda x: 1 if g in x else 0)

train['genders_0'] = train['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
train['genders_1'] = train['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
train['genders_2'] = train['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))

top_cast_characters = [m[0] for m in Counter([i for j in list_of_cast_characters for i in j]).most_common(15)]
for g in top_cast_characters:
    train['crew_character_' + g] = train['crew'].apply(lambda x: 1 if g in x else 0)

top_crew_jobs = [m[0] for m in Counter([i for j in list_of_crew_jobs for i in j]).most_common(15)]
for j in top_crew_jobs:
    train['jobs_' + j] = train['crew'].apply(lambda x: sum([1 for i in x if i['job'] == j]))

top_crew_departments = [m[0] for m in Counter([i for j in list_of_crew_departments for i in j]).most_common(15)]
for j in top_crew_departments:
    train['departments_' + j] = train['crew'].apply(lambda x: sum([1 for i in x if i['department'] == j])) 
    
test['num_crew'] = test['crew'].apply(lambda x: len(x) if x != {} else 0)
for g in top_crew_names:
    test['crew_name_' + g] = test['crew'].apply(lambda x: 1 if g in x else 0)

test['genders_0'] = test['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
test['genders_1'] = test['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
test['genders_2'] = test['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))

for g in top_cast_characters:
    test['crew_character_' + g] = test['crew'].apply(lambda x: 1 if g in x else 0)

for j in top_crew_jobs:
    test['jobs_' + j] = test['crew'].apply(lambda x: sum([1 for i in x if i['job'] == j]))

for j in top_crew_departments:
    test['departments_' + j] = test['crew'].apply(lambda x: sum([1 for i in x if i['department'] == j])) 

train = train.drop(['crew'], axis=1)
test = test.drop(['crew'], axis=1)

gc.collect()

434

In [54]:
train['log_revenue'] = np.log1p(train['revenue'])

train['log_budget'] = np.log1p(train['budget'])
test['log_budget'] = np.log1p(test['budget'])

In [55]:
list_of_lang = ['en', 'fr', 'ru', 'es', 'hi', 'ja', 'it', 'ko', 'cn', 'zh', 'de', 'ta']

train['original_language'] = train['original_language'].apply(lambda x: x if x in list_of_lang else 'Other')
test['original_language'] = test['original_language'].apply(lambda x: x if x in list_of_lang else 'Other')

In [56]:
list_of_status = ['Released', 'Post Production', 'Rumored']

train['status'] = train['status'].apply(lambda x: x if x in list_of_status else 'Other')
test['status'] = test['status'].apply(lambda x: x if x in list_of_status else 'Other')

In [57]:
for col in ['original_language', 'collection_name', 'all_genres', 'status']:
    le = LabelEncoder()
    le.fit(list(train[col].fillna('')) + list(test[col].fillna('')))
    train[col] = le.transform(train[col].fillna('').astype(str))
    test[col] = le.transform(test[col].fillna('').astype(str))

In [58]:
def fix_date(x):

    year = x.split('/')[2]
    if int(year) <= 19:
        return x[:-2] + '20' + year
    else:
        return x[:-2] + '19' + year

In [59]:
test.loc[test['release_date'].isnull() == True, 'release_date'] = '01/01/98'

train['release_date'] = train['release_date'].apply(lambda x: fix_date(x))
test['release_date'] = test['release_date'].apply(lambda x: fix_date(x))

train['release_date'] = pd.to_datetime(train['release_date'])
test['release_date'] = pd.to_datetime(test['release_date'])

In [60]:
def process_date(df):
    
    date_parts = ['year', 'weekday', 'month', 'weekofyear', 'day', 'quarter']
    for part in date_parts:
        part_col = 'release_date' + "_" + part
        df[part_col] = getattr(df['release_date'].dt, part).astype(int)
    
    return df

In [61]:
train = process_date(train)
test = process_date(test)

In [62]:
for col in ['title', 'tagline', 'overview', 'original_title']:
    train['len_' + col] = train[col].fillna('').apply(lambda x: len(str(x)))
    train['words_' + col] = train[col].fillna('').apply(lambda x: len(str(x.split(' '))))
    train = train.drop(col, axis=1)
    
    test['len_' + col] = test[col].fillna('').apply(lambda x: len(str(x)))
    test['words_' + col] = test[col].fillna('').apply(lambda x: len(str(x.split(' '))))
    test = test.drop(col, axis=1)

In [63]:
y = train['log_revenue']
train = train.drop(['id', 'release_date', 'revenue', 'log_revenue', 'imdb_id', 'poster_path'], axis=1)
test = test.drop(['id', 'release_date', 'imdb_id', 'poster_path'], axis=1)

gc.collect()

216

In [64]:
train['budget_runtime_ratio'] = train['log_budget'] / train['runtime']
test['budget_runtime_ratio'] = test['log_budget'] / test['runtime']

train['budget_popularity_ratio'] = train['log_budget'] / train['popularity']
test['budget_popularity_ratio'] = test['log_budget'] / test['popularity']

train['releaseYear_popularity_ratio'] = train['release_date_year'] / train['popularity']
test['releaseYear_popularity_ratio'] = test['release_date_year'] / test['popularity']

train['releaseYear_popularity_ratio2'] = train['popularity'] / train['release_date_year']
test['releaseYear_popularity_ratio2'] = test['popularity'] / test['release_date_year']


train['meanRuntimeByYear'] = train.groupby('release_date_year')['runtime'].aggregate('mean')
test['meanRuntimeByYear'] = test.groupby('release_date_year')['runtime'].aggregate('mean')

train['meanPopularityByYear'] = train.groupby('release_date_year')['popularity'].aggregate('mean')
test['meanPopularityByYear'] = test.groupby('release_date_year')['popularity'].aggregate('mean')

train['meanBudgetByYear'] = train.groupby('release_date_year')['log_budget'].aggregate('mean')
test['meanBudgetByYear'] = test.groupby('release_date_year')['log_budget'].aggregate('mean')


In [65]:
n_fold = 10
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

In [66]:
def train_model(X, X_test, y, params=None, folds=folds, model_type='lgb', plot_feature_importance=False, model=None):

    oof = np.zeros(X.shape[0])
    prediction = np.zeros(X_test.shape[0])
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        if model_type == 'sklearn':
            X_train, X_valid = X[train_index], X[valid_index]
        else:
            X_train, X_valid = X.values[train_index], X.values[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='rmse',
                    verbose=1000, early_stopping_rounds=200)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=30000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test.values), ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = mean_squared_error(y_valid, y_pred_valid)
            
            y_pred = model.predict(X_test)
            
        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000,  eval_metric='RMSE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)
        
        prediction += y_pred    
        
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
        
            return oof, prediction, feature_importance
        return oof, prediction
    
    else:
        return oof, prediction

In [67]:
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.1)

In [68]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 10,
         'objective': 'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
oof_lgb, prediction_lgb, = train_model(train, test, y, params=params, model_type='lgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 12:54:27 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[434]	training's rmse: 1.55084	valid_1's rmse: 2.08688
Fold 1 started at Sat May 25 12:54:29 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.30121	valid_1's rmse: 1.85964
Early stopping, best iteration is:
[1275]	training's rmse: 1.19034	valid_1's rmse: 1.85335
Fold 2 started at Sat May 25 12:54:33 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.30045	valid_1's rmse: 1.88463
[2000]	training's rmse: 0.965538	valid_1's rmse: 1.87021
Early stopping, best iteration is:
[1986]	training's rmse: 0.969722	valid_1's rmse: 1.86949
Fold 3 started at Sat May 25 12:54:38 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.28524	valid_1's rmse: 1.91361
Early stopping, best iteration is:
[878]	training's rmse: 1.33736	valid_1's rm

In [69]:
def new_features(df):
    df['budget_to_popularity'] = df['budget'] / df['popularity']
    df['budget_to_runtime'] = df['budget'] / df['runtime']
    
    # some features from https://www.kaggle.com/somang1418/happy-valentines-day-and-keep-kaggling-3
    df['_budget_year_ratio'] = df['budget'] / (df['release_date_year'] * df['release_date_year'])
    df['_releaseYear_popularity_ratio'] = df['release_date_year'] / df['popularity']
    df['_releaseYear_popularity_ratio2'] = df['popularity'] / df['release_date_year']
    
    df['runtime_to_mean_year'] = df['runtime'] / df.groupby("release_date_year")["runtime"].transform('mean')
    df['popularity_to_mean_year'] = df['popularity'] / df.groupby("release_date_year")["popularity"].transform('mean')
    df['budget_to_mean_year'] = df['budget'] / df.groupby("release_date_year")["budget"].transform('mean')
        
    return df

In [70]:
X = new_features(train)
X_test = new_features(test)

In [72]:
oof_lgb, prediction_lgb, = train_model(X, X_test, y, params=params, model_type='lgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 12:56:56 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.24313	valid_1's rmse: 2.05893
Early stopping, best iteration is:
[1084]	training's rmse: 1.20801	valid_1's rmse: 2.05614
Fold 1 started at Sat May 25 12:56:59 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.27244	valid_1's rmse: 1.8683
Early stopping, best iteration is:
[1352]	training's rmse: 1.12702	valid_1's rmse: 1.86274
Fold 2 started at Sat May 25 12:57:02 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.27507	valid_1's rmse: 1.88514
Early stopping, best iteration is:
[1453]	training's rmse: 1.10225	valid_1's rmse: 1.8713
Fold 3 started at Sat May 25 12:57:06 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.26153	valid_1's rmse: 1.88034
Early stopping, best iteration is:
[969]	training's rmse: 1.27348	valid_1's rmse:

In [79]:
def top_cols_interaction(df):
    df['budget_to_year'] = df['budget'] / df['release_date_year']
    df['budget_to_mean_year_to_year'] = df['budget_to_mean_year'] / df['release_date_year']
    df['popularity_to_mean_year_to_log_budget'] = df['popularity_to_mean_year'] / df['log_budget']
    df['year_to_log_budget'] = df['release_date_year'] / df['log_budget']
    df['budget_to_runtime_to_year'] = df['budget_to_runtime'] / df['release_date_year']
    df['all_genres_to_popularity_to_mean_year'] = df['all_genres'] / df['popularity_to_mean_year']
    
    return df

In [80]:
X = top_cols_interaction(X)
X_test = top_cols_interaction(X_test)

In [81]:
X = X.replace([np.inf, -np.inf], 0).fillna(0)
X_test = X_test.replace([np.inf, -np.inf], 0).fillna(0)

In [83]:
trainAdditionalFeatures = pd.read_csv('data/TrainAdditionalFeatures.csv.xls')
testAdditionalFeatures = pd.read_csv('data/TestAdditionalFeatures.csv.xls')

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
X['imdb_id'] = train['imdb_id']
X_test['imdb_id'] = test['imdb_id']
del train, test

X = pd.merge(X, trainAdditionalFeatures, how='left', on=['imdb_id'])
X_test = pd.merge(X_test, testAdditionalFeatures, how='left', on=['imdb_id'])

X = X.drop(['imdb_id'], axis=1)
X_test = X_test.drop(['imdb_id'], axis=1)

In [86]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 9,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
oof_lgb, prediction_lgb, = train_model(X, X_test, y, params=params, model_type='lgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 13:05:52 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[699]	training's rmse: 1.06241	valid_1's rmse: 1.99571
Fold 1 started at Sat May 25 13:05:55 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[654]	training's rmse: 1.12796	valid_1's rmse: 1.83412
Fold 2 started at Sat May 25 13:05:58 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 0.899502	valid_1's rmse: 1.80697
Early stopping, best iteration is:
[1073]	training's rmse: 0.860726	valid_1's rmse: 1.80482
Fold 3 started at Sat May 25 13:06:03 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[693]	training's rmse: 1.08194	valid_1's rmse: 1.7774
Fold 4 started at Sat May 25 13:06:06 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 0.872187	valid_1's rmse: 1.9

In [87]:
xgb_params = {'eta': 0.01,
              'objective': 'reg:linear',
              'max_depth': 7,
              'subsample': 0.8,
              'colsample_bytree': 0.8,
              'eval_metric': 'rmse',
              'seed': 11,
              'silent': True}
oof_xgb, prediction_xgb = train_model(X, X_test, y, params=xgb_params, model_type='xgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 13:07:05 2019
[0]	train-rmse:15.5784	valid_data-rmse:15.8741
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.920344	valid_data-rmse:1.95129
[1000]	train-rmse:0.549238	valid_data-rmse:1.93504
Stopping. Best iteration:
[1200]	train-rmse:0.449318	valid_data-rmse:1.93223

Fold 1 started at Sat May 25 13:07:24 2019
[0]	train-rmse:15.6063	valid_data-rmse:15.6201
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.927478	valid_data-rmse:1.83148
[1000]	train-rmse:0.556894	valid_data-rmse:1.79979
Stopping. Best iteration:
[1192]	train-rmse:0.459802	valid_data-rmse:1.79517

Fold 2 started at Sat May 25 13:07:43 2019
[0]	train-rmse:15.5967	valid_data-rmse:15.7093
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.941581	valid_data-rmse:1.79893
[1000]	train-rmse:0.560481	valid_data-rmse:1.77605
[1500]	train-rmse:0.338126	valid_data-rmse:1.77178
Stopping. Best iteration:
[1333]	train-rmse:0.400279	valid_data-rmse:1.77063

Fold 3 started at Sat May 25 13:08:05 2019
[0]	train-rmse:15.6269	valid_data-rmse:15.4328
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.944914	valid_data-rmse:1.79137
Stopping. Best iteration:
[788]	train-rmse:0.697622	valid_data-rmse:1.77166

Fold 4 started at Sat May 25 13:08:19 2019
[0]	train-rmse:15.6047	valid_data-rmse:15.6369
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.929903	valid_data-rmse:2.00895
[1000]	train-rmse:0.556984	valid_data-rmse:1.99629
Stopping. Best iteration:
[1117]	train-rmse:0.49732	valid_data-rmse:1.99536

Fold 5 started at Sat May 25 13:08:37 2019
[0]	train-rmse:15.6142	valid_data-rmse:15.5513
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.957759	valid_data-rmse:1.74214
[1000]	train-rmse:0.574447	valid_data-rmse:1.73525
Stopping. Best iteration:
[947]	train-rmse:0.606226	valid_data-rmse:1.7327

Fold 6 started at Sat May 25 13:08:51 2019
[0]	train-rmse:15.604	valid_data-rmse:15.6387
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.913619	valid_data-rmse:2.25189
Stopping. Best iteration:
[619]	train-rmse:0.793321	valid_data-rmse:2.24835

Fold 7 started at Sat May 25 13:09:02 2019
[0]	train-rmse:15.6337	valid_data-rmse:15.3739
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.922839	valid_data-rmse:2.12562
[1000]	train-rmse:0.55346	valid_data-rmse:2.1161
Stopping. Best iteration:
[1220]	train-rmse:0.448909	valid_data-rmse:2.11365

Fold 8 started at Sat May 25 13:09:20 2019
[0]	train-rmse:15.6194	valid_data-rmse:15.5075
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.934895	valid_data-rmse:2.05317
Stopping. Best iteration:
[566]	train-rmse:0.866548	valid_data-rmse:2.05001

Fold 9 started at Sat May 25 13:09:30 2019
[0]	train-rmse:15.5934	valid_data-rmse:15.7349
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[500]	train-rmse:0.91007	valid_data-rmse:2.07705
Stopping. Best iteration:
[739]	train-rmse:0.713399	valid_data-rmse:2.05963

CV mean score: 1.9469, std: 0.1659.


In [91]:
cat_params = {'learning_rate': 0.002,
              'depth': 5,
              'l2_leaf_reg': 10,
              # 'bootstrap_type': 'Bernoulli',
              'colsample_bylevel': 0.8,
              'bagging_temperature': 0.2,
              'metric_period': 500,
              'od_type': 'Iter',
              'od_wait': 100,
              'random_seed': 11,
              'allow_writing_files': False}
oof_cat, prediction_cat = train_model(X, X_test, y, params=cat_params, model_type='cat')

Fold 0 started at Sat May 25 13:12:08 2019




Fold 1 started at Sat May 25 13:16:45 2019




Fold 2 started at Sat May 25 13:19:15 2019




Fold 3 started at Sat May 25 13:24:57 2019




Fold 4 started at Sat May 25 13:26:57 2019




Fold 5 started at Sat May 25 13:29:06 2019




Fold 6 started at Sat May 25 13:32:32 2019




Fold 7 started at Sat May 25 13:35:49 2019




Fold 8 started at Sat May 25 13:36:46 2019




Fold 9 started at Sat May 25 13:38:20 2019




CV mean score: 1.9843, std: 0.2036.


In [92]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
oof_lgb_1, prediction_lgb_1 = train_model(X, X_test, y, params=params, model_type='lgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 13:42:15 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.29934	valid_1's rmse: 1.99249
Early stopping, best iteration is:
[1234]	training's rmse: 1.20994	valid_1's rmse: 1.99047
Fold 1 started at Sat May 25 13:42:18 2019
Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 1.31271	valid_1's rmse: 1.80402
Early stopping, best iteration is:
[1060]	training's rmse: 1.28898	valid_1's rmse: 1.80092
Fold 2 started at Sat May 25 13:42:20 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[709]	training's rmse: 1.44033	valid_1's rmse: 1.81862
Fold 3 started at Sat May 25 13:42:23 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[680]	training's rmse: 1.44691	valid_1's rmse: 1.77824
Fold 4 started at Sat May 25 13:42:25 2019
Training until validation scores don't improve for 200 roun

In [93]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 7,
         'learning_rate': 0.02,
         "boosting": "gbdt",
         "feature_fraction": 0.7,
         "bagging_freq": 5,
         "bagging_fraction": 0.7,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
oof_lgb_2, prediction_lgb_2 = train_model(X, X_test, y, params=params, model_type='lgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 13:43:50 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[395]	training's rmse: 1.1989	valid_1's rmse: 1.98692
Fold 1 started at Sat May 25 13:43:52 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[298]	training's rmse: 1.30374	valid_1's rmse: 1.86352
Fold 2 started at Sat May 25 13:43:53 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[427]	training's rmse: 1.17528	valid_1's rmse: 1.8026
Fold 3 started at Sat May 25 13:43:55 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[315]	training's rmse: 1.30362	valid_1's rmse: 1.78202
Fold 4 started at Sat May 25 13:43:56 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[639]	training's rmse: 0.930049	valid_1's rmse: 2.00202
Fold 5 started at Sat

In [94]:
train_stack = np.vstack([oof_lgb, oof_xgb, oof_cat, oof_lgb_1, oof_lgb_2]).transpose()
train_stack = pd.DataFrame(train_stack, columns=['lgb', 'xgb', 'cat', 'lgb_1', 'lgb_2'])
test_stack = np.vstack([prediction_lgb, prediction_xgb, prediction_cat, prediction_lgb_1, prediction_lgb_2]).transpose()
test_stack = pd.DataFrame(test_stack, columns=['lgb', 'xgb', 'cat', 'lgb_1', 'lgb_2'])

In [96]:
params = {'num_leaves': 8,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 3,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
oof_lgb_stack, prediction_lgb_stack, = train_model(train_stack, test_stack, y, params=params, model_type='lgb', plot_feature_importance=False)

Fold 0 started at Sat May 25 13:45:10 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[412]	training's rmse: 1.86972	valid_1's rmse: 1.95979
Fold 1 started at Sat May 25 13:45:10 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[267]	training's rmse: 1.91782	valid_1's rmse: 1.82659
Fold 2 started at Sat May 25 13:45:10 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[323]	training's rmse: 1.91217	valid_1's rmse: 1.76362
Fold 3 started at Sat May 25 13:45:10 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[320]	training's rmse: 1.90529	valid_1's rmse: 1.80979
Fold 4 started at Sat May 25 13:45:11 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[231]	training's rmse: 1.9104	valid_1's rmse: 2.05256
Fold 5 started at Sat

In [99]:
model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0), scoring='neg_mean_squared_error', cv=folds)
oof_rcv_stack, prediction_rcv_stack = train_model(train_stack.values, test_stack.values, y, params=None, model_type='sklearn', model=model)

Fold 0 started at Sat May 25 13:46:00 2019
Fold 1 started at Sat May 25 13:46:00 2019
Fold 2 started at Sat May 25 13:46:00 2019
Fold 3 started at Sat May 25 13:46:01 2019
Fold 4 started at Sat May 25 13:46:01 2019
Fold 5 started at Sat May 25 13:46:01 2019
Fold 6 started at Sat May 25 13:46:01 2019
Fold 7 started at Sat May 25 13:46:01 2019
Fold 8 started at Sat May 25 13:46:01 2019
Fold 9 started at Sat May 25 13:46:01 2019
CV mean score: 1.9404, std: 0.1716.


In [100]:
sub = pd.read_csv('data/sample_submission.csv')
sub['revenue'] = np.expm1(prediction_lgb)
sub.to_csv("lgb.csv", index=False)
sub['revenue'] = np.expm1((prediction_lgb + prediction_xgb) / 2)
sub.to_csv("blend.csv", index=False)
sub['revenue'] = np.expm1((prediction_lgb + prediction_xgb + prediction_cat) / 3)
sub.to_csv("blend1.csv", index=False)
sub['revenue'] = np.expm1((prediction_lgb + prediction_xgb + prediction_cat + prediction_lgb_1) / 4)
sub.to_csv("blend2.csv", index=False)
sub['revenue'] = np.expm1((prediction_lgb + prediction_xgb + prediction_cat + prediction_lgb_1 + prediction_lgb_2) / 5)
sub.to_csv("blend3.csv", index=False)

sub['revenue'] = prediction_lgb_stack
sub.to_csv("stack_lgb.csv", index=False)
sub['revenue'] = prediction_rcv_stack
sub.to_csv("stack_rcv.csv", index=False)

In [2]:
blend = pd.read_csv('blend3.csv')
ens = pd.read_csv('submission_ens.csv.xls')

In [5]:
extend_sub = (blend['revenue'] + ens['revenue']) / 2

In [9]:
sub = pd.read_csv('data/sample_submission.csv')
sub['revenue'] = extend_sub
sub.to_csv("new_extend_sub.csv", index=False)