In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import PoissonRegressor, TweedieRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn import linear_model

In [2]:
# Show every column when a wide DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Print NumPy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

## Load Data
Load both train and test datasets after preprocessing.

In [3]:
# Preprocessed training data; it carries the target column 'num_profile_likes_log'.
# NOTE(review): the preview shows 'Unnamed: 0.1' / 'Unnamed: 0' columns, which look like
# index columns written by an earlier to_csv -- confirm they are not needed downstream.
df_train_full = pd.read_csv('data/ift6758-a20/train_clean.csv')

In [4]:
# Preprocessed test data; its preview has no 'Num of Profile Likes' / 'num_profile_likes_log' column
df_test_full = pd.read_csv('data/ift6758-a20/test_clean.csv')

In [5]:
# Preview the first rows of the training frame (all columns, see display options)
df_train_full.head()

Unnamed: 0.1,Id,User Name,Personal URL,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location,Location Public Visibility,User Language,Profile Creation Timestamp,User Time Zone,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image,Num of Profile Likes,Unnamed: 0,image_hex_color,image_rgb_color,num_profile_likes_log,user_name_length,personal_url_binary,cover_image_binary,verification_status_not verified,verification_status_pending,verification_status_verified,profile_text_color,profile_text_color_aqua,profile_text_color_black,profile_text_color_blue,profile_text_color_fuchsia,profile_text_color_gray,profile_text_color_green,profile_text_color_lime,profile_text_color_maroon,profile_text_color_navy,profile_text_color_olive,profile_text_color_purple,profile_text_color_red,profile_text_color_silver,profile_text_color_teal,profile_text_color_white,profile_text_color_yellow,profile_page_color,profile_page_color_aqua,profile_page_color_black,profile_page_color_blue,profile_page_color_fuchsia,profile_page_color_gray,profile_page_color_green,profile_page_color_lime,profile_page_color_maroon,profile_page_color_navy,profile_page_color_olive,profile_page_color_purple,profile_page_color_red,profile_page_color_silver,profile_page_color_teal,profile_page_color_white,profile_page_color_yellow,profile_theme_color,profile_theme_color_aqua,profile_theme_color_black,profile_theme_color_blue,profile_theme_color_fuchsia,profile_theme_color_gray,profile_theme_color_green,profile_theme_color_lime,profile_theme_color_maroon,profile_theme_color_navy,profile_theme_color_olive,profile_theme_color_purple,profile_theme_color_red,profile_theme_color_silver,profile_theme_color_teal,profile_theme_color_white,profile_theme_color_yellow,profile_view_size_custom,u
tc_offset,utc_offset__-10800.0,utc_offset__-14400.0,utc_offset__-18000.0,utc_offset__-21600.0,utc_offset__-25200.0,utc_offset__-28800.0,utc_offset__-36000.0,utc_offset__-39600.0,utc_offset__-7200.0,utc_offset__10800.0,utc_offset__14400.0,utc_offset__18000.0,utc_offset__19800.0,utc_offset__25200.0,utc_offset__28800.0,utc_offset__32400.0,utc_offset__3600.0,utc_offset__36000.0,utc_offset__39600.0,utc_offset__46800.0,utc_offset__7200.0,utc_offset__other,utc_offset__unk,location_clean,location_top,location_top_brazil,location_top_france,location_top_india,location_top_indonesia,location_top_mexico,location_top_other,location_top_spain,location_top_turkey,location_top_united kingdom,location_top_united states,location_top_worldwide,location_binary,location_public_binary,user_language,user_lang_ar,user_lang_de,user_lang_en,user_lang_en-gb,user_lang_es,user_lang_fr,user_lang_id,user_lang_it,user_lang_ja,user_lang_ko,user_lang_nl,user_lang_other,user_lang_pl,user_lang_pt,user_lang_ru,user_lang_tr,profile_creation_timestamp,profile_creation_year,profile_creation_month,profile_creation_day,profile_creation_hour,profile_creation_minute,profile_creation_second,user_timezone,user_timezone_abu dhabi,user_timezone_alaska,user_timezone_america/new_york,user_timezone_amsterdam,user_timezone_arizona,user_timezone_athens,user_timezone_atlantic time (canada),user_timezone_baghdad,user_timezone_bangkok,user_timezone_beijing,user_timezone_belgrade,user_timezone_berlin,user_timezone_bern,user_timezone_bogota,user_timezone_brasilia,user_timezone_brussels,user_timezone_buenos aires,user_timezone_cairo,user_timezone_caracas,user_timezone_casablanca,user_timezone_central america,user_timezone_central time (us & canada),user_timezone_chennai,user_timezone_copenhagen,user_timezone_dublin,user_timezone_eastern time (us & canada),user_timezone_greenland,user_timezone_hanoi,user_timezone_hawaii,user_timezone_hong 
kong,user_timezone_irkutsk,user_timezone_islamabad,user_timezone_istanbul,user_timezone_jakarta,user_timezone_jerusalem,user_timezone_karachi,user_timezone_kuala lumpur,user_timezone_kuwait,user_timezone_kyiv,user_timezone_lima,user_timezone_lisbon,user_timezone_ljubljana,user_timezone_london,user_timezone_madrid,user_timezone_melbourne,user_timezone_mexico city,user_timezone_moscow,user_timezone_mountain time (us & canada),user_timezone_mumbai,user_timezone_muscat,user_timezone_nairobi,user_timezone_new delhi,user_timezone_other,user_timezone_pacific time (us & canada),user_timezone_paris,user_timezone_prague,user_timezone_pretoria,user_timezone_quito,user_timezone_riyadh,user_timezone_rome,user_timezone_santiago,user_timezone_seoul,user_timezone_singapore,user_timezone_stockholm,user_timezone_sydney,user_timezone_tokyo,user_timezone_unk,user_timezone_volgograd,user_timezone_warsaw,user_timezone_west central africa,avg_daily_profile_visit_duration,num_followers_log,num_people_following_log,num_status_updates_log,num_direct_messages_log,avg_daily_profile_clicks_log,profile_category_business,profile_category_celebrity,profile_category_government,profile_category_unknown,image_html_color,user_name_length_norm,profile_creation_year_norm,profile_creation_month_norm,profile_creation_day_norm,profile_creation_hour_norm,profile_creation_minute_norm,profile_creation_second_norm,num_followers_log_norm,num_people_following_log_norm,num_status_updates_log_norm,num_direct_messages_log_norm,avg_daily_profile_visit_duration_norm,avg_daily_profile_clicks_log_norm,user_name_length_bins,utc_offset_bins,user_language_bins,user_timezone_bins,location_clean_bins,image_html_color_bins,profile_creation_day_bins,profile_creation_hour_bins
0,AL85S14OMDPF01I9,Mf9vfld4Vfe,,Set,Verified,db1a2c,eaf0f2,e70409,False,39600.0,,Enabled,en,Thu Nov 27 05:24:59 +0000 2008,Sydney,95763,4289,30809,873,business,14.792,1.5761,AL85S14OMDPF01I9.png,2815,2735,#596491,[ 89 100 145],7.943073,11,0,1,0,0,1,red,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,red,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,39600.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,,other,0,0,0,0,0,1,0,0,0,0,0,0,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2008-11-27 05:24:59+00:00,2008,11,27,5,24,59,Sydney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,14.792,5.738042,4.197058,5.173462,3.419253,0.81334,1,0,0,0,slategray,0.692308,0.181818,0.909091,0.866667,0.217391,0.40678,1.0,0.564874,0.607399,0.631287,0.456199,0.29896,0.441974,1,5,6,5,0.0,4,2,1
1,HI11QOPD7BLJTO7Q,xl9gaGN0hxM_,,Set,Verified,0099cc,f6ffd1,fff04d,False,,mumbai,Enabled,en,Fri Jan 15 18:00:46 +0000 2010,,1018746,289,8150,290,unknown,8.183,11.2782,HI11QOPD7BLJTO7Q.png,1242,4386,#92928c,[146 146 140],7.125283,12,0,1,0,0,1,teal,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,yellow,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,unk,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,india,india,0,0,1,0,0,0,0,0,0,0,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2010-01-15 18:00:46+00:00,2010,1,15,18,0,46,unk,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,8.183,6.918032,2.890372,4.513903,2.892003,1.472083,0,0,0,1,gray,0.769231,0.363636,0.0,0.466667,0.782609,0.0,0.779661,0.710928,0.418295,0.538353,0.385853,0.163061,0.830993,1,3,6,4,4.0,4,1,2
2,JS49LP5P72RI1OQB,d_uiMm,,Set,Not verified,1fc2de,efefef,1fc2de,False,-18000.0,NYC + 70 Countries Worldwide,Enabled,en,Fri Oct 02 20:15:06 +0000 2009,Central Time (US & Canada),13444,1876,4698,227,unknown,31.823,0.5725,JS49LP5P72RI1OQB.png,1559,4914,#9bb8c1,[155 184 193],7.352441,6,0,1,1,0,0,aqua,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,aqua,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-18000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,nyc + countries worldwide,other,0,0,0,0,0,1,0,0,0,0,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2009-10-02 20:15:06+00:00,2009,10,2,20,15,6,Central Time (US & Canada),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31.823,4.761732,3.791274,4.24193,2.776738,0.563401,0,0,0,1,darkgray,0.307692,0.272727,0.818182,0.033333,0.869565,0.254237,0.101695,0.44403,0.548673,0.500032,0.370474,0.649161,0.294373,1,4,6,5,4.0,4,2,2
3,S0GDSC09MACCLBJP,hfylaRr,https://blob/e/g9pex_vS.com,Not set,Verified,050000,616161,00090a,False,-14400.0,"Indianapolis, In",Enabled,en,Thu Feb 19 14:37:22 +0000 2009,Eastern Time (US & Canada),339168,1148,53216,4035,business,23.052,4.0265,S0GDSC09MACCLBJP.png,6342,6997,#8e3f40,[142 63 64],8.755107,7,1,0,0,0,1,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,gray,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-14400.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,united states,united states,0,0,0,0,0,0,0,0,0,1,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2009-02-19 14:37:22+00:00,2009,2,19,14,37,22,Eastern Time (US & Canada),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23.052,6.368841,3.551975,5.445383,4.167001,1.100815,1,0,0,0,sienna,0.384615,0.272727,0.090909,0.6,0.608696,0.627119,0.372881,0.642951,0.514042,0.669601,0.555964,0.468807,0.611741,1,5,6,5,5.0,4,2,2
4,CRSEMK4QER6LDJSA,hRR1sDGlz5,https://blob/v/Szeo.h4/.com,Set,Not verified,58424d,f7f7f7,000000,False,-18000.0,"777 Beach Blvd. Biloxi, MS",Enabled,en,Tue Mar 31 13:27:52 +0000 2009,Central Time (US & Canada),9215,93,3271,130,unknown,8.418,3.9229,CRSEMK4QER6LDJSA.png,1078,3287,#906778,[144 103 120],6.98379,10,1,1,1,0,0,gray,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-18000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,united states,united states,0,0,0,0,0,0,0,0,0,1,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2009-03-31 13:27:52+00:00,2009,3,31,13,27,52,Central Time (US & Canada),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.418,4.574657,2.364964,4.063759,2.517838,1.092135,0,0,0,1,gray,0.615385,0.272727,0.181818,1.0,0.565217,0.457627,0.881356,0.420875,0.342258,0.474927,0.335932,0.167894,0.606616,1,4,6,5,5.0,4,2,2


In [6]:
# Preview the first rows of the test frame (all columns, see display options)
df_test_full.head()

Unnamed: 0.1,Id,User Name,Personal URL,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location,Location Public Visibility,User Language,Profile Creation Timestamp,User Time Zone,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image,Unnamed: 0,image_hex_color,image_rgb_color,user_name_length,personal_url_binary,cover_image_binary,verification_status_not verified,verification_status_pending,verification_status_verified,profile_text_color,profile_text_color_aqua,profile_text_color_black,profile_text_color_blue,profile_text_color_fuchsia,profile_text_color_gray,profile_text_color_green,profile_text_color_lime,profile_text_color_maroon,profile_text_color_navy,profile_text_color_olive,profile_text_color_purple,profile_text_color_red,profile_text_color_silver,profile_text_color_teal,profile_text_color_white,profile_text_color_yellow,profile_page_color,profile_page_color_aqua,profile_page_color_black,profile_page_color_blue,profile_page_color_fuchsia,profile_page_color_gray,profile_page_color_green,profile_page_color_lime,profile_page_color_maroon,profile_page_color_navy,profile_page_color_olive,profile_page_color_purple,profile_page_color_red,profile_page_color_silver,profile_page_color_teal,profile_page_color_white,profile_page_color_yellow,profile_theme_color,profile_theme_color_aqua,profile_theme_color_black,profile_theme_color_blue,profile_theme_color_fuchsia,profile_theme_color_gray,profile_theme_color_green,profile_theme_color_lime,profile_theme_color_maroon,profile_theme_color_navy,profile_theme_color_olive,profile_theme_color_purple,profile_theme_color_red,profile_theme_color_silver,profile_theme_color_teal,profile_theme_color_white,profile_theme_color_yellow,profile_view_size_custom,utc_offset,utc_offset__-10800.0,utc_offset__
-14400.0,utc_offset__-18000.0,utc_offset__-21600.0,utc_offset__-25200.0,utc_offset__-28800.0,utc_offset__-36000.0,utc_offset__-7200.0,utc_offset__10800.0,utc_offset__14400.0,utc_offset__19800.0,utc_offset__25200.0,utc_offset__28800.0,utc_offset__32400.0,utc_offset__3600.0,utc_offset__39600.0,utc_offset__7200.0,utc_offset__other,utc_offset__unk,location_clean,location_top,location_top_brazil,location_top_france,location_top_india,location_top_indonesia,location_top_mexico,location_top_other,location_top_spain,location_top_turkey,location_top_united kingdom,location_top_united states,location_top_worldwide,location_binary,location_public_binary,user_language,user_lang_ar,user_lang_de,user_lang_en,user_lang_en-gb,user_lang_es,user_lang_fr,user_lang_it,user_lang_ja,user_lang_ko,user_lang_nl,user_lang_other,user_lang_pt,user_lang_ru,user_lang_tr,profile_creation_timestamp,profile_creation_year,profile_creation_month,profile_creation_day,profile_creation_hour,profile_creation_minute,profile_creation_second,user_timezone,user_timezone_alaska,user_timezone_amsterdam,user_timezone_arizona,user_timezone_athens,user_timezone_atlantic time (canada),user_timezone_baghdad,user_timezone_bangkok,user_timezone_berlin,user_timezone_bern,user_timezone_bogota,user_timezone_brasilia,user_timezone_buenos aires,user_timezone_cairo,user_timezone_caracas,user_timezone_casablanca,user_timezone_central america,user_timezone_central time (us & canada),user_timezone_chennai,user_timezone_dublin,user_timezone_eastern time (us & canada),user_timezone_greenland,user_timezone_hawaii,user_timezone_istanbul,user_timezone_jakarta,user_timezone_kuala lumpur,user_timezone_kuwait,user_timezone_london,user_timezone_madrid,user_timezone_mexico city,user_timezone_mountain time (us & canada),user_timezone_mumbai,user_timezone_muscat,user_timezone_nairobi,user_timezone_new delhi,user_timezone_other,user_timezone_pacific time (us & 
canada),user_timezone_paris,user_timezone_pretoria,user_timezone_quito,user_timezone_riyadh,user_timezone_rome,user_timezone_santiago,user_timezone_seoul,user_timezone_sydney,user_timezone_tokyo,user_timezone_unk,user_timezone_volgograd,avg_daily_profile_visit_duration,num_followers_log,num_people_following_log,num_status_updates_log,num_direct_messages_log,avg_daily_profile_clicks_log,profile_category_business,profile_category_celebrity,profile_category_government,profile_category_unknown,image_html_color,user_name_length_norm,profile_creation_year_norm,profile_creation_month_norm,profile_creation_day_norm,profile_creation_hour_norm,profile_creation_minute_norm,profile_creation_second_norm,num_followers_log_norm,num_people_following_log_norm,num_status_updates_log_norm,num_direct_messages_log_norm,avg_daily_profile_visit_duration_norm,avg_daily_profile_clicks_log_norm,user_name_length_bins,utc_offset_bins,user_language_bins,user_timezone_bins,location_clean_bins,image_html_color_bins,profile_creation_day_bins,profile_creation_hour_bins
0,49I3SOKLI2CMNGP4,_FwKTE4dm,,Set,Not verified,fa0a86,fc37c4,0a0101,False,-18000.0,"200 E Broadway, Suite 3D, New York NY 10002",Enabled,en,Mon Jul 20 21:05:24 +0000 2009,Quito,31528,2148,12926,469,business,13.827,3.4062,49I3SOKLI2CMNGP4.png,361,#8f5ea8,[143 94 168],9,0,1,1,0,0,fuchsia,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,fuchsia,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-18000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,united states,united states,0,0,0,0,0,0,0,0,0,1,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009-07-20 21:05:24+00:00,2009,7,20,21,5,24,Quito,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,13.827,5.184932,3.857493,4.742255,3.120443,1.04577,1,0,0,0,lightslategray,0.538462,0.181818,0.545455,0.633333,0.913043,0.084746,0.40678,0.525031,0.548148,0.569373,0.529079,0.283186,0.529476,1,4,6,4,5.0,4.0,2,2
1,727IRIR59A3P88LK,tDMVTsvWbdpud,,Not set,Not verified,ff0000,e6e6e6,001941,False,36000.0,"ÜT: -28.108029,153.410307",Enabled,en,Wed Aug 05 22:31:34 +0000 2009,Brisbane,42857,262,11678,1806,government,24.019,3.2656,727IRIR59A3P88LK.png,598,#84908b,[132 144 139],13,0,0,1,0,0,red,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,navy,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,other,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"ut: -,",other,0,0,0,0,0,1,0,0,0,0,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009-08-05 22:31:34+00:00,2009,8,5,22,31,34,other,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,24.019,5.337631,2.844119,4.691942,3.772693,1.032151,0,0,1,0,gray,0.846154,0.181818,0.636364,0.133333,0.956522,0.525424,0.576271,0.545345,0.404148,0.561953,0.63967,0.49491,0.52021,1,4,6,5,2.0,4.0,2,2
2,LN95SD15SRPCEE8F,k_TUXA297j,https://blob/5ljjaio2rp.com,Set,Verified,0d0101,000000,000000,False,-25200.0,@happyhippiefdn,Enabled,en,Fri Mar 18 18:36:02 +0000 2011,Pacific Time (US & Canada),40237847,392,9395,59247,unknown,26.011,12.1619,LN95SD15SRPCEE8F.png,1806,#385d55,[56 93 85],10,1,1,0,0,1,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,black,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-25200.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,@happyhippiefdn,other,0,0,0,0,0,1,0,0,0,0,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2011-03-18 18:36:02+00:00,2011,3,18,18,36,2,Pacific Time (US & Canada),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,26.011,8.755317,3.034904,4.584231,5.498835,1.501272,0,0,0,1,darkslategray,0.615385,0.363636,0.181818,0.566667,0.782609,0.610169,0.033898,1.0,0.431259,0.546069,0.932342,0.536291,0.839384,1,5,6,5,0.0,4.0,2,2
3,TB11I7F0PN033D4T,HRDr8yt4s7M,,Set,Verified,0000ff,e0ff92,9ae4e8,False,-25200.0,"Austin, TX",??,en,Sun Apr 08 05:45:46 +0000 2007,Pacific Time (US & Canada),230166,702,10507,2695,celebrity,26.127,2.1131,TB11I7F0PN033D4T.png,2439,#748d9b,[116 141 155],11,0,1,0,0,1,blue,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,silver,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,silver,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-25200.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,united states,united states,0,0,0,0,0,0,0,0,0,1,0,1,2,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2007-04-08 05:45:46+00:00,2007,4,8,5,45,46,Pacific Time (US & Canada),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,26.127,6.17536,3.314014,4.639607,3.968656,0.897577,0,1,0,0,lightslategray,0.692308,0.0,0.272727,0.233333,0.217391,0.762712,0.779661,0.656788,0.47092,0.554235,0.672896,0.538701,0.42865,1,5,6,5,5.0,4.0,1,1
4,32PSGCK5PATHMR07,PXOI0egSsDqrve,https://blob/t3hjJ0c/Mo.com,Set,Not verified,992f09,ddeef6,ffffff,False,,"New Rochelle, NY",Enabled,en,Mon Jun 04 19:51:04 +0000 2012,,1199,1701,1022,8,unknown,17.878,3.966,32PSGCK5PATHMR07.png,259,#497aa2,[ 73 122 162],14,1,1,1,0,0,maroon,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,white,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,unk,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,united states,united states,0,0,0,0,0,0,0,0,0,1,0,1,1,en,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2012-06-04 19:51:04+00:00,2012,6,4,19,51,4,unk,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,17.878,3.573092,3.743443,3.49556,1.342454,1.095769,0,0,0,1,steelblue,0.923077,0.454545,0.454545,0.1,0.826087,0.864407,0.067797,0.310608,0.531942,0.38552,0.227617,0.36734,0.563493,1,3,6,4,5.0,4.0,2,2


## Define the Features that will be Included in Modeling

In [7]:
def selectFeatures(df):
    """Select features to be included in the models

    The one-hot groups are discovered from the columns present in ``df``; the
    remaining names are fixed and are returned whether or not ``df`` has them.

    Args:
        df (DataFrame): Dataframe containing all variables to select from
    Return:
        features_cols (list): List of feature names to include in training dataset

    """
    def columns_containing(token):
        # A one-hot group is every column whose name contains the group's token
        return [col for col in df.columns if token in col]

    # One-hot encoded groups used by the current feature set.
    # Other groups ('profile_text_color_', 'profile_page_color_', 'profile_theme_color_',
    # 'user_timezone_', 'user_lang_', 'location_top_', 'utc_offset_') can be added the
    # same way; note that the token match is a substring test, so 'utc_offset_' and
    # 'user_timezone_' would also pick up the corresponding '*_bins' columns.
    verification_status = columns_containing('verification_status_')
    profile_category = columns_containing('profile_category_')

    # NOTE(review): 'user_name_length' is the raw column while every other entry is a
    # '*_norm' column ('user_name_length_norm' exists in the data) -- confirm intended.
    numerical_norm = [
        'user_name_length',
        'num_followers_log_norm',
        'num_people_following_log_norm',
        'num_status_updates_log_norm',
        'num_direct_messages_log_norm',
        'avg_daily_profile_visit_duration_norm',
        'avg_daily_profile_clicks_log_norm',
    ]
    profile_creation_norm = [
        'profile_creation_year_norm',
        'profile_creation_month_norm',
        'profile_creation_day_norm',
        'profile_creation_hour_norm',
        'profile_creation_minute_norm',
        'profile_creation_second_norm',
    ]
    binary = [
        'personal_url_binary',
        'location_binary',
        'location_public_binary',
        'cover_image_binary',
        'profile_view_size_custom',
    ]
    binned = ['utc_offset_bins', 'user_timezone_bins', 'user_language_bins', 'user_name_length_bins']

    # Return list of features
    features_cols = (numerical_norm + binary + verification_status + profile_category
                     + profile_creation_norm + binned)
    return features_cols

### Common Train and Test Features

In [8]:
# There might be features (after one-hot) that are in test but not in train, or vice-versa
train_features = selectFeatures(df_train_full)
test_features = selectFeatures(df_test_full)

# Keep the train ordering (de-duplicated): iterating a set of strings gives an order
# that can change between kernel restarts, which would reorder the model's columns.
test_feature_set = set(test_features)
common_features = [col for col in dict.fromkeys(train_features) if col in test_feature_set]

# Symmetric difference, so features present only in test are reported as well
discarded_features = np.setxor1d(train_features, test_features)
print(f'Discarded features (in train but not in test or vice versa): {discarded_features}')

Discarded features (in train but not in test or vice versa): []


### Define Features and Target

In [9]:
# Select the features to consider in the feature selection, and define target variable
X_train_full = df_train_full[common_features]
X_test_full = df_test_full[common_features]
y_train_full = df_train_full['num_profile_likes_log']

### Train Test Split

In [10]:
# Split the labelled data into a training part and a held-out part (fixed seed).
# NOTE(review): train_size=0.3 fits on 30% of the rows and holds out 70% --
# confirm this is intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, train_size=0.3, random_state=42)

### Feature Selection
We can use filter-based feature selection to measure the correlation of each feature with the target variable. Doing so, we can narrow down to a selected number of features and discard the ones with negligible impact. The function `featureSelection` takes as input an initial `X_train` and `X_test` and returns the same datasets restricted to the `K` best features.

In [11]:
def featureSelection(X_train, X_test, y_train, scoring, bestK):
    """ Based on a correlation statistic, finds a subset of features to include in the datasets.

    The selector is fitted on the training data only and then applied to both datasets.

    Args:
        X_train (DataFrame): Training dataset with all features.
        X_test (DataFrame): Test dataset with all features.
        y_train (Series): Training target values.
        scoring (function): Correlation statistic tests. (ref: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)
        bestK (int or 'all'): The k best features with highest scores to include in model ('all' to keep every features).
    Return:
        X_train_fs: Training dataset with subset of best features, as returned by
            SelectKBest.transform (a NumPy array by default).
        X_test_fs: Test dataset with the same subset of features, same type as X_train_fs.
        scores (DataFrame): All the features with corresponding scores, highest score first.
    """

    # Feature extraction based on correlations with target
    fs = SelectKBest(score_func=scoring, k=bestK)
    fs.fit(X_train, y_train)

    # Keep only the best K features
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)

    # Build the score table in one shot; growing a frame row by row is quadratic and
    # relies on DataFrame.append, which no longer exists in pandas 2.x.
    scores = (
        pd.DataFrame({'feature': list(X_train.columns), 'score': fs.scores_})
        .sort_values('score', ascending=False)
        .reset_index(drop=True)
    )

    # Return train and test datasets with subset of features, and all the scores
    return X_train_fs, X_test_fs, scores

In [12]:
# Select subset of features to include in the models
bestK = 20
X_train_fs, X_test_fs, scores = featureSelection(X_train, X_test, y_train, f_regression, bestK)

In [13]:
# All the feature swith their scores
# The bestK highest-scoring features with their scores
scores.head(bestK)

Unnamed: 0,feature,score
0,personal_url_binary,699.350805
1,num_people_following_log_norm,439.276886
2,num_status_updates_log_norm,380.10663
3,user_timezone_bins,276.233551
4,utc_offset_bins,208.02196
5,profile_category_unknown,183.442828
6,location_public_binary,156.342123
7,user_language_bins,140.609496
8,verification_status_verified,109.011672
9,avg_daily_profile_clicks_log_norm,89.489571


## Single Runs
Fit and evaluate models on a single training-validation split for quick tests. The `evaluate` function simply calculates the performance of a model by using RMSE or RMSLE. It can be used in both single runs and during cross-validation. The `singleRun` function splits the training set into training and validation for model fitting and evaluation. As the name implies, the model is only evaluated on one specific split of the training dataset.

In [14]:
def evaluate(predictions, y_test, metric):
    """ Function to evaluate the performance of model based on different metrics.

    The score is printed and returned.

    Args:
        predictions (ndarray): The predicted target values.
        y_test (ndarray/Series): The true target values, aligned with predictions.
        metric (string): The metric to compute ('rmse' or 'rmsle').
    Return:
        score (float): The RMSE or RMSLE of the predictions.
    Raises:
        ValueError: If metric is neither 'rmse' nor 'rmsle'.
    """

    # Calculate score
    if metric == 'rmse':
        score = np.sqrt(np.square(predictions - y_test).mean())
    elif metric == 'rmsle':
        # log1p(x) == log(x + 1), computed without precision loss for small x
        score = np.sqrt(np.square(np.log1p(predictions) - np.log1p(y_test)).mean())
    else:
        # Fail with an explicit message instead of a formatting error further down
        raise ValueError(f"Unknown metric '{metric}': expected 'rmse' or 'rmsle'")

    print(f'Score ({metric.upper()}): {score:.3f}')

    return score

In [15]:
def singleRun(model, features, target):
    """ Fit a model on one train/validation split and print its validation RMSE.

    Args:
        model (model): Classifier or regressor model to use in the prediction
        features (ndarray): The training features.
        target (ndarray): The training target values.
    """
    # Hold out 25% of the rows for validation (shuffled, fixed seed)
    split = train_test_split(features, target,
                             test_size=0.25, shuffle=True, random_state=42)
    fit_features, holdout_features, fit_target, holdout_target = split

    # Fit on the training part only
    model.fit(fit_features, fit_target)

    # Predict on the validation part and report the RMSE
    holdout_predictions = model.predict(holdout_features)
    evaluate(holdout_predictions, holdout_target, 'rmse')

## Regressors
Here we can quickly test different regressors without any hyperparameter tuning, just to get an overall idea of the models' baseline performance.

### Ridge Regression

In [16]:
# Baseline: Ridge with default hyperparameters, scored on a single train/validation split
lmbr = Ridge()
singleRun(lmbr, X_train_fs, y_train)

Score (RMSE): 1.842


### Poisson Regression

In [276]:
# Baseline: PoissonRegressor with default hyperparameters, scored on a single train/validation split
pr = PoissonRegressor()
singleRun(pr, X_train_fs, y_train)

Score (RMSE): 1.978


### SVR

In [259]:
# SVR with a degree-4 polynomial kernel and fixed C / coef0, scored on a single train/validation split
svr = SVR(kernel='poly', degree=4, C=15, coef0=2)
singleRun(svr, X_train_fs, y_train)

Score (RMSE): 1.856


### Decision Tree

In [50]:
# Baseline decision tree; random_state fixes the tie-breaking between equally good splits
# so the score is reproducible across runs
dt = DecisionTreeRegressor(random_state=42)
singleRun(dt, X_train_fs, y_train)

Score (RMSE): 2.507


### Random Forest

In [51]:
# Baseline random forest; random_state fixes the bootstrap samples and feature draws
# so the score is reproducible across runs
rf = RandomForestRegressor(random_state=42)
singleRun(rf, X_train_fs, y_train)

Score (RMSE): 1.737


### AdaBoost

In [52]:
# Baseline AdaBoost; random_state makes the boosting resampling reproducible across runs
ab = AdaBoostRegressor(random_state=42)
singleRun(ab, X_train_fs, y_train)

Score (RMSE): 1.984


### Bagging

In [108]:
# Baseline bagging ensemble; random_state fixes the bootstrap samples so the score is reproducible
br = BaggingRegressor(random_state=42)
singleRun(br, X_train_fs, y_train)

Score (RMSE): 1.823


### XGBoost

In [109]:
# Shallow XGBoost model with a Tweedie objective; subsample=0.7 draws a random 70% of the
# rows per tree, so random_state is set to keep the score reproducible
xgb = XGBRegressor(subsample=0.7, max_depth=3, n_estimators=50, objective='reg:tweedie',
                   verbosity=1, random_state=42)
singleRun(xgb, X_train_fs, y_train)

Score (RMSE): 1.756


## Cross-Validation
Now, instead of doing a single test of the models, we can do k-fold Cross-Validation. The function `KFoldCV` fits a model using the training set and evaluates the model's performance `K` times, each time using a different validation set. Of course, we can also use `cross_val_score()` to accomplish the same thing. 

For pros and cons, ref: https://stackoverflow.com/questions/60938685/what-is-the-difference-between-getting-score-value-from-kfold-fit-score-vs-using#:~:text=A%20limitation%20of%20cross_val_score%20is,necessary%20to%20use%20for%20%2B%20kfold%20.

In [19]:
def KFoldCV(model, k, X, y, metric, shuffle=False, random_state=None):
    """ Custom function to carry out K-Fold Cross-Validation on a training dataset manually.
    It allows for more flexibility as compared to cross_val_score().

    The model is refit on every training fold and scored on the matching validation
    fold through ``evaluate``; the mean and spread of the fold scores are printed.

    Args:
        model (model): Model used for prediction. It is refit in place, so after the
            call it holds the fit from the last fold.
        k (int): The number of folds in CV.
        X (DataFrame/ndarray): Training dataset.
        y (Series/DataFrame/ndarray): Training target values.
        metric (string): The metric used to measure model performance ('rmse' or 'rmsle').
            Forwarded unchanged to ``evaluate``.
        shuffle (bool): Whether to shuffle the rows before splitting. Defaults to False
            (contiguous folds), which preserves the previous behaviour.
        random_state (int/None): Seed for the shuffle. Only used when ``shuffle`` is True.
    """
    # KFold rejects a seed when shuffling is off, so only forward it when it has an effect.
    kf = KFold(n_splits=k, shuffle=shuffle, random_state=random_state if shuffle else None)

    # Positional indexing below (X[train_index]) needs arrays: on a pandas object the
    # same syntax would select columns / labels instead of rows.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.to_numpy()

    print('Manual K-Fold Cross-Validation:')

    scores = []
    for train_index, valid_index in kf.split(X):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # Refit from scratch on this fold's training part, then score on its validation part.
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)
        scores.append(evaluate(predictions, y_valid, metric))

    scores = np.array(scores)
    # The scores are errors (lower is better), so label them with the metric name
    # rather than as an accuracy. The spread shown is two standard deviations.
    print(f'Mean {str(metric).upper()}: {scores.mean():0.3f} (+/- {scores.std() * 2:0.3f})')
    print('========================================')

In [18]:
def sklearnKFoldCV(model, k, X, y, metric):
    """ Carry out K-Fold Cross-Validation with sklearn's cross_val_score() and print the scores.

    Folds are shuffled with a fixed seed (42), so repeated calls use the same splits.

    Args:
        model (model): Model used for prediction.
        k (int): The number of folds in CV.
        X (DataFrame/ndarray): Training dataset.
        y (Series/ndarray): Training target values.
        metric (string): The metric used to measure model performance. Only 'rmse'
            is currently supported.

    Raises:
        ValueError: If ``metric`` is not a supported metric name.
    """
    # Map the notebook's metric names onto sklearn scorer names. Validate first so an
    # unsupported metric fails with a clear message instead of an unbound-variable error.
    scorers = {'rmse': 'neg_root_mean_squared_error'}
    if metric not in scorers:
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of {sorted(scorers)}"
        )
    scoring_metric = scorers[metric]
    label = metric.upper()

    print('Sklearn K-Fold Cross-Validation:')

    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    # sklearn scorers follow "higher is better", so the RMSE comes back negated; flip the sign.
    scores = np.abs(cross_val_score(model, X, y, scoring=scoring_metric, cv=cv))
    for score in scores:
        print(f'Score ({label}): {score:.3f}')
    # The scores are errors (lower is better), so label them with the metric name
    # rather than as an accuracy. The spread shown is two standard deviations.
    print(f'Mean {label}: {scores.mean():0.3f} (+/- {scores.std() * 2:0.3f})')

### Ridge Regression

In [20]:
# Ridge regression solved with SAGA, compared under both cross-validation helpers.
# SAGA is a stochastic solver, so the seed is fixed to keep the fold scores reproducible
# on Restart & Run All (42 matches the seed used for the data splits in this notebook).
lmbr = Ridge(alpha=1, solver='saga', random_state=42)
KFoldCV(lmbr, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(lmbr, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.809
Score (RMSE): 1.751
Score (RMSE): 1.775
Score (RMSE): 1.933
Score (RMSE): 1.864
Mean Accuracy: 1.827 (+/- 0.131)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.834
Score (RMSE): 1.739
Score (RMSE): 1.776
Score (RMSE): 1.957
Score (RMSE): 1.785
Mean Accuracy: 1.818 (+/- 0.151)


In [244]:
# Refit Ridge on the whole feature-selected training set, then score (RMSE) on
# X_test_fs / y_test (presumably the held-out split built earlier in the notebook).
lmbr.fit(X_train_fs, y_train)
predictions = lmbr.predict(X_test_fs)
evaluate(predictions, y_test, 'rmse')

Score (RMSE): 1.826


1.8255169945408902

### Poisson Regression

In [245]:
# Poisson regression with a raised iteration cap (max_iter=500), compared under both
# cross-validation helpers with 5 folds.
pr = PoissonRegressor(fit_intercept=True, max_iter=500)
KFoldCV(pr, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(pr, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.902
Score (RMSE): 1.938
Score (RMSE): 1.965
Score (RMSE): 2.123
Score (RMSE): 2.000
Mean Accuracy: 1.986 (+/- 0.151)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.974
Score (RMSE): 1.953
Score (RMSE): 1.974
Score (RMSE): 2.093
Score (RMSE): 1.925
Mean Accuracy: 1.984 (+/- 0.115)


In [246]:
# Refit Poisson regression on the whole feature-selected training set, then score (RMSE) on
# X_test_fs / y_test (presumably the held-out split built earlier in the notebook).
pr.fit(X_train_fs, y_train)
predictions = pr.predict(X_test_fs)
evaluate(predictions, y_test, 'rmse')

Score (RMSE): 2.041


2.0409089622400165

### SVR

In [292]:
# Randomized hyperparameter search for SVR: 50 configurations sampled from hyper_space,
# each scored by negated RMSE on a shuffled, seeded 5-fold split, using all CPU cores.
# NOTE(review): `degree` and `coef0` are also sampled for the linear kernel, where
# scikit-learn documents them as having no effect, so some of the 50 draws are likely
# equivalent to each other — confirm and consider a per-kernel search space.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
hyper_space = {'kernel':['linear', 'poly'], 'degree':[2,3,4], 'C':[0.1, 1, 10, 25], 'coef0':[0,1,2,5,10]}
svr_search = RandomizedSearchCV(SVR(), hyper_space, n_iter=50, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv, random_state=1)
result = svr_search.fit(X_train_fs, y_train)

In [294]:
# The search was scored with 'neg_root_mean_squared_error', so best_score_ is a negated RMSE
# (closer to zero is better). The last expression displays the winning hyperparameters.
print(result.best_score_)
result.best_params_

-1.8319763151717599


{'kernel': 'linear', 'degree': 3, 'coef0': 1, 'C': 25}

In [296]:
# SVR rebuilt with the best parameters reported by the randomized search, compared under
# both cross-validation helpers with 5 folds.
# NOTE(review): with kernel='linear', scikit-learn documents `degree` and `coef0` as unused;
# they are presumably kept only to mirror best_params_ — confirm.
svr = SVR(kernel='linear', degree=3, C=25, coef0=1)
KFoldCV(svr, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(svr, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.791
Score (RMSE): 1.750
Score (RMSE): 1.809
Score (RMSE): 1.943
Score (RMSE): 1.894
Mean Accuracy: 1.837 (+/- 0.141)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.844
Score (RMSE): 1.758
Score (RMSE): 1.807
Score (RMSE): 1.965
Score (RMSE): 1.785
Mean Accuracy: 1.832 (+/- 0.144)


In [297]:
# Refit SVR on the whole feature-selected training set, then score (RMSE) on
# X_test_fs / y_test (presumably the held-out split built earlier in the notebook).
svr.fit(X_train_fs, y_train)
predictions = svr.predict(X_test_fs)
evaluate(predictions, y_test, 'rmse')

Score (RMSE): 1.843


1.8427039321731264

In [238]:
# Refit SVR on X_train_full / y_train_full and predict X_test_full; the result is kept in
# svr_predictions so it can be selected in the submission cell.
# NOTE(review): X_train_full / X_test_full are defined outside this section — presumably the
# complete labelled data and the competition test set; confirm.
svr.fit(X_train_full, y_train_full)
svr_predictions = svr.predict(X_test_full)

### Decision Tree

In [235]:
# Shallow regression tree (depth <= 5, at least 5 samples to split), compared under both
# cross-validation helpers with 5 folds.
# The split criterion is left at its default, which is the squared-error criterion: the old
# spelling 'mse' was removed in scikit-learn 1.2, while the default works on every version.
# The seed keeps the fold scores reproducible on Restart & Run All.
dt = DecisionTreeRegressor(max_depth=5, min_samples_split=5, random_state=42)
KFoldCV(dt, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(dt, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.838
Score (RMSE): 1.933
Score (RMSE): 1.948
Score (RMSE): 2.005
Score (RMSE): 2.071
Mean Accuracy: 1.959 (+/- 0.155)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.969
Score (RMSE): 1.840
Score (RMSE): 1.915
Score (RMSE): 2.165
Score (RMSE): 1.860
Mean Accuracy: 1.950 (+/- 0.233)


### Random Forest

In [25]:
# Small, shallow random forest (30 trees, depth <= 4), compared under both
# cross-validation helpers with 5 folds.
# Bootstrap sampling makes the forest stochastic; the seed keeps the fold scores, and the
# predictions later derived from this model, reproducible on Restart & Run All.
rf = RandomForestRegressor(n_estimators=30, max_depth=4, min_samples_split=5, random_state=42)
KFoldCV(rf, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(rf, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.740
Score (RMSE): 1.791
Score (RMSE): 1.860
Score (RMSE): 1.946
Score (RMSE): 1.969
Mean Accuracy: 1.861 (+/- 0.175)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.847
Score (RMSE): 1.734
Score (RMSE): 1.830
Score (RMSE): 2.003
Score (RMSE): 1.775
Mean Accuracy: 1.838 (+/- 0.184)


In [248]:
# Refit the random forest on the whole feature-selected training set, then score (RMSE) on
# X_test_fs / y_test (presumably the held-out split built earlier in the notebook).
rf.fit(X_train_fs, y_train)
predictions = rf.predict(X_test_fs)
evaluate(predictions, y_test, 'rmse')

Score (RMSE): 1.785


1.7852962920386277

In [249]:
# Refit the random forest on X_train_full / y_train_full and predict X_test_full; the result
# is kept in rf_predictions, which the submission cell selects.
# NOTE(review): X_train_full / X_test_full are defined outside this section — presumably the
# complete labelled data and the competition test set; confirm.
rf.fit(X_train_full, y_train_full)
rf_predictions = rf.predict(X_test_full)

### AdaBoost

In [21]:
# AdaBoost over depth-8 regression trees with a small learning rate and a fixed seed,
# compared under both cross-validation helpers with 5 folds.
# The base tree is passed positionally because its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 (and the old name was removed in 1.4); the positional
# form works on both sides of that rename.
ab = AdaBoostRegressor(DecisionTreeRegressor(max_features=None, max_depth=8),
                       learning_rate=0.01, random_state=42)
KFoldCV(ab, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(ab, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.703
Score (RMSE): 1.807
Score (RMSE): 1.812
Score (RMSE): 1.868
Score (RMSE): 1.970
Mean Accuracy: 1.832 (+/- 0.174)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.779
Score (RMSE): 1.610
Score (RMSE): 1.849
Score (RMSE): 1.994
Score (RMSE): 1.743
Mean Accuracy: 1.795 (+/- 0.253)


In [84]:
# Refit AdaBoost on the whole feature-selected training set, then score (RMSE) on
# X_test_fs / y_test (presumably the held-out split built earlier in the notebook).
ab.fit(X_train_fs, y_train)
predictions = ab.predict(X_test_fs)
evaluate(predictions, y_test, 'rmse')

Score (RMSE): 1.820


1.8200477033130509

In [20]:
# Refit AdaBoost on X_train_full / y_train_full and predict X_test_full; the result is kept
# in ab_predictions so it can be selected in the submission cell.
# NOTE(review): X_train_full / X_test_full are defined outside this section — presumably the
# complete labelled data and the competition test set; confirm.
ab.fit(X_train_full, y_train_full)
ab_predictions = ab.predict(X_test_full)

### Bagging

In [33]:
# Bagging ensemble with default hyperparameters, compared under both cross-validation
# helpers with 5 folds.
# Bootstrap sampling makes the ensemble stochastic; the seed keeps the fold scores
# reproducible on Restart & Run All (42 matches the seed used for the data splits).
br = BaggingRegressor(random_state=42)
KFoldCV(br, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(br, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.808
Score (RMSE): 1.837
Score (RMSE): 1.797
Score (RMSE): 1.835
Score (RMSE): 1.902
Mean Accuracy: 1.836 (+/- 0.073)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.883
Score (RMSE): 1.805
Score (RMSE): 1.848
Score (RMSE): 1.786
Score (RMSE): 1.867
Mean Accuracy: 1.838 (+/- 0.074)


### XGBoost

In [22]:
# XGBoost with a Tweedie objective and hand-picked hyperparameters (25 depth-2 trees,
# row/column subsampling, regularisation terms), compared under both cross-validation
# helpers with 5 folds.
xgb = XGBRegressor(subsample=0.6, max_depth=2, n_estimators=25, objective='reg:tweedie', tweedie_variance_power=1.2, 
                   eta=0.3, min_child_weight=5, colsample_bytree=.85, colsample_bylevel=.95, colsample_bynode=1,
                   reg_lambda=3, reg_alpha=3, max_delta_step=4, verbosity=1)
KFoldCV(xgb, 5, X_train_fs, y_train, 'rmse')
sklearnKFoldCV(xgb, 5, X_train_fs, y_train, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.841
Score (RMSE): 1.746
Score (RMSE): 1.803
Score (RMSE): 1.908
Score (RMSE): 1.809
Mean Accuracy: 1.821 (+/- 0.106)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.860
Score (RMSE): 1.689
Score (RMSE): 1.764
Score (RMSE): 2.035
Score (RMSE): 1.799
Mean Accuracy: 1.829 (+/- 0.233)


In [194]:
# Refit XGBoost on the whole feature-selected training set, then score (RMSE) on
# X_test_fs / y_test (presumably the held-out split built earlier in the notebook).
xgb.fit(X_train_fs, y_train)
predictions = xgb.predict(X_test_fs)
evaluate(predictions, y_test, 'rmse')

Score (RMSE): 1.805


1.8054333452687517

In [195]:
# Refit XGBoost on X_train_full / y_train_full and predict X_test_full; the result is kept
# in xgb_predictions so it can be selected in the submission cell.
# NOTE(review): X_train_full / X_test_full are defined outside this section — presumably the
# complete labelled data and the competition test set; confirm.
xgb.fit(X_train_full, y_train_full)
xgb_predictions = xgb.predict(X_test_full)

## Prepare Submission File

In [250]:
# Choose which model's predictions go into the submission.
log_predictions = rf_predictions

# Map the predictions back to the original scale by undoing a log(1 + x) transform.
# NOTE(review): presumably the target was built as log(1 + likes) during preprocessing — confirm.
# np.expm1 computes exp(x) - 1 without the precision loss that the explicit subtraction
# suffers for values close to zero.
predictions = np.expm1(log_predictions)

# One row per test Id with its predicted value.
submission = pd.DataFrame({'Id': df_test['Id'], 'Predicted': predictions})

In [251]:
# Display the submission frame (Id, Predicted) as a visual sanity check before export.
submission

Unnamed: 0,Id,Predicted
0,49I3SOKLI2CMNGP4,3767.186930
1,727IRIR59A3P88LK,2632.635560
2,LN95SD15SRPCEE8F,194.398818
3,TB11I7F0PN033D4T,2995.236204
4,32PSGCK5PATHMR07,182.438755
...,...,...
2495,7SDJE48EFRPPNEJK,128.041833
2496,PRT8RDNG6E86518P,3414.902886
2497,SRNIBIK27BQ2M3PB,4092.923394
2498,6CP232J9R8N84702,1206.001784


In [261]:
# NOTE(review): presumably the submission from a run that included image-derived features;
# the execution count is out of order, so this output comes from a different run — confirm.
submission # with image

Unnamed: 0,Id,Predicted
0,49I3SOKLI2CMNGP4,2304.068836
1,727IRIR59A3P88LK,3142.269289
2,LN95SD15SRPCEE8F,53.796720
3,TB11I7F0PN033D4T,2569.711134
4,32PSGCK5PATHMR07,171.302285
...,...,...
2495,7SDJE48EFRPPNEJK,96.702851
2496,PRT8RDNG6E86518P,4262.998072
2497,SRNIBIK27BQ2M3PB,4149.794840
2498,6CP232J9R8N84702,961.786669


In [73]:
# NOTE(review): re-display of `submission` whose execution count is out of order, so the output
# below comes from a different run — presumably kept for comparison; confirm or remove.
submission

Unnamed: 0,Id,Predicted
0,49I3SOKLI2CMNGP4,3044.006715
1,727IRIR59A3P88LK,3219.120654
2,LN95SD15SRPCEE8F,38.978772
3,TB11I7F0PN033D4T,3295.636021
4,32PSGCK5PATHMR07,129.458775
...,...,...
2495,7SDJE48EFRPPNEJK,94.791835
2496,PRT8RDNG6E86518P,5340.517952
2497,SRNIBIK27BQ2M3PB,4496.916860
2498,6CP232J9R8N84702,1079.672196


In [36]:
# NOTE(review): re-display of `submission` whose execution count is out of order, so the output
# below comes from a different run — presumably kept for comparison; confirm or remove.
submission

Unnamed: 0,Id,Predicted
0,49I3SOKLI2CMNGP4,2506.916613
1,727IRIR59A3P88LK,3453.149283
2,LN95SD15SRPCEE8F,54.826055
3,TB11I7F0PN033D4T,2640.461249
4,32PSGCK5PATHMR07,110.090628
...,...,...
2495,7SDJE48EFRPPNEJK,98.276235
2496,PRT8RDNG6E86518P,5012.459819
2497,SRNIBIK27BQ2M3PB,4715.141227
2498,6CP232J9R8N84702,915.006561


### Export Submission File

In [252]:
# Write the submission to a CSV in the working directory, without the DataFrame index column.
submission.to_csv('submission_RF_014-3.csv', index=False)