In [37]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel

# To make the notebook's output stable across runs
random_seed = 12182017
np.random.seed(random_seed)

def rmsle(y, y_pred):
    '''
    A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
    '''
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [2]:
# imports the data
data = {
    'tra': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\air_visit_data.csv'),
    'as': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\air_store_info.csv'),
    'hs': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\hpg_store_info.csv'),
    'ar': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\air_reserve.csv'),
    'hr': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\hpg_reserve.csv'),
    'id': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\store_id_relation.csv'),
    'tes': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\sample_submission.csv'),
    'hol': pd.read_csv(r'C:\Users\Adrian\Google Drive\Datasets\Restaurant-Visitors\date_info.csv').rename(columns={'calendar_date': 'visit_date'})
}

In [3]:
# Keeps only hpg reservation data that uses the air system
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

In [4]:
# Process the air and hpg reservation data
for df in ['ar', 'hr']:
    # Converts 'visit_datetime' to proper date_time format
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime'])
    # Converts 'visit_datetime' to date in proper format
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    # Converts 'reserve_datetime' to proper date_time format
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    # Converts 'reserve_datetime' to date in proper format
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    # Calculates the difference in days between when the reservation and visit is made (RVD)
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    # Calculates the total number of reserve visitors for each 'air_store_id' on each 'visit_date
    # Calculates the RVD for each 'air_store_id' on each 'visit_date
    data[df] = data[df].groupby(['air_store_id', 'visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'visit_datetime': 'visit_date'})
    print(data[df].head())
    print('')

           air_store_id  visit_date  reserve_datetime_diff  reserve_visitors
0  air_00a91d42b08b08d9  2016-10-31                      0                 2
1  air_00a91d42b08b08d9  2016-12-05                      4                 9
2  air_00a91d42b08b08d9  2016-12-14                      6                18
3  air_00a91d42b08b08d9  2016-12-17                      6                 2
4  air_00a91d42b08b08d9  2016-12-20                      2                 4

           air_store_id  visit_date  reserve_datetime_diff  reserve_visitors
0  air_00a91d42b08b08d9  2016-01-14                      3                 2
1  air_00a91d42b08b08d9  2016-01-15                      6                 4
2  air_00a91d42b08b08d9  2016-01-16                      3                 2
3  air_00a91d42b08b08d9  2016-01-22                      3                 2
4  air_00a91d42b08b08d9  2016-01-29                      6                 5



In [5]:
# tra is short for training
# Converts 'visit_date' to date in proper format
data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
# Gets the numeric day of the week (dow) for each 'visit_date'
data['tra']['dow'] = data['tra']['visit_date'].dt.dayofweek
# Gets the year for each 'visit_date'
data['tra']['year'] = data['tra']['visit_date'].dt.year
# Gets the numeric month for each 'visit_date'
data['tra']['month'] = data['tra']['visit_date'].dt.month
# Converts 'visit_date' to proper date format
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

data['tra'].head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1


In [6]:
# tes is short for testing
# Gets the 'visit_date' from the 'id' 
data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
# Gets the 'air_store_id' from the 'id' 
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
# Converts 'visit_date' to date in proper format
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
# Gets the numeric day of the week (dow) for each 'visit_date'
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
# Gets the year for each 'visit_date'
data['tes']['year'] = data['tes']['visit_date'].dt.year
# Gets the numeric month for each 'visit_date'
data['tes']['month'] = data['tes']['visit_date'].dt.month
# Converts 'visit_date' to proper date format
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

data['tes'].head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4


In [7]:
# Gets all the unique 'air_store_id' in the testing set
unique_stores = data['tes']['air_store_id'].unique()
# Creates a dataframe that for each unique 'air_store_id' has a dow
stores = pd.concat(
    [
        pd.DataFrame({
            'air_store_id': unique_stores,
            'dow': [i] * len(unique_stores)
        }) for i in range(7)
    ],
    axis=0, ignore_index=True).reset_index(drop=True)

stores.head()

Unnamed: 0,air_store_id,dow
0,air_00a91d42b08b08d9,0
1,air_0164b9927d20bcc3,0
2,air_0241aa3964b7f861,0
3,air_0328696196e46f18,0
4,air_034a3d5b40d5b1b1,0


In [8]:
# For each combination of 'air_store_id' and 'dow' in the training set the 
# min, mean, median, max, count, and varianace is calculated. Then merged into the stores dataset.

tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].min().rename(columns={
        'visitors': 'min_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].mean().rename(columns={
        'visitors': 'mean_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].median().rename(columns={
        'visitors': 'median_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].max().rename(columns={
        'visitors': 'max_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].count().rename(columns={
        'visitors': 'count_observations'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].var().rename(columns={
        'visitors': 'var_visitors'
    })   
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,var_visitors
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,88.843697
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,27.315789
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,22.945212
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,50.628788
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,134.009009


In [9]:
# Adds air store information (genre_name, lat, lut)
stores = pd.merge(stores, data['as'], how='left', on=['air_store_id'])

stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,var_visitors,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,88.843697,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,27.315789,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,22.945212,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,50.628788,Dining bar,Ōsaka-fu Ōsaka-shi Nakanochō,34.701279,135.52809
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,134.009009,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229


In [10]:
# Process date_info.csv
# Corrects visit_date
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date

In [11]:
# Merges in date_info data
train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date'])
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date'])

train = pd.merge(data['tra'], stores, how='left', on=['air_store_id', 'dow'])
test = pd.merge(data['tes'], stores, how='left', on=['air_store_id', 'dow'])

In [12]:
for df in ['ar', 'hr']:
    train = pd.merge(
        train, data[df], how='left', on=['air_store_id', 'visit_date'])
    test = pd.merge(
        test, data[df], how='left', on=['air_store_id', 'visit_date'])

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252108 entries, 0 to 252107
Data columns (total 20 columns):
air_store_id               252108 non-null object
visit_date                 252108 non-null object
visitors                   252108 non-null int64
dow                        252108 non-null int64
year                       252108 non-null int64
month                      252108 non-null int64
min_visitors               250468 non-null float64
mean_visitors              250468 non-null float64
median_visitors            250468 non-null float64
max_visitors               250468 non-null float64
count_observations         250468 non-null float64
var_visitors               250420 non-null float64
air_genre_name             250468 non-null object
air_area_name              250468 non-null object
latitude                   250468 non-null float64
longitude                  250468 non-null float64
reserve_datetime_diff_x    28064 non-null float64
reserve_visitors_x         28064 no

In [15]:
train.drop(['reserve_datetime_diff_x', 'reserve_visitors_x', 'reserve_datetime_diff_y', 'reserve_visitors_y'], axis=1, inplace=True)

In [20]:
restaurant_df = train.copy()

restaurant_df['alcohol_served'] = np.NaN
restaurant_df['food_served'] = np.NaN
restaurant_df['themed_establishment'] = np.NaN
restaurant_df['asian_establishment'] = np.NaN

air_genres = list(restaurant_df['air_genre_name'].value_counts().index.values)
for genre in air_genres:
    # 'alcohol_served''
    if (genre in ['Izakaya', 'Dining bar', 'Bar/Cocktail',  'Karaoke/Party']): 
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'alcohol_served'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'alcohol_served'] = 0  
    # 'food_served'
    if (genre in ['Izakaya', 'Cafe/Sweets', 'Dining bar', 'Italian/French', 'Japanese food', 'Yakiniku/Korean food', 
                 'Western food', 'Creative cuisine', 'Okonomiyaki/Monja/Teppanyaki', 'International cuisine']):
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'food_served'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'food_served'] = 0
    # 'themed_establishment'
    if (genre in ['Karaoke/Party']):
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'themed_establishment'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'themed_establishment'] = 0  
    # 'asian_establishment'
    if (genre in ['Izakaya', 'Japanese food', 'Yakiniku/Korean food', 'Okonomiyaki/Monja/Teppanyaki', 'Asian', 'Karaoke/Party']): 
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'asian_establishment'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'asian_establishment'] = 0
        
train = restaurant_df.copy()        

In [19]:
train = restaurant_df.copy()        

In [21]:
train.drop(['air_genre_name', 'air_area_name'], axis=1, inplace=True)

In [22]:
restaurant_df = test.copy()

restaurant_df['alcohol_served'] = np.NaN
restaurant_df['food_served'] = np.NaN
restaurant_df['themed_establishment'] = np.NaN
restaurant_df['asian_establishment'] = np.NaN

air_genres = list(restaurant_df['air_genre_name'].value_counts().index.values)
for genre in air_genres:
    # 'alcohol_served''
    if (genre in ['Izakaya', 'Dining bar', 'Bar/Cocktail',  'Karaoke/Party']): 
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'alcohol_served'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'alcohol_served'] = 0  
    # 'food_served'
    if (genre in ['Izakaya', 'Cafe/Sweets', 'Dining bar', 'Italian/French', 'Japanese food', 'Yakiniku/Korean food', 
                 'Western food', 'Creative cuisine', 'Okonomiyaki/Monja/Teppanyaki', 'International cuisine']):
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'food_served'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'food_served'] = 0
    # 'themed_establishment'
    if (genre in ['Karaoke/Party']):
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'themed_establishment'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'themed_establishment'] = 0  
    # 'asian_establishment'
    if (genre in ['Izakaya', 'Japanese food', 'Yakiniku/Korean food', 'Okonomiyaki/Monja/Teppanyaki', 'Asian', 'Karaoke/Party']): 
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'asian_establishment'] = 1
    else:
        restaurant_df.loc[restaurant_df['air_genre_name'] == genre, 'asian_establishment'] = 0
        
test = restaurant_df.copy() 
test.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,min_visitors,mean_visitors,median_visitors,...,latitude,longitude,reserve_datetime_diff_x,reserve_visitors_x,reserve_datetime_diff_y,reserve_visitors_y,alcohol_served,food_served,themed_establishment,asian_establishment
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,2.0,2.0,2.0,...,35.694003,139.753595,,,,,0.0,1.0,0.0,0.0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,1.0,22.457143,19.0,...,35.694003,139.753595,,,,,0.0,1.0,0.0,0.0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,1.0,24.35,24.5,...,35.694003,139.753595,,,,,0.0,1.0,0.0,0.0
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,15.0,28.125,28.0,...,35.694003,139.753595,,,,,0.0,1.0,0.0,0.0
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,15.0,29.868421,30.0,...,35.694003,139.753595,,,,,0.0,1.0,0.0,0.0


In [24]:
test.drop(['reserve_datetime_diff_x', 'reserve_visitors_x', 'reserve_datetime_diff_y',
            'reserve_visitors_y', 'air_genre_name', 'air_area_name', 'visit_date', 'air_store_id'], axis=1, inplace=True)
test.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,var_visitors,latitude,longitude,alcohol_served,food_served,themed_establishment,asian_establishment
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,2.0,2.0,2.0,2.0,1.0,,35.694003,139.753595,0.0,1.0,0.0,0.0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,1.0,22.457143,19.0,47.0,35.0,88.843697,35.694003,139.753595,0.0,1.0,0.0,0.0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,1.0,24.35,24.5,43.0,40.0,82.694872,35.694003,139.753595,0.0,1.0,0.0,0.0
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,15.0,28.125,28.0,52.0,40.0,78.88141,35.694003,139.753595,0.0,1.0,0.0,0.0
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,15.0,29.868421,30.0,47.0,38.0,51.036273,35.694003,139.753595,0.0,1.0,0.0,0.0


In [29]:
test.head()

Unnamed: 0,id,visitors,dow,year,month,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,var_visitors,latitude,longitude,alcohol_served,food_served,themed_establishment,asian_establishment
0,air_00a91d42b08b08d9_2017-04-23,0,6,2017,4,2.0,2.0,2.0,2.0,1.0,,35.694003,139.753595,0.0,1.0,0.0,0.0
1,air_00a91d42b08b08d9_2017-04-24,0,0,2017,4,1.0,22.457143,19.0,47.0,35.0,88.843697,35.694003,139.753595,0.0,1.0,0.0,0.0
2,air_00a91d42b08b08d9_2017-04-25,0,1,2017,4,1.0,24.35,24.5,43.0,40.0,82.694872,35.694003,139.753595,0.0,1.0,0.0,0.0
3,air_00a91d42b08b08d9_2017-04-26,0,2,2017,4,15.0,28.125,28.0,52.0,40.0,78.88141,35.694003,139.753595,0.0,1.0,0.0,0.0
4,air_00a91d42b08b08d9_2017-04-27,0,3,2017,4,15.0,29.868421,30.0,47.0,38.0,51.036273,35.694003,139.753595,0.0,1.0,0.0,0.0


In [30]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [33]:
X = train.drop('visitors', axis=1).copy()
y = train['visitors'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=221986)

print('X_train shape: {}'.format(X_train.shape))
print('y_test shape: {}'.format(y_test.shape))
print()
print('X_test shape: {}'.format(X_test.shape))
print('y_train shape: {}'.format(y_test.shape))

X_train shape: (201686, 15)
y_test shape: (50422,)

X_test shape: (50422, 15)
y_train shape: (50422,)


In [68]:
# Uses np.lop1p()
regr = DecisionTreeRegressor(random_state=221986, max_depth=7, max_features='auto', criterion='friedman_mse', splitter='best')
regr.fit(X_train, np.log1p(y_train))

print('Train Set RMSLE: {:.4f}'.format(rmsle(y_train, np.expm1((regr.predict(X_train))))))
print('Test Set RMSLE: {:.4f}'.format(rmsle(y_test, np.expm1((regr.predict(X_test))))))

Train Set RMSLE: 0.5189
Test Set RMSLE: 0.5263


In [54]:
# Print the feature ranking
importances = regr.feature_importances_
indices = np.argsort(importances)[::-1]
print('Features Importance')
for f in range(X_train.shape[1]):
    print("%d. %s (%f)" % (f + 1, X_train.columns[indices[f]], importances[indices[f]]))

Features Importance
1. median_visitors (0.804786)
2. mean_visitors (0.168614)
3. max_visitors (0.018722)
4. var_visitors (0.003150)
5. month (0.002818)
6. min_visitors (0.000942)
7. year (0.000531)
8. dow (0.000244)
9. latitude (0.000130)
10. count_observations (0.000063)
11. asian_establishment (0.000000)
12. themed_establishment (0.000000)
13. food_served (0.000000)
14. alcohol_served (0.000000)
15. longitude (0.000000)


In [72]:
# Uses np.lop1p()
regr = DecisionTreeRegressor(random_state=221986, max_depth=7, max_features='auto', criterion='mse', splitter='best')
regr.fit(X, np.log1p(y))

print('Train Set RMSLE: {:.4f}'.format(rmsle(y, np.expm1((regr.predict(X))))))

Train Set RMSLE: 0.5205


In [73]:
# Regression Forest with selected features, scaled and np.log1p()
rand_forest_reg = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features='auto', criterion='mse', n_jobs=-1)
rand_forest_reg.fit(X, np.log1p(y))

print('Train Set RMSLE: {:.4f}'.format(rmsle(y, np.expm1(rand_forest_reg.predict(X)))))

Train Set RMSLE: 0.5180


In [75]:
test.drop('visitors', axis=1, inplace=True)

In [79]:
test.set_index('id', inplace=True)

In [80]:
submission = pd.DataFrame()
submission['id'] = test.index
submission['visitors'] = np.expm1(rand_forest_reg.predict(test))

submission.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,1.878633
1,air_00a91d42b08b08d9_2017-04-24,19.711947
2,air_00a91d42b08b08d9_2017-04-25,22.203451
3,air_00a91d42b08b08d9_2017-04-26,26.632255
4,air_00a91d42b08b08d9_2017-04-27,28.388238


In [81]:
submission.to_csv('submission.csv', index=False)