# 1. Required libraries

In [1]:
import datetime
import feature_engg as engg
import feature_util as utl
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn import metrics
from sklearn import tree
from sklearn import linear_model
from sklearn import model_selection
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings
# raised by later cells; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# 2. Load data

In [14]:
# World-cities lookup table: keep only the join keys and the coordinates.
df_cities = pd.read_csv('../data/world_cities.txt', sep=",", encoding='latin1')[['country', 'city', 'lat', 'lon']]

# Events, with every column namespaced by an 'e_' prefix.
df_events = utl.get_df('../data/events.json', encoding='latin1')
df_events.columns = ['e_' + label for label in map(str, df_events.columns)]

# Groups, with every column namespaced by a 'g_' prefix.
df_groups = utl.get_df('../data/groups.json', encoding='utf-8')
df_groups.columns = ['g_' + label for label in map(str, df_groups.columns)]

# Users: city names are lower-cased; prefixing happens later, after the city join.
df_users = utl.get_df('../data/users.json', encoding='utf-8')
df_users['city'] = df_users['city'].str.lower()

# Venues: lower-case the city first, then namespace the columns with 'v_'.
df_venues = utl.get_df('../data/venues.json', encoding='utf-8')
df_venues['city'] = df_venues['city'].str.lower()
df_venues.columns = ['v_' + label for label in map(str, df_venues.columns)]

<type 'list'>
<type 'list'>
<type 'list'>
<type 'list'>


In [15]:
# Attach city coordinates to every user and keep exactly one row per user.
# sort_values replaces DataFrame.sort, which was deprecated in pandas 0.17 and
# later removed from the library.
df_users = df_users.sort_values('user_id', ascending=True)
# Left join keeps users whose (country, city) has no match; their lat/lon stay null.
# NOTE(review): user cities were lower-cased on load, so this assumes df_cities
# stores lower-case city names too - confirm against world_cities.txt.
df_users = df_users.merge(df_cities, on=['country', 'city'], how='left')
# A (country, city) pair can match several city rows; keep the first per user.
df_users = df_users.drop_duplicates(subset=['user_id'], keep='first')
# Namespace the user columns with a 'u_' prefix, like the other tables.
df_users.columns = ['u_' + str(col) for col in df_users.columns]

# 3. Features

## 3.1 Event attributes

In [16]:
# Summary statistics for the numeric event columns.
df_events.describe()

Unnamed: 0,e_venue_id,e_rsvp_limit,e_created,e_time,e_duration
count,6200.0,2211.0,6200.0,6183.0,2600.0
mean,67959.885484,47.700588,1386568000000.0,1395698000000.0,24115150.0
std,488.481235,42.778604,40818860000.0,39526520000.0,65761070.0
min,67082.0,1.0,1172849000000.0,1173946000000.0,2700000.0
25%,67563.0,20.0,1361802000000.0,1371807000000.0,7200000.0
50%,67951.0,40.0,1393498000000.0,1404758000000.0,10800000.0
75%,68345.0,60.0,1420825000000.0,1427378000000.0,16200000.0
max,68812.0,500.0,1440671000000.0,1472015000000.0,1148400000.0


In [17]:
# Derive a per-event identifier and time-based features.
# Sequential 1-based id; np.arange replaces the Python-2-only xrange comprehension.
df_events = df_events.assign(e_event_id=np.arange(1, len(df_events) + 1))
# Drop events without a start time. The explicit copy makes the column
# assignments below write to an independent frame instead of a filtered slice.
df_events = df_events[df_events.e_time.notnull()].copy()
# e_time is divided by 1000 and parsed with unit='s', i.e. it is treated as
# epoch milliseconds truncated to whole seconds.
df_events['e_datetime'] = pd.to_datetime((df_events['e_time'] / 1000).astype(int), unit='s')
df_events['e_hour'] = df_events.e_datetime.dt.hour
# pandas dayofweek: Monday=0 ... Sunday=6.
df_events['e_dow'] = df_events.e_datetime.dt.dayofweek
# Hours between event creation and event start (3600000 ms per hour).
df_events['e_reg_hours'] = (df_events.e_time - df_events.e_created) / 3600000

## 3.2 Group attributes

In [18]:
# Summary statistics for the numeric group columns.
df_groups.describe()

Unnamed: 0,g_created,g_lon,g_lat
count,711.0,711.0,711.0
mean,1394015000000.0,5.26917,51.965724
std,43034010000.0,0.889074,0.572717
min,1172849000000.0,3.13,50.93
25%,1372933000000.0,4.89,51.439999
50%,1407404000000.0,4.89,52.099998
75%,1426360000000.0,5.47,52.369999
max,1440612000000.0,7.2,53.23


## 3.3 User attributes

In [19]:
# Summary statistics for the numeric user columns.
df_users.describe()

Unnamed: 0,u_user_id,u_lat,u_lon
count,57773.0,56965.0,56965.0
mean,38193.992955,50.574371,3.68972
std,16678.304214,8.167055,21.997665
min,9307.0,-43.533333,-170.275
25%,23751.0,51.514125,4.5
50%,38194.0,52.093813,4.916667
75%,52637.0,52.35,5.389526
max,67081.0,64.2,174.783333


In [20]:
# Keep only users that received coordinates from the city join.
df_users = df_users.dropna(subset=['u_lat'])

## 3.4 Venue attributes

In [21]:
# Summary statistics for the numeric venue columns.
df_venues.describe()

Unnamed: 0,v_venue_id,v_lon,v_lat
count,1732.0,1731.0,1731.0
mean,67947.5,4.626416,50.06003
std,500.12965,8.932724,9.779613
min,67082.0,-122.655571,-33.86534
25%,67514.75,4.489095,51.447849
50%,67947.5,4.8964,52.092468
75%,68380.25,5.277729,52.36638
max,68813.0,151.20667,59.327885


## 3.5 Feature engineering

In [22]:
# (rows, columns) of the events frame after the time-feature step.
df_events.shape

(6183, 15)

In [25]:
# List the columns of the events frame.
# NOTE(review): the recorded output already contains v_*/g_* columns, which are
# only added by the merge cell that follows - this cell was executed out of
# order and should be moved after the merge.
df_events.columns

Index([u'e_status', u'e_venue_id', u'e_description', u'e_rsvps',
       u'e_rsvp_limit', u'e_created', u'e_time', u'e_duration', u'e_group_id',
       u'e_name', u'e_event_id', u'e_datetime', u'e_hour', u'e_dow',
       u'e_reg_hours', u'v_city', u'v_venue_id', u'v_name', u'v_country',
       u'v_lon', u'v_lat', u'g_city', u'g_description', u'g_created',
       u'g_topics', u'g_lon', u'g_link', u'g_lat', u'g_group_id', u'g_name'],
      dtype='object')

In [24]:
# Enrich every event with its venue and its group; left joins keep all events.
df_events = (
    df_events
    .merge(df_venues, how='left', left_on='e_venue_id', right_on='v_venue_id')
    .merge(df_groups, how='left', left_on='e_group_id', right_on='g_group_id')
)

In [26]:
# The right-hand join keys duplicate e_venue_id / e_group_id, so drop them.
df_events = df_events.drop(['v_venue_id', 'g_group_id'], axis=1)

In [28]:
# Expand the 'e_rsvps' column via the project helpers.
# NOTE(review): presumably flatten_list yields one row per RSVP and flatten_dict
# spreads each RSVP's keys (guests, response, user_id, when) into columns -
# confirm against feature_util.
df_events_flat = utl.flatten_dict(
    utl.flatten_list(df_events, 'e_rsvps', reset_index=True),
    'e_rsvps',
)
# Numeric encoding of the RSVP response, plus any guests the user brings.
df_events_flat['e_response_num'] = df_events_flat['response'].apply(engg.label_attendance)
df_events_flat['e_total_attend'] = df_events_flat['e_response_num'] + df_events_flat['guests']

In [38]:
# Number of missing values per column.
# NOTE(review): the recorded output includes u_* columns, which only exist after
# the user merge further down - this cell was executed out of order.
df_events_flat.isnull().sum(axis=0)

e_status               0
e_venue_id             0
e_description       2285
e_rsvp_limit       81068
e_created              0
e_time                 0
e_duration        106546
e_group_id             0
e_name                 0
e_event_id             0
e_datetime             0
e_hour                 0
e_dow                  0
e_reg_hours            0
v_city                 0
v_name                 0
v_country              0
v_lon                  0
v_lat                  0
g_city                 0
g_description       1499
g_created              0
g_topics               0
g_lon                  0
g_link                 0
g_lat                  0
g_name                 0
guests                 0
response               0
user_id                0
when                   0
e_response_num         0
e_total_attend         0
u_memberships          0
u_city                 0
u_hometown        118304
u_country              0
u_lat                  0
u_lon                  0
dtype: int64

In [36]:
# List the columns of the flattened frame.
# NOTE(review): the recorded output includes u_* columns (and u_user_id), so it
# was captured after the user merge that appears below this cell.
df_events_flat.columns

Index([      u'e_status',     u'e_venue_id',  u'e_description',
         u'e_rsvp_limit',      u'e_created',         u'e_time',
           u'e_duration',     u'e_group_id',         u'e_name',
           u'e_event_id',     u'e_datetime',         u'e_hour',
                u'e_dow',    u'e_reg_hours',         u'v_city',
               u'v_name',      u'v_country',          u'v_lon',
                u'v_lat',         u'g_city',  u'g_description',
            u'g_created',       u'g_topics',          u'g_lon',
               u'g_link',          u'g_lat',         u'g_name',
               u'guests',       u'response',        u'user_id',
                 u'when', u'e_response_num', u'e_total_attend',
        u'u_memberships',         u'u_city',      u'u_user_id',
           u'u_hometown',      u'u_country',          u'u_lat',
                u'u_lon'],
      dtype='object')

In [32]:
# Attach user attributes to every RSVP row, discard RSVPs whose user is not in
# df_users, then remove the redundant right-hand join key.
df_events_flat = (
    df_events_flat
    .merge(df_users, how='left', left_on='user_id', right_on='u_user_id')
    .dropna(subset=['u_user_id'])
    .drop('u_user_id', axis=1)
)

In [None]:
# NOTE(review): disabled exploratory summary. It refers to columns ('label',
# 'total_att', 'status') that are not present in df_events_flat as built above
# (the frame has 'e_total_attend' and 'e_status'); update the names or remove the cell.
# n_events_peruser = df_events_flat.groupby(['user_id']).agg({'label': 'sum'}).reset_index()
# print 'Total attendance: \n', df_events_flat.total_att.describe()
# print '\nResponse summary: \n', df_events_flat.response.value_counts()
# print '\nGuest count summary: \n', df_events_flat.guests.describe()
# print 'Number of events attended per user: \n', n_events_peruser.label.describe()
# print '\nStatus of events:\n', df_events_flat.status.value_counts()

In [43]:
def _venue_distance(frame, lat_col, lon_col):
    """Row-wise engg.lat_lon_similarity between the venue and another lat/lon pair.

    NOTE(review): the helper is named 'similarity' but its result is stored in
    '*_dist' columns - confirm which of the two it actually returns.
    """
    return frame.apply(
        lambda rec: engg.lat_lon_similarity(rec['v_lat'], rec['v_lon'],
                                            rec[lat_col], rec[lon_col]),
        axis=1,
    )


# Venue <-> user and venue <-> group location measures.
df_events_flat['uv_dist'] = _venue_distance(df_events_flat, 'u_lat', 'u_lon')
df_events_flat['vg_dist'] = _venue_distance(df_events_flat, 'g_lat', 'g_lon')

In [45]:
# List the columns again to confirm that uv_dist and vg_dist were added.
df_events_flat.columns

Index([      u'e_status',     u'e_venue_id',  u'e_description',
         u'e_rsvp_limit',      u'e_created',         u'e_time',
           u'e_duration',     u'e_group_id',         u'e_name',
           u'e_event_id',     u'e_datetime',         u'e_hour',
                u'e_dow',    u'e_reg_hours',         u'v_city',
               u'v_name',      u'v_country',          u'v_lon',
                u'v_lat',         u'g_city',  u'g_description',
            u'g_created',       u'g_topics',          u'g_lon',
               u'g_link',          u'g_lat',         u'g_name',
               u'guests',       u'response',        u'user_id',
                 u'when', u'e_response_num', u'e_total_attend',
        u'u_memberships',         u'u_city',     u'u_hometown',
            u'u_country',          u'u_lat',          u'u_lon',
              u'uv_dist',        u'vg_dist'],
      dtype='object')

In [46]:
# Age of the group at event time, in days (86400000 ms per day).
tenure_ms = df_events_flat['e_time'] - df_events_flat['g_created']
df_events_flat['group_tenure'] = tenure_ms / 86400000

In [48]:
# Distribution of the group tenure feature.
df_events_flat['group_tenure'].describe()

count    172312.000000
mean        638.095231
std         525.058968
min           0.090625
25%         217.157512
50%         522.269039
75%         926.105278
max        3169.688449
Name: group_tenure, dtype: float64

In [None]:
# Collapse the per-RSVP rows to one row per event and build the model matrix.
#
# The original cell grouped on 'event_id', 'event_hour', 'event_day_of_week',
# 'reg_time' and aggregated 'uv_sim' / 'total_att'. None of those columns exist
# in df_events_flat (they are named e_event_id, e_hour, e_dow, e_reg_hours,
# uv_dist, e_total_attend), so it raised a KeyError. The real columns are used
# here and then renamed to the legacy names that the modelling cells expect.
# NOTE(review): 'uv_dist' is assumed to be the successor of 'uv_sim' - confirm.
df_agg = (
    df_events_flat
    .groupby(['e_event_id', 'e_hour', 'e_dow', 'e_reg_hours'])
    .agg({'uv_dist': 'mean', 'e_total_attend': 'sum'})
    .reset_index()
    .rename(columns={
        'e_event_id': 'event_id',
        'e_hour': 'event_hour',
        'e_dow': 'event_day_of_week',
        'e_reg_hours': 'reg_time',
        'uv_dist': 'uv_sim',
        'e_total_attend': 'total_att',
    })
)
df_agg['event_hour_1'] = df_agg['event_hour'].apply(engg.event_hour_transform)
# One-hot encode day of week; day 0 (Monday in pandas) is the reference level.
for day in range(1, 7):
    df_agg['day_%d' % day] = np.where(df_agg['event_day_of_week'] == day, 1, 0)
df_agg = df_agg.dropna(axis=0, how='any')
# Drop the identifier and the raw columns that were replaced by encoded versions.
df_data = df_agg.drop(['event_id', 'event_hour', 'event_day_of_week'], axis=1)

In [None]:
# Split features and target, holding out 20% of the events for testing.
X = df_data.drop('total_att', axis=1)
Y = df_data.total_att
# train_test_split was called as a bare name but never imported (NameError);
# call it through the already-imported sklearn.model_selection module.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0)

In [None]:
# Pairwise Pearson correlation between the model features.
X.corr(method='pearson')

In [None]:
# Linear-regression baseline: fit on the training split, score R^2 on the test split.
lm = linear_model.LinearRegression().fit(X_train, y_train)
# Cross-validated alternatives, currently disabled:
# scores = cross_val_score(lm, X_train, y_train, cv=5)
# predicted = cross_val_predict(lm, X_test, y_test, cv=5)
predicted = lm.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=predicted)

In [None]:
# Actual target values next to the linear model's test-set predictions.
df_result = pd.DataFrame({'target': y_test, 'predict': predicted})

In [None]:
# Residuals (prediction - actual) against the prediction, for both splits.
X_train_pred = lm.predict(X_train)
X_test_pred = lm.predict(X_test)
for fitted, actual, colour, tag in [(X_train_pred, y_train, 'b', 'train'),
                                    (X_test_pred, y_test, 'g', 'test')]:
    plt.scatter(fitted, fitted - actual, c=colour, s=30, alpha=0.5, label=tag)
# Zero-residual reference line.
plt.hlines(y=0, xmin=0, xmax=80)
plt.legend(loc='lower right')
plt.title('Residual plot of training and testing process')
plt.show()

In [None]:
# Shallow regression tree as a non-linear comparison to the linear baseline.
# random_state is fixed (matching the train/test split seed) so the fitted tree
# is reproducible across runs.
clf = tree.DecisionTreeRegressor(max_depth=3, random_state=0)
clf = clf.fit(X_train, y_train)
predict = clf.predict(X_test)
# Single pre-formatted argument: valid on Python 2 and 3, and produces the same
# text as the original Python-2 print statement.
print("R2 is  %s" % metrics.r2_score(y_test, predict))

In [None]:
# Render the fitted tree with the project helper, labelling nodes by feature name.
engg.visualize_tree(clf, X.columns)

In [None]:
# Write the fitted tree in Graphviz DOT format.
# Fixes: 'features' was never defined (NameError) - the feature names come from
# X, the frame the tree was trained on; and the file handle is no longer rebound
# to export_graphviz's return value inside the with-block.
with open("attend_reg.txt", "w") as dot_file:
    tree.export_graphviz(clf, out_file=dot_file, feature_names=list(X.columns))