In [178]:
import os
import re
import glob
import difflib
import pandas as pd
import numpy as np
import itertools
from datetime import datetime, date
from dateutil import relativedelta

# LOAD DATA TO DICTIONARY

In [179]:
# NOTE(review): absolute, machine-specific path; consider moving it to a
# config cell so the notebook can run elsewhere.
path = 'C:/analyticsdev/Projects/EXPECTEDX/Alloe/data'
# Sorted so the load order (and which file wins on a duplicate name) is
# deterministic across runs.
files = sorted(glob.glob(os.path.join(path, "*.csv")))


def table_name(file_path):
    """Return the dictionary key for one CSV file.

    The key is the first run of letters enclosed by dots in the file name
    (e.g. ``prefix.answer.csv`` -> ``answer``). If the file name has no such
    token, the file name without its extension is used instead.
    """
    base = os.path.basename(file_path)
    match = re.search(r'(?<=\.)[a-zA-Z]+(?=\.)', base)
    return match.group(0) if match else os.path.splitext(base)[0]


# Name and frame are derived from the same file in the same step, so a file
# with zero or several regex matches can no longer shift the remaining names
# onto the wrong frames.
raw_data = {table_name(name): pd.read_csv(name) for name in files}

# WRANGLING FUNCS

In [180]:
# `answer` is the same object as raw_data['answer'], so the trimmed columns
# below are written back onto the raw table as well.
answer = raw_data['answer']

# Split each value at a space that is followed by '@' and keep only the
# leading piece.
for ref_col in ('survey', 'user'):
    answer[ref_col] = answer[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [181]:
# One row per user, one column per survey. Each cell holds the string form of
# every answer for that (user, survey) pair, joined with single spaces.
# Column labels get a '_surv' suffix.
answer = (
    answer
    .pivot_table(index='user',
                 columns='survey',
                 values='answer',
                 aggfunc=lambda answers: ' '.join(map(str, answers)))
    .add_suffix('_surv')
)

In [212]:
# Columns of the raw user table that are carried into the analysis frame.
user_columns = ['_class', '_id', 'active', 'dob', 'gender',
                'heightFeet', 'heightInches', 'joined', 'weight']
user = raw_data['user'][user_columns]

In [213]:
# Left join: keep every user row and attach the pivoted survey answers by
# matching user['_id'] against the index of `answer`.
main = pd.merge(user, answer,
                how='left', left_on='_id', right_index=True, sort=True)

In [327]:
# Fix the reference date once so every row in this run is measured against
# the same day (the results still depend on the day the notebook is run).
today = date.today()

# Parse both columns to datetimes in one vectorized call each.
main['dob'] = pd.to_datetime(main['dob'])
main['joined'] = pd.to_datetime(main['joined'])

# Age in whole years as of `today`; rows with a missing dob get NaN instead of
# raising inside the lambda.
main['age'] = main['dob'].apply(
    lambda born: relativedelta.relativedelta(today, born.date()).years
    if pd.notnull(born) else np.nan
)
# Whole days between the join date and `today`; NaN when `joined` is missing.
main['act_time'] = main['joined'].apply(
    lambda joined_at: (today - joined_at.date()).days
    if pd.notnull(joined_at) else np.nan
)

In [184]:
# `chal_inv` is the same object as raw_data['challengeInvitation'], so the
# trimmed columns below are written back onto the raw table as well.
chal_inv = raw_data['challengeInvitation']

# Split each value at a space that is followed by '@' and keep only the
# leading piece.
for ref_col in ('challenge', 'friend', 'user'):
    chal_inv[ref_col] = chal_inv[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [185]:
# One row per user, one column per challenge. Each cell holds the string form
# of every `friend` value for that (user, challenge) pair, joined with single
# spaces. Column labels get a '_chalinv' suffix.
chal_inv = (
    chal_inv
    .pivot_table(index='user',
                 columns='challenge',
                 values='friend',
                 aggfunc=lambda friends: ' '.join(map(str, friends)))
    .add_suffix('_chalinv')
)

In [186]:
# Left join the pivoted challenge invitations onto `main`, matching
# main['_id'] against the index of `chal_inv`.
main = pd.merge(main, chal_inv,
                how='left', left_on='_id', right_index=True, sort=True)

TODO: The relationship between the "challenger" table and the 'user' column of the "challengeInvitation" table is not yet understood.

In [187]:
# `frnd_rq` is the same object as raw_data['friendship'], so the trimmed
# columns below are written back onto the raw table as well.
frnd_rq = raw_data['friendship']

# Split each value at a space that is followed by '@' and keep only the
# leading piece.
for ref_col in ('friend', 'user'):
    frnd_rq[ref_col] = frnd_rq[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [188]:
# One row per user, one column per friend. Each cell holds the string form of
# every `status` value for that (user, friend) pair, joined with single
# spaces, so a pair with two rows yields e.g. "ACCEPTED ACCEPTED".
# Column labels get a '_friendid' suffix.
frnd_rq = (
    frnd_rq
    .pivot_table(index='user',
                 columns='friend',
                 values='status',
                 aggfunc=lambda statuses: ' '.join(map(str, statuses)))
    .add_suffix('_friendid')
)

In [189]:
#main = main.merge(frnd_rq,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

In [190]:
# `event_ast` is the same object as raw_data['assistantEvent'], so the trimmed
# columns below are written back onto the raw table as well.
event_ast = raw_data['assistantEvent']

# Split each value at a space that is followed by '@' and keep only the
# leading piece.
for ref_col in ('event', 'user'):
    event_ast[ref_col] = event_ast[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [191]:
# One row per user, one column per event. Each cell holds the string form of
# every `_id` value for that (user, event) pair, joined with single spaces.
# Column labels get an '_eventid' suffix.
event_ast = (
    event_ast
    .pivot_table(index='user',
                 columns='event',
                 values='_id',
                 aggfunc=lambda ids: ' '.join(map(str, ids)))
    .add_suffix('_eventid')
)

In [192]:
#main = main.merge(event_ast,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

In [193]:
# `post` is the same object as raw_data['post'], so the trimmed 'user' column
# is written back onto the raw table too. The metrics cell further down reads
# raw_data['post'] directly and joins on that trimmed 'user' value.
post = raw_data['post']

# Split each value at a space that is followed by '@' and keep only the
# leading piece.
trim_ref = lambda ref: re.split(' (?=@)', ref)[0]
post['user'] = post['user'].apply(trim_ref)

In [194]:
# Constant 1 per row so that summing it counts rows.
# NOTE(review): if `post` is still the frame taken from raw_data['post'], this
# column is also added to the raw table.
post['counter'] = 1

# One row per user, one column per 'item._class' value; cells are the number
# of posts of that class. Column labels get a '_posttype' suffix.
post = pd.pivot_table(post,
                      index='user',
                      columns='item._class',
                      values='counter',
                      aggfunc='sum').add_suffix('_posttype')

In [195]:
# Left join the per-user post-type counts onto `main`, matching main['_id']
# against the index of `post`.
main = pd.merge(main, post,
                how='left', left_on='_id', right_index=True, sort=True)

In [196]:
# Sum the selected post columns per user.
# NOTE(review): this reads raw_data['post'] directly, so it relies on the
# 'user' column having already been trimmed in place by the earlier `post`
# cell. Confirm the run order.
metrics_miles_time = raw_data['post'][['user','item.miles','item.time','item.positions']]
metrics_miles_time = metrics_miles_time.groupby(metrics_miles_time.user).sum()

# .assign returns a new frame, so the counter column is not written onto a
# slice of raw_data['post'] (the original assignment triggered pandas'
# SettingWithCopyWarning).
metrics_exercise = raw_data['post'][['user','item.exercise']].assign(counter=1)

# Per user: number of non-null challenge cells, number of distinct cell
# values, and their ratio. Series.to_frame names the column directly; the
# original passed a set as `columns`, which newer pandas versions reject.
metrics_chals_sent = chal_inv.count(axis=1).to_frame('chal_sent')
metrics_chals_unique = chal_inv.nunique(axis=1).to_frame('chal_unique')
metrics_chals_uratio = metrics_chals_sent.join(metrics_chals_unique)
metrics_chals_uratio['chal_uni_frnd'] = metrics_chals_uratio['chal_sent']/metrics_chals_uratio['chal_unique']

# Per user: number of non-null friendship cells.
metrics_frnd_sent = frnd_rq.count(axis=1).to_frame('frnd_sent')

In [197]:
# One row per user, one column per 'item.exercise' value; cells are the sum of
# the constant `counter` column, i.e. the number of rows for that pair.
metrics_exercise = pd.pivot_table(metrics_exercise,
                                  index='user',
                                  columns='item.exercise',
                                  values='counter',
                                  aggfunc='sum')

In [198]:
# Left join each per-user metrics table onto `main` in turn, matching
# main['_id'] against the table's index every time.
main = (
    main
    .merge(metrics_miles_time, how='left', left_on='_id', right_index=True, sort=True)
    .merge(metrics_exercise, how='left', left_on='_id', right_index=True, sort=True)
    .merge(metrics_chals_uratio, how='left', left_on='_id', right_index=True, sort=True)
)

# Count, per user row of the friendship pivot, how many cells hold each status
# string. Series.value_counts replaces the deprecated top-level
# pd.value_counts. reindex keeps exactly these three status columns and fills
# a status that never occurs with NaN, where plain [[...]] selection would
# raise KeyError.
status_counts = (
    frnd_rq
    .apply(lambda row: row.value_counts(), axis=1)
    .reindex(columns=['ACCEPTED', 'PENDING', 'WAITING'])
)

# Left join the status counts onto `main`, matching main['_id'] against the
# index of the counts table.
main = main.merge(status_counts,
                  how='left',
                  left_on='_id',
                  right_index=True,
                  sort=True)

In [199]:
# Select the columns of `main` whose label contains the given suffix text.
# NOTE(review): the cells that merge `event_ast` and `frnd_rq` into `main` are
# commented out in this notebook, so these selections are presumably empty
# unless those merges are re-enabled. Confirm.
event = main.filter(like='_eventid')
friend = main.filter(like='_friendid')

In [328]:
# Preview only the first rows instead of dumping the whole frame.
# NOTE(review): the rendered output contains per-user profile fields; review
# it before sharing the notebook.
main.head(10)

Unnamed: 0,_class,_id,active,dob,gender,heightFeet,heightInches,joined,weight,55e098be0cf2f093a4371455_surv,568db6ef0cf2f21416e38ffe_surv,569499080cf2f21416e38fff_surv,57042f530cf2be1448154575_surv,57326d630cf268b0d7dc7db9_surv,582b9b9a0cf23ffbe2bbd190_surv,582b9da00cf23ffbe2bbd191_surv,age,act_time
185,com.litekey.alloe.model.User,54f530089932b01dbcffc89b,True,1982-03-02,FEMALE,5.0,3.0,2015-03-18 12:00:00.000,143.75,,,,,,,,35,970
134,com.litekey.alloe.model.User,54f530c89932b01dbcffc89c,True,1972-03-02,MALE,5.0,11.0,2015-03-18 12:00:00.000,170.00,,,,,,,,45,970
110,com.litekey.alloe.model.User,54f5ef739932b01dbcffc8a2,True,2015-03-22,MALE,6.0,,2015-03-18 12:00:00.000,194.90,,,,,,,,2,970
198,com.litekey.alloe.model.User,54f757f4993241fc498ec5d7,True,1984-08-25,MALE,5.0,11.0,2015-03-18 12:00:00.000,191.00,,,,,,,,33,970
228,com.litekey.alloe.model.User,54f8f6f59932ce1b71e7317a,True,1983-09-17,,,,2015-03-18 12:00:00.000,,,,,,,,,34,970
43,com.litekey.alloe.model.User,54fe33869932ce1b71e73193,True,1975-03-09,MALE,5.0,10.0,2015-03-18 12:00:00.000,180.00,,,,,,,,42,970
53,com.litekey.alloe.model.User,54fe49989932ce1b71e73194,True,2011-03-09,MALE,,,2015-03-18 12:00:00.000,,,,,,,,,6,970
0,com.litekey.alloe.model.User,5509a55b993212d9d6de6e35,True,1986-02-27,,,,2015-03-18 16:18:35.099,,,,,,,,,31,970
315,com.litekey.alloe.model.User,550a054e993212d9d6de6e3e,True,1990-04-26,MALE,5.0,11.0,2015-03-18 23:07:58.017,185.80,Fitbit,,,,,,,27,970
15,com.litekey.alloe.model.User,550a1875993212d9d6de6e4e,True,1986-05-15,MALE,5.0,10.0,2015-03-19 00:29:41.654,185.00,,,,,,,,31,969


TODO: Give 'event' and 'friend' their own tables, score them, and add the scores to the 'main' data frame.

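Extra columns such as "ACCEPTED ACCEPTED" occasionally contain a 1. Likely cause: the friendship pivot joins every status for a (user, friend) pair with a space, so a pair that appears more than once produces a value like "ACCEPTED ACCEPTED", which the row-wise value counts then treat as its own status. TODO: confirm whether the repeated (user, friend) rows in the raw friendship data are expected.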

970