In [418]:
import os
import re
import glob
import difflib
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date
from dateutil import relativedelta
from sklearn import cluster
%matplotlib inline

# LOAD DATA TO DICTIONARY

In [419]:
# Directory holding the CSV exports to load.
# NOTE(review): absolute, machine-specific path -- consider a configurable DATA_DIR.
path = 'C:/analyticsdev/Projects/EXPECTEDX/Alloe/data'
files = glob.glob(os.path.join(path, "*.csv"))


def frame_name(filename):
    """Return the dictionary key under which one CSV file is stored.

    The key is the alphabetic token enclosed by dots in the file's base name
    (e.g. ``db.answer.csv`` -> ``answer``). If several tokens qualify, the one
    nearest the extension is used; if none does, the base name without its
    extension is used.
    """
    base = os.path.basename(filename)
    tokens = re.findall(r'(?<=\.)[a-zA-Z]+(?=\.)', base)
    return tokens[-1] if tokens else os.path.splitext(base)[0]


# Derive each name from its own file so that a file producing zero or several
# regex matches cannot shift the names of all the frames that follow it.
df_names = [frame_name(name) for name in files]
raw_data = {key: pd.read_csv(name) for key, name in zip(df_names, files)}

# WRANGLING FUNCS

In [420]:
# `answer` is the same object as raw_data['answer'], so the clean-up below
# also changes the frame stored in raw_data.
answer = raw_data['answer']

# Keep only the text in front of the first " @" of each reference column.
for ref_col in ('survey', 'user'):
    answer[ref_col] = answer[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [421]:
# Reshape to one row per user and one column per survey. Several answers of a
# user to the same survey are concatenated into one space-separated string.
# Every resulting column name gets the suffix '_surv'.
answer = (
    answer
    .pivot_table(values='answer',
                 index='user',
                 columns='survey',
                 aggfunc=lambda answers: ' '.join(str(item) for item in answers))
    .add_suffix('_surv')
)

In [422]:
#user = raw_data['user'][['_id',
 #                        'active',
  #                       'dob',
   #                      'gender',
    #                     'heightFeet',
     #                    'heightInches',
      #                   'joined',
       #                  'weight']]

In [423]:
#main = user.merge(answer, 
 #                 how='left', 
  #                left_on='_id', 
   #               right_index=True, 
    #              sort=True)

In [424]:
# Columns of the 'user' table that form the base of the analysis frame.
user_cols = ['_id',
             'active',
             'dob',
             'gender',
             'heightFeet',
             'heightInches',
             'joined',
             'weight']

# Take an explicit copy: the following cells add and overwrite columns on
# `main`, and assigning into a bare column selection of raw_data['user']
# triggers pandas' SettingWithCopyWarning.
main = raw_data['user'][user_cols].copy()

In [425]:
# Parse the date columns value by value into pandas timestamps.
main['dob'] = main['dob'].apply(pd.to_datetime)
main['joined'] = main['joined'].apply(pd.to_datetime)

# Evaluate "today" once so every row is measured against the same reference
# date. NOTE(review): results still depend on the day the notebook is run.
today = date.today()

# age: whole years between date of birth and today.
# act_time: days between the join date and today.
# Rows without a date get NaN instead of being pushed through the date arithmetic.
main['age'] = main['dob'].apply(
    lambda dob: np.nan if pd.isnull(dob)
    else relativedelta.relativedelta(today, dob.date()).years)
main['act_time'] = main['joined'].apply(
    lambda joined: np.nan if pd.isnull(joined)
    else (today - joined.date()).days)

In [426]:
# `chal_inv` is the same object as raw_data['challengeInvitation'], so the
# clean-up below also changes the frame stored in raw_data.
chal_inv = raw_data['challengeInvitation']

# Keep only the text in front of the first " @" of each reference column.
for ref_col in ('challenge', 'friend', 'user'):
    chal_inv[ref_col] = chal_inv[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [427]:
# Reshape to one row per user and one column per challenge; each cell holds the
# invited friend(s), concatenated into one space-separated string when there
# are several. Every resulting column name gets the suffix '_chalinv'.
chal_inv = (
    chal_inv
    .pivot_table(values='friend',
                 index='user',
                 columns='challenge',
                 aggfunc=lambda friends: ' '.join(str(item) for item in friends))
    .add_suffix('_chalinv')
)

In [428]:
#main = main.merge(chal_inv,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

**TODO:** The relationship between the "challenger" table and the 'user' column of the "challengeInvitation" table is not yet understood.

In [429]:
# `frnd_rq` is the same object as raw_data['friendship'], so the clean-up
# below also changes the frame stored in raw_data.
frnd_rq = raw_data['friendship']

# Keep only the text in front of the first " @" of each reference column.
for ref_col in ('friend', 'user'):
    frnd_rq[ref_col] = frnd_rq[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [430]:
# Reshape to one row per user and one column per friend; each cell holds the
# friendship status. When the same user/friend pair occurs more than once, its
# statuses are concatenated into one space-separated string
# (e.g. two 'ACCEPTED' rows become 'ACCEPTED ACCEPTED').
# Every resulting column name gets the suffix '_friendid'.
frnd_rq = (
    frnd_rq
    .pivot_table(values='status',
                 index='user',
                 columns='friend',
                 aggfunc=lambda statuses: ' '.join(str(item) for item in statuses))
    .add_suffix('_friendid')
)

In [431]:
#main = main.merge(frnd_rq,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

In [432]:
# `event_ast` is the same object as raw_data['assistantEvent'], so the
# clean-up below also changes the frame stored in raw_data.
event_ast = raw_data['assistantEvent']

# Keep only the text in front of the first " @" of each reference column.
for ref_col in ('event', 'user'):
    event_ast[ref_col] = event_ast[ref_col].apply(lambda ref: re.split(' (?=@)', ref)[0])

In [433]:
# Reshape to one row per user and one column per event; each cell holds the
# '_id' value(s) of the matching rows, concatenated into one space-separated
# string when there are several.
# Every resulting column name gets the suffix '_eventid'.
event_ast = (
    event_ast
    .pivot_table(values='_id',
                 index='user',
                 columns='event',
                 aggfunc=lambda ids: ' '.join(str(item) for item in ids))
    .add_suffix('_eventid')
)

In [434]:
#main = main.merge(event_ast,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

In [435]:
# `post` is the very object stored in raw_data['post']. The edits are made in
# place on purpose of keeping that link: the metrics cell further down reads
# raw_data['post'] again and groups by this cleaned 'user' column.
post = raw_data['post']

# Keep only the text in front of the first " @" of the user reference.
post['user'] = post['user'].apply(lambda ref: re.split(' (?=@)', ref)[0])

# Remove everything from the first lowercase letter through the last dot that
# is followed by an uppercase letter, leaving the trailing capitalised name.
# The pattern is a raw string: '\.' inside a plain literal is an invalid
# escape sequence and raises a warning on current Python versions.
post['item._class'] = post['item._class'].apply(
    lambda cls: re.sub(r'[a-z].+\.(?=[A-Z])', '', str(cls)))

In [436]:
# A constant 1 per post, so that summing it counts posts.
post['counter'] = 1

# Number of posts per user (rows) and per item class (columns).
post = post.pivot_table(values='counter',
                        index='user',
                        columns='item._class',
                        aggfunc='sum')

In [437]:
# Attach the per-user post counts to `main`: left join of main['_id'] against
# the index of `post`, so users without posts are kept.
main = pd.merge(main, post,
                how='left',
                left_on='_id',
                right_index=True,
                sort=True)

In [438]:
# Per-user sums of the miles / time / positions fields of the posts.
post_metric_cols = ['user', 'item.miles', 'item.time', 'item.positions']
metrics_miles_time = raw_data['post'][post_metric_cols].groupby('user').sum()

# One row per post with a constant counter. .assign returns a new frame, so
# nothing is written into a column selection of raw_data['post'].
metrics_exercise = raw_data['post'][['user', 'item.exercise']].assign(counter=1)

# Per user: number of filled challenge columns, number of distinct values in
# them, and the ratio of the two.
# Series.to_frame(name) replaces pd.DataFrame(series, columns={name}): a set is
# unordered and is rejected as `columns` by current pandas.
metrics_chals_sent = chal_inv.count(axis=1).to_frame('chal_sent')
metrics_chals_unique = chal_inv.nunique(axis=1).to_frame('chal_unique')
metrics_chals_uratio = metrics_chals_sent.join(metrics_chals_unique)
metrics_chals_uratio['chal_uni_frnd'] = (
    metrics_chals_uratio['chal_sent'] / metrics_chals_uratio['chal_unique'])

# Per user: number of filled friend columns.
metrics_frnd_sent = frnd_rq.count(axis=1).to_frame('frnd_sent')

In [439]:
# Number of posts per user (rows) and per exercise type (columns), obtained by
# summing the constant counter.
metrics_exercise = metrics_exercise.pivot_table(
    values='counter',
    index='user',
    columns='item.exercise',
    aggfunc='sum',
)

In [440]:
# Per user, count how often each status string occurs in the user's row of
# `frnd_rq`, then keep the three plain statuses.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
status_counts = frnd_rq.apply(lambda row: row.value_counts(), axis=1)[
    ['ACCEPTED', 'PENDING', 'WAITING']]

# Attach every per-user metric table to `main` with the same left join of
# main['_id'] against the table's index.
for extra in (metrics_miles_time, metrics_exercise, metrics_chals_uratio, status_counts):
    main = main.merge(extra,
                      how='left',
                      left_on='_id',
                      right_index=True,
                      sort=True)

# NOTE(review): 353 is a hard-coded index label; confirm it still identifies
# the intended row if the input data changes.
main = main.drop(353)  # Record is all NaN
main = main.drop(['dob', 'joined', 'CompanyMessage'], axis=1)

In [441]:
#event = main.filter(regex='_eventid')
#friend = main.filter(regex='_friendid')

TODO: Give 'event' and 'friend' their own tables, score them, and add the scores to the 'main' data frame.

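NOTE: Extra columns such as "ACCEPTED ACCEPTED" occasionally contain a 1. It is not yet clear whether this comes from the raw data or from the 'apply' step. A likely cause is the friendship pivot: its aggfunc joins multiple statuses for the same user/friend pair into one space-separated string, which the value count then treats as a separate status.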

In [442]:
# Cut `main` by position into the first 8 columns and everything after them.
# Plain .iloc slicing replaces np.split, which on a DataFrame relies on the
# deprecated DataFrame.swapaxes. The right-hand part is copied so that it can
# be modified safely afterwards.
# NOTE(review): 8 is a positional magic number -- presumably the user
# attributes plus age/act_time; it breaks silently if columns are added or
# reordered upstream.
split = [main.iloc[:, :8], main.iloc[:, 8:].copy()]

# Boolean mask of the missing cells in the right-hand part.
nas = split[1].isnull()

In [443]:
# Put 0 into every cell of the right-hand part that the mask marks as missing.
split[1] = split[1].mask(nas, 0)

In [444]:
# Glue the two column groups back together on their shared index.
left_part, right_part = split[0], split[1]
main = left_part.join(right_part)

In [445]:
# Fill the remaining gaps in numeric columns with the column mean, then drop
# every row that still contains a missing value.
# Do not pass inplace=True here: it makes fillna return None, which breaks the
# chained .dropna(). numeric_only=True keeps the string columns out of mean().
main = main.fillna(main.mean(numeric_only=True)).dropna()

In [451]:
# One indicator column per distinct value of 'gender', appended to `main`.
gender_flags = pd.get_dummies(main['gender'])
main = main.join(gender_flags)

In [453]:
# Clustering input: every column from 'heightFeet' through the last one.
# .loc performs the label-based column slice; the .ix indexer used previously
# has been removed from pandas.
clust_set = main.loc[:, 'heightFeet':]

In [454]:
# Show a short preview instead of rendering the whole frame.
clust_set.head(10)

Unnamed: 0,heightFeet,heightInches,weight,age,act_time,BloodPressureCheckIn,Challenger,CholesterolCheckIn,EventCheckIn,FitbitCheckIn,...,SWIMMING,WEIGHTS,chal_sent,chal_unique,chal_uni_frnd,ACCEPTED,PENDING,WAITING,FEMALE,MALE
185,5,3,143.75,35,976,3,11,2,2,0,...,0,0,1,1,1,1,3,0,1,0
134,5,11,170,45,976,0,15,2,1,0,...,0,0,2,1,2,18,7,0,0,1
198,5,11,191,33,976,10,12,1,0,0,...,0,1,3,2,1.5,6,0,1,0,1
43,5,10,180,42,976,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,1
315,5,11,185.8,27,976,1,13,0,5,77,...,0,0,3,2,1.5,47,1,0,0,1
15,5,10,185,31,975,1,1,0,0,0,...,0,0,0,0,0,0,26,0,0,1
446,5,2,135,41,975,1,26,0,4,21,...,66,3,3,3,1,222,0,81,1,0
21,5,8,118,30,975,0,15,0,58,7,...,0,0,2,1,2,29,2,1,1,0
20,6,0,200,27,975,0,3,0,0,0,...,0,0,1,1,1,9,19,0,0,1
1,5,7,110,24,975,0,1,0,0,0,...,0,0,1,1,1,12,11,0,1,0
