## Data from mongoDB 

Data obtained from the Alloe MongoDB on 10.18.17 via the Studio 3T GUI. In the future, many of these initial table joins might be more appropriately performed within the MongoDB environment rather than in Python/Jupyter.

In [6]:
import os
import re
import glob
import itertools
import pandas as pd

#Loads every exported collection (23 CSV tables at time of writing) into memory
path = 'C:/analyticsdev/Projects/EXPECTEDX/Alloe/data'
files = glob.glob(os.path.join(path, "*.csv"))


def collection_name(csv_path):
    """Return the dictionary key used for one exported CSV file.

    The key is the last run of letters that sits between two dots in the
    file name (presumably the collection name in a '<db>.<collection>.csv'
    export -- confirm against the actual file names). Only the base name is
    inspected, so dots in directory names cannot produce spurious matches.
    When the file name has no such dot-delimited segment, the file name
    without its extension is used instead.
    """
    file_name = os.path.basename(csv_path)
    segments = re.findall(r'(?<=\.)[a-zA-Z]+(?=\.)', file_name)
    if segments:
        return segments[-1]
    return os.path.splitext(file_name)[0]


#Stores the tables in a dictionary with the collection name as key.
#Key and frame are derived from the same file in one step, so a file name
#with zero or several regex matches can no longer shift the remaining keys
#onto the wrong tables.
raw_data = {collection_name(name): pd.read_csv(name) for name in files}
df_names = list(raw_data)

mongoDB contains 23 collections (i.e. tables) of Alloe user and usage data. Tables belong to one of three categories for this exercise: data, meta, uninformative.

* **Data** - Usable data for segmentation purposes.
* **Meta** - Data describing what is found in "Data" tables with more detail.
* **Uninformative** - Data regarding Alloe security, device connections, etc.

## Functions

In [87]:
def aggfunc(x):
    """Join the values of one pivot cell into a single space-separated string."""
    return ' '.join(str(v) for v in x)


def make_tables(table, clean_cols, piv_col, piv_val, suffix, piv_index='user', piv_func=aggfunc, counter=False):
    """Pivot one raw table into a wide table indexed by ``piv_index``.

    Parameters
    ----------
    table : key of the table in the module-level ``raw_data`` dictionary.
    clean_cols : columns whose string values are cut at the first
        space-followed-by-'@' (only the part before it is kept).
    piv_col : column whose distinct values become the output columns.
    piv_val : column that supplies the cell values.
    suffix : text appended to every output column name.
    piv_index : column used as the output index (default 'user').
    piv_func : aggregation passed to ``pivot_table``; the default joins all
        values of a cell into one space-separated string.
    counter : when True, a 'counter' column of ones is added first so that
        rows can be counted with ``piv_func='sum'``.

    Returns
    -------
    The pivoted DataFrame with ``suffix`` added to its column names.

    Notes
    -----
    ``raw_data[table]`` is modified in place: the cleaned columns and the
    optional 'counter' column stay on the raw table after the call. Cleaning
    an already cleaned column leaves it unchanged, so repeated calls are
    safe.
    """
    df = raw_data[table]
    #Add counter for summing CheckIns
    if counter:
        df['counter'] = 1
    #Remove "@..." from id columns; missing (non-string) ids are left as they
    #are instead of raising TypeError inside re.split
    for col in clean_cols:
        df[col] = df[col].apply(lambda x: re.split(' (?=@)', x)[0] if isinstance(x, str) else x)
    df = df.pivot_table(index=piv_index,
                        columns=piv_col,
                        values=piv_val,
                        aggfunc=piv_func)
    df = df.add_suffix(suffix)
    return df

In [131]:
def merge_tables(tables, usertable=None, user_key='_id'):
    """Left-join every table in ``tables`` onto the user table.

    Parameters
    ----------
    tables : iterable of DataFrames whose index holds the user id.
    usertable : DataFrame to start from. When omitted, the module-level
        ``user`` table is looked up at call time, so the function can be
        defined before ``user`` exists and always sees its latest version.
    user_key : column of ``usertable`` that is matched against the index of
        each table (default '_id'). Pass None to match on the index of
        ``usertable`` instead.
        NOTE(review): assumes the values in this column have the same form
        as the index of the joined tables -- confirm.

    Returns
    -------
    A DataFrame with the columns of every table added to the user columns.
    Users without a match keep NaN in the added columns. ``usertable``
    itself is not modified; it is returned as-is when ``tables`` is empty.
    """
    if usertable is None:
        usertable = user
    #right_index=True needs an explicit key on the left side, otherwise
    #pandas raises MergeError
    left_key = {'left_index': True} if user_key is None else {'left_on': user_key}
    df = usertable
    for table in tables:
        #merge returns a new frame; keep it so the joins accumulate
        df = df.merge(table,
                      how='left',
                      right_index=True,
                      sort=True,
                      **left_key)
    return df

## Preprocessing

In [79]:
#Build the pivoted 'Data' tables; each one is indexed by user

#Survey answers: one column per survey
answer = make_tables(
    'answer',
    clean_cols=['survey', 'user'],
    piv_col='survey',
    piv_val='answer',
    suffix='_surv',
)

#Challenge invitations: one column per challenge, cells hold the invited friends
chal_inv = make_tables(
    'challengeInvitation',
    clean_cols=['challenge', 'friend', 'user'],
    piv_col='challenge',
    piv_val='friend',
    suffix='_chalinv',
)

#Friend requests: one column per friend, cells hold the request status
frnd_rq = make_tables(
    'friendship',
    clean_cols=['friend', 'user'],
    piv_col='friend',
    piv_val='status',
    suffix='_frndid',
)

#Assistant events: one column per event, cells hold the event record ids
event_ast = make_tables(
    'assistantEvent',
    clean_cols=['event', 'user'],
    piv_col='event',
    piv_val='_id',
    suffix='_evntid',
)

#Posts: one column per item class, cells hold the number of posts
post = make_tables(
    'post',
    clean_cols=['user', 'item._class'],
    piv_col='item._class',
    piv_val='counter',
    suffix='_post',
    piv_func='sum',
    counter=True,
)



In [92]:
#Create user 'Data' table and append new 'age' and 'active_time' variables

from datetime import datetime, date
from dateutil import relativedelta

#Take an explicit copy: the columns added below then belong to an independent
#frame instead of a slice of raw_data['user'] (avoids SettingWithCopyWarning)
user = raw_data['user'][['_id',
                         'active',
                         'dob',
                         'gender',
                         'heightFeet',
                         'heightInches',
                         'joined',
                         'weight']].copy()

#Evaluate "today" once so every row is measured against the same date
today = date.today()

#Age in whole years as of today
user['dob'] = user['dob'].apply(lambda x: pd.to_datetime(x))
user['age'] = user['dob'].apply(lambda x: relativedelta.relativedelta(today, datetime.date(x)).years)

#Days elapsed between the join date and today
user['joined'] = user['joined'].apply(lambda x: pd.to_datetime(x))
user['active_time'] = user['joined'].apply(lambda x: (today - datetime.date(x)).days)

In [128]:
#Create additional tables of miles ran, activity time, and yoga positions held
#(per-user totals of the three numeric post columns)
act_measures = raw_data['post'][['user','item.miles','item.time','item.positions']]
act_measures = act_measures.groupby('user').sum()

#Count exercise types
#Explicit copy so the helper column is not written onto a slice of raw_data['post']
exercises = raw_data['post'][['user','item.exercise']].copy()
exercises['counter'] = 1
exercises = exercises.pivot_table(index='user',
                                  columns='item.exercise',
                                  values='counter',
                                  aggfunc='sum')

#Count challenges sent, unique friends challenged, and ratio between
#to_frame names the column directly; passing a set as `columns` to the
#DataFrame constructor is unordered and is rejected by newer pandas releases
challenges = chal_inv.count(axis=1).to_frame('chal_sent')
challenges = challenges.join(chal_inv.nunique(axis=1).to_frame('chal_unique'))
challenges['chal_uni_frnd'] = challenges['chal_sent'] / challenges['chal_unique']

#Count friend requests sent and request status
friends = frnd_rq.count(axis=1).to_frame('frnd_sent')
#Row-wise tally of status values (the top-level pd.value_counts is deprecated).
#reindex keeps all three status columns even when one status never occurs,
#where a plain column selection would raise KeyError
status_counts = frnd_rq.apply(lambda row: row.value_counts(), axis=1)
friends = friends.join(status_counts.reindex(columns=['ACCEPTED','PENDING','WAITING']))

## Segmentation

## Evaluation