In [2]:
import os
import re
import glob
import difflib
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date
from dateutil import relativedelta
from sklearn import preprocessing, metrics
from sklearn.cluster import AgglomerativeClustering, KMeans, MeanShift, FeatureAgglomeration
%matplotlib inline

# LOAD DATA TO DICTIONARY

In [3]:
path = 'C:/analyticsdev/Projects/EXPECTEDX/Alloe/data'
files = glob.glob(os.path.join(path, "*.csv"))

raw_data = [pd.read_csv(name) for name in files]
df_names = [re.findall('(?<=\.)[a-zA-Z]+(?=\.)', names) for names in files]
df_names = itertools.chain(*df_names)
raw_data = dict(zip(df_names, raw_data))

# WRANGLING FUNCS

In [4]:
answer = raw_data['answer']
answer['survey'] = answer['survey'].apply(lambda x: re.split(' (?=@)',x)[0])
answer['user'] = answer['user'].apply(lambda x: re.split(' (?=@)',x)[0])

In [5]:
answer = answer.pivot_table(index='user', 
                            columns='survey', 
                            values='answer', 
                            aggfunc=lambda x: ' '.join(str(v) for v in x))
answer = answer.add_suffix('_surv')

In [6]:
#user = raw_data['user'][['_id',
 #                        'active',
  #                       'dob',
   #                      'gender',
    #                     'heightFeet',
     #                    'heightInches',
      #                   'joined',
       #                  'weight']]

In [7]:
#main = user.merge(answer, 
 #                 how='left', 
  #                left_on='_id', 
   #               right_index=True, 
    #              sort=True)

In [4]:
main = raw_data['user'][['_id',
                         'active',
                         'dob',
                         'gender',
                         'heightFeet',
                         'heightInches',
                         'joined',
                         'weight']]

In [9]:
main['dob'] = main['dob'].apply(lambda x: pd.to_datetime(x))
main['joined'] = main['joined'].apply(lambda x: pd.to_datetime(x))

main['age'] = main['dob'].apply(lambda x: relativedelta.relativedelta(date.today(), datetime.date(x)).years)
main['act_time'] = main['joined'].apply(lambda x: (date.today() - datetime.date(x)).days)

In [10]:
chal_inv = raw_data['challengeInvitation']
chal_inv['challenge'] = chal_inv['challenge'].apply(lambda x: re.split(' (?=@)',x)[0])
chal_inv['friend'] = chal_inv['friend'].apply(lambda x: re.split(' (?=@)',x)[0])
chal_inv['user'] = chal_inv['user'].apply(lambda x: re.split(' (?=@)',x)[0])

In [11]:
chal_inv = chal_inv.pivot_table(index='user',
                                columns='challenge', 
                                values='friend',
                                aggfunc=lambda x: ' '.join(str(v) for v in x))
chal_inv = chal_inv.add_suffix('_chalinv')

In [12]:
#main = main.merge(chal_inv,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

Don't understand relationships between "challenger" table and "challengeInvitation" table 'user' column

In [13]:
frnd_rq = raw_data['friendship']
frnd_rq['friend'] = frnd_rq['friend'].apply(lambda x: re.split(' (?=@)',x)[0])
frnd_rq['user'] = frnd_rq['user'].apply(lambda x: re.split(' (?=@)',x)[0])

In [14]:
frnd_rq = frnd_rq.pivot_table(index='user',
                              columns='friend',
                              values='status',
                              aggfunc=lambda x: ' '.join(str(v) for v in x))
frnd_rq = frnd_rq.add_suffix('_friendid')

In [15]:
#main = main.merge(frnd_rq,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

In [16]:
event_ast = raw_data['assistantEvent']
event_ast['event'] = event_ast['event'].apply(lambda x: re.split(' (?=@)',x)[0])
event_ast['user'] = event_ast['user'].apply(lambda x: re.split(' (?=@)',x)[0])

In [17]:
event_ast = event_ast.pivot_table(index='user',
                                  columns='event',
                                  values='_id',
                                  aggfunc=lambda x: ' '.join(str(v) for v in x))
event_ast = event_ast.add_suffix('_eventid')

In [18]:
#main = main.merge(event_ast,
 #                 how='left',
  #                left_on='_id',
   #               right_index=True,
    #              sort=True)

In [19]:
post = raw_data['post']
post['user'] = post['user'].apply(lambda x: re.split(' (?=@)',x)[0])
post['item._class'] = post['item._class'].apply(lambda x: re.sub('[a-z].+\.(?=[A-Z])', '', str(x)))

In [20]:
post['counter'] = int(1)
post = post.pivot_table(index='user',
                        columns='item._class',
                        values='counter',
                        aggfunc='sum')

In [21]:
main = main.merge(post,
                  how='left',
                  left_on='_id',
                  right_index=True,
                  sort=True)

In [22]:
metrics_miles_time = raw_data['post'][['user','item.miles','item.time','item.positions']]
metrics_miles_time = metrics_miles_time.groupby(metrics_miles_time.user).sum()

metrics_exercise = raw_data['post'][['user','item.exercise']]
metrics_exercise['counter'] = int(1)

metrics_chals_sent = pd.DataFrame(chal_inv.count(axis=1), columns={'chal_sent'})
metrics_chals_unique = pd.DataFrame(chal_inv.nunique(axis=1), columns={'chal_unique'})
metrics_chals_uratio = metrics_chals_sent.join(metrics_chals_unique)
metrics_chals_uratio['chal_uni_frnd'] = metrics_chals_uratio['chal_sent']/metrics_chals_uratio['chal_unique']

metrics_frnd_sent = pd.DataFrame(frnd_rq.count(axis=1), columns={'frnd_sent'})

In [23]:
metrics_exercise = metrics_exercise.pivot_table(index='user',
                                                columns='item.exercise',
                                                values='counter',
                                                aggfunc='sum')

In [24]:
main = main.merge(metrics_miles_time,
                  how='left',
                  left_on='_id',
                  right_index=True,
                  sort=True)

main = main.merge(metrics_exercise,
                  how='left',
                  left_on='_id',
                  right_index=True,
                  sort=True)

main = main.merge(metrics_chals_uratio,
                  how='left',
                  left_on='_id',
                  right_index=True,
                  sort=True)

main = main.merge(frnd_rq.apply(pd.value_counts, axis=1)[['ACCEPTED','PENDING','WAITING']],
                  how='left',
                  left_on='_id',
                  right_index=True,
                  sort=True)

main.drop(353, inplace=True) #Record is all NaN
main.drop(['dob','joined','CompanyMessage'], axis=1, inplace=True)

In [25]:
#event = main.filter(regex='_eventid')
#friend = main.filter(regex='_friendid')

GIVE 'EVENT' AND 'FRIEND' THEIR OWN TABLES AND SCORE THEM ADDED TO 'MAIN' DATA FRAME

EXTRA COLUMNS LIKE "ACCEPTED ACCEPTED" HAVE A 1 OCCASIONALLY. NOT SURE IF ISSUE WITH RAW DATA OR 'APPLY' FUNCTION

In [26]:
split = np.split(main, [8], axis=1)
nas = split[1].isnull()
split[1][nas] = 0

In [27]:
main = split[0].join(split[1])
main = main.join(pd.get_dummies(main['gender']))

In [28]:
clust_set = main.loc[:,'heightFeet':]
clust_set.fillna(clust_set.median(), inplace=True)

Unnamed: 0,heightFeet,heightInches,weight,age,act_time,BloodPressureCheckIn,Challenger,CholesterolCheckIn,EventCheckIn,FitbitCheckIn,...,SWIMMING,WEIGHTS,chal_sent,chal_unique,chal_uni_frnd,ACCEPTED,PENDING,WAITING,FEMALE,MALE
185,5.0,3.0,143.75,35,978,3.0,11.0,2.0,2.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,3.0,0.0,1,0
134,5.0,11.0,170.00,45,978,0.0,15.0,2.0,1.0,0.0,...,0.0,0.0,2.0,1.0,2.0,18.0,7.0,0.0,0,1
110,6.0,6.0,194.90,2,978,0.0,8.0,1.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0,1
198,5.0,11.0,191.00,33,978,10.0,12.0,1.0,0.0,0.0,...,0.0,1.0,3.0,2.0,1.5,6.0,0.0,1.0,0,1
228,5.0,6.0,160.00,34,978,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
43,5.0,10.0,180.00,42,978,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0,1
53,5.0,6.0,160.00,6,978,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,1
0,5.0,6.0,160.00,31,978,0.0,5.0,0.0,1.0,0.0,...,0.0,0.0,4.0,1.0,4.0,30.0,10.0,3.0,0,0
315,5.0,11.0,185.80,27,978,1.0,13.0,0.0,5.0,77.0,...,0.0,0.0,3.0,2.0,1.5,47.0,1.0,0.0,0,1
15,5.0,10.0,185.00,31,977,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0,1


In [41]:
clust_set.describe()
list(clust_set)

['heightFeet',
 'heightInches',
 'weight',
 'age',
 'act_time',
 'BloodPressureCheckIn',
 'Challenger',
 'CholesterolCheckIn',
 'EventCheckIn',
 'FitbitCheckIn',
 'FoodCheckIn',
 'GlucoseCheckIn',
 'GymCheckIn',
 'MovesChallenge',
 'MovesCheckIn',
 'PictureChallenge',
 'RunningCheckIn',
 'WeightCheckIn',
 'YogaCheckIn',
 'item.miles',
 'item.time',
 'item.positions',
 'CARDIO',
 'FITNESS',
 'OTHER',
 'SWIMMING',
 'WEIGHTS',
 'chal_sent',
 'chal_unique',
 'chal_uni_frnd',
 'ACCEPTED',
 'PENDING',
 'WAITING',
 'FEMALE',
 'MALE',
 'AC_label']

In [30]:
ss = preprocessing.StandardScaler(with_mean=False)
mms = preprocessing.MinMaxScaler()

In [31]:
processed = pd.DataFrame(ss.fit_transform(clust_set), columns=clust_set.columns, index=clust_set.index)
mms_processed = pd.DataFrame(mms.fit_transform(clust_set), columns=clust_set.columns, index=clust_set.index)

In [42]:
processed.describe()
mms_processed.describe()

Unnamed: 0,heightFeet,heightInches,weight,age,act_time,BloodPressureCheckIn,Challenger,CholesterolCheckIn,EventCheckIn,FitbitCheckIn,...,SWIMMING,WEIGHTS,chal_sent,chal_unique,chal_uni_frnd,ACCEPTED,PENDING,WAITING,FEMALE,MALE
count,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,...,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0
mean,0.04397,0.088858,0.025789,0.448426,0.59867,0.011356,0.058083,0.005085,0.027986,0.010302,...,0.004242,0.019184,0.055932,0.068362,0.05565,0.033975,0.093785,0.011723,0.4,0.555932
std,0.043246,0.061735,0.040536,0.137027,0.313436,0.051693,0.100873,0.059652,0.097229,0.062288,...,0.047437,0.08971,0.148988,0.173833,0.143493,0.084512,0.14337,0.068161,0.490314,0.497283
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.038462,0.058824,0.021488,0.4,0.319121,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.038462,0.088235,0.023604,0.442857,0.60911,0.0,0.038462,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004505,0.027778,0.0,0.0,1.0
75%,0.038462,0.117647,0.026046,0.5,0.939619,0.0,0.076923,0.0,0.005814,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027027,0.166667,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
processed

Unnamed: 0,heightFeet,heightInches,weight,age,act_time,BloodPressureCheckIn,Challenger,CholesterolCheckIn,EventCheckIn,FitbitCheckIn,...,SWIMMING,WEIGHTS,chal_sent,chal_unique,chal_uni_frnd,ACCEPTED,PENDING,WAITING,FEMALE,MALE
185,2.225310,0.715237,0.577769,3.652020,3.308162,5.808401,4.197726,11.185462,0.119695,0.000000,...,0.000000,0.000000,1.343531,1.919179,1.743721,0.053345,0.581741,0.000000,2.041241,0.000000
134,2.225310,2.622535,0.683275,4.695454,3.308162,0.000000,5.724171,11.185462,0.059847,0.000000,...,0.000000,0.000000,2.687063,1.919179,3.487441,0.960216,1.357395,0.000000,0.000000,2.012632
110,2.670372,1.430474,0.783355,0.208687,3.308162,0.000000,3.052891,5.592731,0.000000,0.626033,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.426763,0.000000,0.050985,0.000000,2.012632
198,2.225310,2.622535,0.767680,3.443333,3.308162,19.361337,4.579337,5.592731,0.000000,0.000000,...,0.000000,0.027752,4.030594,3.838358,2.615581,0.320072,0.000000,0.050985,0.000000,2.012632
228,2.225310,1.430474,0.643082,3.547676,3.308162,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.053345,0.000000,0.000000,0.000000,0.000000
43,2.225310,2.384123,0.723468,4.382424,3.308162,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.160036,0.000000,0.000000,0.000000,2.012632
53,2.225310,1.430474,0.643082,0.626061,3.308162,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.053345,0.193914,0.000000,0.000000,2.012632
0,2.225310,1.430474,0.643082,3.234646,3.308162,0.000000,1.908057,0.000000,0.059847,0.000000,...,0.000000,0.000000,5.374126,1.919179,6.974882,1.600360,1.939135,0.152954,0.000000,0.000000
315,2.225310,2.622535,0.746779,2.817272,3.308162,1.936134,4.960948,0.000000,0.299237,16.068182,...,0.000000,0.000000,4.030594,3.838358,2.615581,2.507231,0.193914,0.000000,0.000000,2.012632
15,2.225310,2.384123,0.743564,3.234646,3.304779,1.936134,0.381611,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.041751,0.000000,0.000000,2.012632


In [34]:
from sklearn.decomposition import PCA, SparsePCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [35]:
fa = FeatureAgglomeration(n_clusters=5, affinity='cosine', linkage='complete')
feats = fa.fit_transform(processed)
test = pd.DataFrame(feats, index=processed.index)
agg_model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
model = agg_model.fit(test).labels_

In [36]:
clust_set['AC_label'] = model

In [37]:
clust_set.groupby('AC_label').describe().T

Unnamed: 0,AC_label,0,1,2
ACCEPTED,count,576.000000,7.000000,7.000000
ACCEPTED,mean,6.725694,76.857143,5.428571
ACCEPTED,std,15.774314,70.664533,6.373307
ACCEPTED,min,0.000000,20.000000,0.000000
ACCEPTED,25%,0.000000,36.500000,0.500000
ACCEPTED,50%,1.000000,47.000000,5.000000
ACCEPTED,75%,6.000000,88.000000,7.000000
ACCEPTED,max,152.000000,222.000000,18.000000
BloodPressureCheckIn,count,576.000000,7.000000,7.000000
BloodPressureCheckIn,mean,0.088542,0.428571,1.857143


In [38]:
lda = LinearDiscriminantAnalysis()
clust_lda = lda.fit_transform(clust_set, clust_set['AC_label'])

In [39]:
pd.DataFrame(lda.coef_, columns=clust_set.columns).describe()

Unnamed: 0,heightFeet,heightInches,weight,age,act_time,BloodPressureCheckIn,Challenger,CholesterolCheckIn,EventCheckIn,FitbitCheckIn,...,WEIGHTS,chal_sent,chal_unique,chal_uni_frnd,ACCEPTED,PENDING,WAITING,FEMALE,MALE,AC_label
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,-0.096924,0.327792,0.000235,-0.063629,-0.000534,-18.017789,-0.419422,26.416113,0.092953,0.947112,...,-0.253809,-6.042737,7.046046,1.84373,-0.010611,0.108462,0.023851,2.697567,4.16439,0.0
std,0.178716,0.478974,0.000801,0.129969,0.002677,27.996363,1.380976,58.054271,0.083713,1.94238,...,0.228159,6.779654,11.834308,1.655664,0.428099,0.231463,0.160341,5.288631,6.475892,0.0
min,-0.303264,-0.012098,-0.000416,-0.213353,-0.003444,-50.206755,-1.965496,-12.875118,-0.003431,-0.308136,...,-0.395891,-13.239965,-0.260047,-0.068046,-0.444106,-0.045278,-0.122688,-0.605114,-0.153695,0.0
25%,-0.149844,0.053893,-0.000212,-0.105502,-0.001712,-27.359174,-0.975008,-6.925027,0.065678,-0.171546,...,-0.385397,-9.175614,0.219108,1.362027,-0.221857,-0.024641,-0.061784,-0.352336,0.441349,0.0
50%,0.003577,0.119884,-9e-06,0.002348,2e-05,-4.511593,0.01548,-0.974936,0.134787,-0.034955,...,-0.374903,-5.111263,0.698263,2.7921,0.000392,-0.004003,-0.00088,-0.099559,1.036392,0.0
75%,0.006246,0.497736,0.000561,0.011233,0.000921,-1.923306,0.353615,46.061729,0.141145,1.574736,...,-0.182768,-2.444122,10.699093,2.799618,0.206136,0.185333,0.09712,4.348907,6.323432,0.0
max,0.008915,0.875589,0.00113,0.020117,0.001823,0.66498,0.69175,93.098393,0.147503,3.184426,...,0.009367,0.223018,20.699923,2.807135,0.411881,0.374668,0.195121,8.797373,11.610472,0.0


## DRAFT SCRIPTS

In [40]:
agg = cluster.AgglomerativeClustering(n_clusters=4, affinity='cosine', linkage='complete')
labels = agg.fit_predict(processed)
clust_set['Agg_Clust'] = labels
clust_set['Agg_Clust'].value_counts()

NameError: name 'cluster' is not defined

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x='Agg_Clust', y='heightInches', data=clust_set)

In [None]:
sns.pairplot(data=processed.iloc[:,5:20])

In [None]:
dbs = cluster.DBSCAN(eps=2, min_samples=6)
labels = dbs.fit_predict(processed)
clust_set['DBS_Clust'] = labels
clust_set['DBS_Clust'].value_counts()

In [None]:
clust_set

In [None]:
sc = cluster.SpectralClustering(n_clusters=4)
labels = sc.fit_predict(processed)
clust_set['SC_Clust'] = labels
clust_set['SC_Clust'].value_counts()

In [None]:
km = cluster.KMeans(n_clusters=4)
labels = km.fit_predict(processed)
clust_set['KM_Clust'] = labels
clust_set['KM_Clust'].value_counts()

In [None]:
clust_set.columns

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x='Agg_Clust', y='chal_uni_frnd', data=clust_set)

In [None]:
metrics.silhouette_score(clust_set, labels=clust_set['KM_Clust'], metric='cosine')

In [None]:
metrics.calinski_harabaz_score(clust_set, labels=clust_set['KM_Clust'])

In [None]:
metrics.calinski_harabaz_score(clust_set, labels=clust_set['KM_Clust'])