In [13]:
# interesting articles:
# http://leananalyticsbook.com/one-metric-that-matters/
# https://v4-alpha.getbootstrap.com/content/code/
# https://fizzle.co/sparkline/vanity-vs-actionable-metrics

import pandas as pd
import numpy as np
import uuid
from datetime import datetime
from dateutil.relativedelta import relativedelta
# set the settingWithCopyWarning in Pandas to None
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

### 1. Defining the parameters

In [14]:
# number of devices to generate data for
num_device_uuids = 1000

# number of months for each device
num_months = 20

# starting month of product usage data
start_month = '2016-01-01'

### 2. Generating device uuids

In [54]:
# generating unique identifiers for each device
uuids = pd.Series([str(uuid.uuid4()) for i in range(1,num_device_uuids)])
user_data = pd.DataFrame()
user_data['device_uuid'] = pd.Series(uuids).repeat(num_months)
user_data =user_data.reset_index().drop('index', 1)

# example of two different device uuids
user_data[19:21]

Unnamed: 0,device_uuid
19,d98f302e-e358-4995-b76e-596ceeccde9f
20,cb98d2d2-9544-4d1a-a339-0c1406da8ef3


### 2. Generate date data

In [22]:
# defining starting and ending month for generating data
start_month_ts = pd.to_datetime(start_month)
end_month_ts = start_month_ts + relativedelta(months=+num_months - 1)
str(start_month_ts), str(end_month_ts)

('2016-01-01 00:00:00', '2017-08-01 00:00:00')

In [24]:
# months for which we will be generating data
months = pd.Series(pd.date_range(start_month_ts, end_month_ts, freq='MS'))
months

0    2016-01-01
1    2016-02-01
2    2016-03-01
3    2016-04-01
4    2016-05-01
5    2016-06-01
6    2016-07-01
7    2016-08-01
8    2016-09-01
9    2016-10-01
10   2016-11-01
11   2016-12-01
12   2017-01-01
13   2017-02-01
14   2017-03-01
15   2017-04-01
16   2017-05-01
17   2017-06-01
18   2017-07-01
19   2017-08-01
dtype: datetime64[ns]

In [26]:
# adding date column to the user data dataframe
user_data['date'] = pd.concat([months] * num_device_uuids, axis=0).reset_index().drop('index', 1)
user_data.head()

Unnamed: 0,device_uuid,date
0,0fc75756-7021-4964-a3a3-81ad1219afe0,2016-01-01
1,0fc75756-7021-4964-a3a3-81ad1219afe0,2016-02-01
2,0fc75756-7021-4964-a3a3-81ad1219afe0,2016-03-01
3,0fc75756-7021-4964-a3a3-81ad1219afe0,2016-04-01
4,0fc75756-7021-4964-a3a3-81ad1219afe0,2016-05-01


### 3. Generate usage features

In [30]:
num_usage_features = 3
feature_usage_ratio = 0.8

# extracting all the unique device uuids
device_uuids = pd.Series(user_data['device_uuid'].unique())
user_data_with_usage = pd.DataFrame()
device_uuids[0:5]

0    0fc75756-7021-4964-a3a3-81ad1219afe0
1    5367e8c7-01d1-4622-ab53-37d7ff03b63e
2    432da6f5-8905-4bca-a899-265b299ad9e1
3    c8860548-e0c9-47d0-9e01-0fc6f8ea4f2f
4    83f99b7a-5cc1-4b94-b0f8-b868bcf909f6
dtype: object

### 4.1. Generate feature ratios

In [36]:
# generate random usage data for each feature and simulate falling usage per feature
# (each feature 80% of the previous one)
# example: F1 = 1, F2 = F1*0.8 = 0.8, F3 = F2*0.8 = 0.64

# extract the names for each feature based on the pre-set number of feature
features = ['feature' + str(i) for i in range(1,num_usage_features+1)]

usage_features_ratio = {}

# set the feature_ratio for the first feature (100%)
cur_feature = 1.0

# calculate the ratio for each feature
for feature in features:
    usage_features_ratio[feature] = float("{0:.1f}".format(cur_feature))
    cur_feature*= 0.8

usage_features_ratio

{'feature1': 1.0, 'feature2': 0.8, 'feature3': 0.6}

### 4.2. Assign cohort groups to devices

In [37]:
# set the number of cohorts to be half of number of months of data (20 months = 10 cohorts)
num_cohorts = int(num_months / 2) 
num_cohorts

10

In [38]:
# assign cohorts to users randomly (when did the user first used the app?)
cohorts = pd.DataFrame()
cohorts['device_uuid'] = device_uuids
cohorts['cohort_group'] = np.random.randint(low=0, high=num_cohorts, size=num_device_uuids-1)
cohorts.head()

Unnamed: 0,device_uuid,cohort_group
0,0fc75756-7021-4964-a3a3-81ad1219afe0,1
1,5367e8c7-01d1-4622-ab53-37d7ff03b63e,1
2,432da6f5-8905-4bca-a899-265b299ad9e1,2
3,c8860548-e0c9-47d0-9e01-0fc6f8ea4f2f,9
4,83f99b7a-5cc1-4b94-b0f8-b868bcf909f6,2


### 4.3. Creating usage features

In [None]:
# finding cohort groups for each device_uuid
user_data_with_cohort_groups = pd.DataFrame()

for device_uuid in device_uuids:
    
    # 1. slice user_data, create a device_uuid data for each user
    device_uuid_data = user_data[user_data['device_uuid'] == device_uuid]
    
    # 2. find cohort group of the device_uui and delete all unnecessary months
    device_uuid_cohort_group = int(cohorts[cohorts['device_uuid'] == device_uuid]['cohort_group'])
    device_uuid_data = device_uuid_data[device_uuid_cohort_group:]

    user_data_with_cohort_groups = user_data_with_cohort_groups.append(device_uuid_data)

In [None]:
len(user_data_with_cohort_groups) / len(device_uuids)
# len(user_data) / len(device_uuids)

In [None]:
# 3.1. - Generate random data for Feature1

user_data_with_cohort_groups = user_data_with_cohort_groups.reset_index().drop('index', axis=1)

user_data_with_cohort_groups[features[0]] = pd.Series((
        np.random.randint(low=0, high=14, 
                          size=len(user_data_with_cohort_groups))))

# 3.2. - Generate data for remaining features (Feature2, Feature3...)
for feature in features[1:]:
        user_data_with_cohort_groups[feature] = user_data_with_cohort_groups[features[0]] * usage_features_ratio[feature]
        
user_data_with_cohort_groups.head()

In [None]:
user_data_with_usage_features = pd.DataFrame()

# 4.- Churner stays a churner

for device_uuid in device_uuids:
    
    churner = False
            
    device_uuid_data = user_data_with_cohort_groups[user_data_with_cohort_groups['device_uuid'] == device_uuid]
    
    
    for index, row in device_uuid_data.iterrows():
        
        if row[features[0]] == 0:
            churner = True

        if churner == True:
            # if a churner, set the value of all usage features to 0
            device_uuid_data.loc[index, features[0:]] = 0
    
    # print(device_uuid_data)
            
    user_data_with_usage_features = user_data_with_usage_features.append(device_uuid_data)
    
# delete the months when with churned device_uuid months
user_data_with_usage_features = user_data_with_usage_features[user_data_with_usage_features[features[0]] != 0]

In [None]:
user_data_with_usage_features.head()

In [None]:
# average number of months of usage per device_uuid
len(user_data_with_usage_features) / len(device_uuids)

In [None]:
num_months

### 4. Generate segmentation features

#### function to generate random features
#### parameters: name of feature, list of all feature values

In [15]:
# segmentation features
num_segmentation_featues = 3
platforms = ['iOS', 'Android']
countries = ['NL', 'AU', 'FR']
user_registered = [True, False]

In [16]:
segmentation_features = {'platform': platforms,
                'user_registered': user_registered,
                'country': countries}

In [17]:
segmentation_features

{'country': ['NL', 'AU', 'FR'],
 'platform': ['iOS', 'Android'],
 'user_registered': [True, False]}

In [18]:
# user_data_with_segmentation_features = pd.DataFrame()

segmentation_features_df = pd.DataFrame()

weights_2 = [0.7, 0.3]
weights_3 = [0.6, 0.3, 0.1]

for device_uuid in device_uuids[0:device_uuids_limit]:
    
    device_uuid_feature_values_dict = {}
    
    for feature_name, feature_values in segmentation_features.items():

        # generate the random feature value
        # add code if you want to add features with > 3 values
        if len(feature_values) == 2:
            feature_weights = weights_2
        elif len(feature_values) == 3:
            feature_weights = weights_3
        
        # generate the feature value from assigned weights
        feature_value = np.random.choice(feature_values, p=feature_weights)
        
        
        # device_uuid_feature_values_list.append(feature_value)
        
        # make a dict of feature_name
        device_uuid_feature_values_dict[feature_name] = feature_value
        
        
        
    # make a df with segmentation features for the device_uuid
    device_segm_features = pd.DataFrame(list(device_uuid_feature_values_dict.items())).T
    device_segm_features.columns = device_segm_features.iloc[0]
    device_segm_features['device_uuid'] = device_uuid
    device_segm_features = device_segm_features.reindex(device_segm_features.index.drop(0))
    device_segm_features['device_uuid'] = device_uuid
    # device_segm_features
        
    # append to the main df
        
    segmentation_features_df = segmentation_features_df.append(device_segm_features)
    
    
# segmentation_features_df = segmentation_features_df.reset_index().drop('index', axis=1)
segmentation_features_df = segmentation_features_df.set_index('device_uuid').reset_index()

In [19]:
segmentation_features_df.head()

Unnamed: 0,device_uuid,user_registered,country,platform
0,f20ba118-4654-4405-93de-dd281e3c8adc,True,AU,iOS
1,83c1793e-dd4a-4e6e-95b7-1ee6077d2d50,True,NL,Android
2,a0b3bf85-1494-45b8-8049-2aab8808321b,True,AU,Android
3,57f237d3-1119-40ee-83c8-07a4ce8db26e,True,NL,iOS
4,abac4bfc-f49e-452f-aee6-f531a9fcb470,True,AU,iOS


In [20]:
final_df = pd.DataFrame()

for device_uuid in device_uuids[0:device_uuids_limit]:
    
    # extract device data from the main df
    device_uuid_df = user_data_with_usage_features[user_data_with_usage_features['device_uuid'] == device_uuid]
    # extract device data for segmentation features data
    device_seg_feature_values_df = segmentation_features_df[segmentation_features_df['device_uuid'] == device_uuid]

    
    # print(device_seg_feature_values_df[0:10])
    
    # assign the values of segmentation features to the main df
    for seg_feature_name in device_seg_feature_values_df.columns[1:]:
        # device_uuid_data[seg_feature_name] = device_seg_feature_values_df[seg_feature_name]
        device_uuid_df[seg_feature_name] = device_seg_feature_values_df[seg_feature_name][int(device_seg_feature_values_df[seg_feature_name].index.values)]
        # None
        # print(device_seg_feature_values_df[seg_feature_name][int(device_seg_feature_values_df[seg_feature_name].index.values)])
        # [device_seg_feature_values_df[seg_feature_name].index.values])

    # append the device data to the main df
    final_df = final_df.append(device_uuid_df)

final_df.reset_index(inplace=True, drop=True)

NameError: name 'user_data_with_usage_features' is not defined

In [None]:
len(final_df)

In [None]:
final_df.groupby(['country']).count()['device_uuid'].plot(kind='hist');

In [None]:
final_df.groupby(['user_registered']).mean()

In [None]:
final_df.head()

In [None]:
len(final_df['device_uuid'].unique())

In [None]:
final_df.to_csv('generating_user_behavioral_data.csv')