In [41]:
import helper_functions as hf
from pathlib import Path
import pandas as pd
import holoviews as hv
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from datetime import datetime, timedelta

clear_output()

In [3]:
zip_path = Path("data/relax_challenge.zip")

# use the functions to extract the zipfile
if hf.check_zipfile(zip_path):
    target_dir = zip_path.parent / zip_path.stem
    hf.create_target_directory(target_dir)
    hf.extract_zipfile(zip_path, target_dir)

Extracted data\relax_challenge.zip to data\relax_challenge


In [16]:
user_engagement_path = (
    "./data/relax_challenge/relax_challenge/takehome_user_engagement.csv"
)


users_path = "data/relax_challenge/relax_challenge/takehome_users.csv"


users_engagement_df = pd.read_csv(user_engagement_path)
display(users_engagement_df.sample(3))
users_engagement_df.info()
users_engagement_df.describe(include="all").T.fillna("")
# drop visited column
users_engagement_df.drop(columns="visited", inplace=True)

# convert time_stamp to datetime
users_engagement_df["time_stamp"] = pd.to_datetime(
    users_engagement_df["time_stamp"])

# get value counts of user_id
user_id_counts = users_engagement_df["user_id"].value_counts()
user_id_counts

Unnamed: 0,time_stamp,user_id,visited
79465,2014-05-09 01:14:21,4421,1
39977,2013-08-27 11:40:13,2447,1
15548,2013-01-13 19:15:33,912,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


user_id
3623     606
906      600
1811     593
7590     590
8068     585
        ... 
4699       1
4698       1
4697       1
4696       1
12000      1
Name: count, Length: 8823, dtype: int64

In [54]:
users_df = pd.read_csv(users_path, encoding="latin-1")
users_df.info()
display(users_df.sample(3))
users_df.describe(include="all").T.sort_values(by="unique").fillna("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
10111,10112,2014-01-26 22:41:30,Ackerman Susanne,SusanneAckerman@yahoo.com,GUEST_INVITE,,0,0,2,6328.0
1689,1690,2013-09-29 08:59:59,Krueger Stefanie,StefanieKrueger@yahoo.com,ORG_INVITE,1401872000.0,0,0,282,10163.0
2585,2586,2013-08-31 20:26:53,Bolton Poppy,PoppyBolton@yahoo.com,SIGNUP,1377981000.0,0,0,63,


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
creation_source,12000.0,5.0,ORG_INVITE,4254.0,,,,,,,
name,12000.0,11355.0,Araujo Gabriela,5.0,,,,,,,
email,12000.0,11980.0,AlfieLane@yahoo.com,2.0,,,,,,,
creation_time,12000.0,11996.0,2014-02-11 17:57:53,2.0,,,,,,,
object_id,12000.0,,,,6000.5,3464.24595,1.0,3000.75,6000.5,9000.25,12000.0
last_session_creation_time,8823.0,,,,1379279305.700442,19531160.787044,1338452406.0,1363194965.0,1382888470.0,1398442604.0,1402066730.0
opted_in_to_mailing_list,12000.0,,,,0.2495,0.432742,0.0,0.0,0.0,0.0,1.0
enabled_for_marketing_drip,12000.0,,,,0.149333,0.356432,0.0,0.0,0.0,0.0,1.0
org_id,12000.0,,,,141.884583,124.056723,0.0,29.0,108.0,238.25,416.0
invited_by_user_id,6417.0,,,,5962.957145,3383.761968,3.0,3058.0,5954.0,8817.0,11999.0


In [34]:
# "adopted user" as a user who has logged into the product on 3 separate days in at least 1 7-day period
# identify which factors predict future user adoption
user_id_counts.hvplot.hist()
user_id_counts[user_id_counts >= 3].hvplot.hist(
    title="Users Count with 3 or more logins"
)

# check if users with >= 3 logins have it in a 7 day period
users_more_than_3 = user_id_counts[user_id_counts >= 3].index

# create a new dataframe with only users with 3 or more logins
users_engagement_3 = users_engagement_df[
    users_engagement_df["user_id"].isin(users_more_than_3)
]
users_engagement_3

Unnamed: 0,time_stamp,user_id
1,2013-11-15 03:45:04,2
2,2013-11-29 03:45:04,2
3,2013-12-09 03:45:04,2
4,2013-12-25 03:45:04,2
5,2013-12-31 03:45:04,2
...,...,...
207905,2014-04-20 14:22:45,11991
207906,2014-04-25 14:22:45,11991
207907,2014-04-28 14:22:45,11991
207908,2014-05-17 14:22:45,11991


In [35]:
# sort by user_id and time_stamp
users_engagement_3.sort_values(by=["user_id", "time_stamp"], inplace=True)
users_engagement_3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_engagement_3.sort_values(by=["user_id", "time_stamp"], inplace=True)


Unnamed: 0,time_stamp,user_id
1,2013-11-15 03:45:04,2
2,2013-11-29 03:45:04,2
3,2013-12-09 03:45:04,2
4,2013-12-25 03:45:04,2
5,2013-12-31 03:45:04,2
...,...,...
207905,2014-04-20 14:22:45,11991
207906,2014-04-25 14:22:45,11991
207907,2014-04-28 14:22:45,11991
207908,2014-05-17 14:22:45,11991


In [47]:
# get the difference between the time_stamp of the current row and the 2 previous row grouped by user_id
users_engagement_3.loc[:, "time_diff"] = users_engagement_3.groupby("user_id")[
    "time_stamp"
].diff(2)

# get the user_id with time_diff less than 7 days
adopted_users = users_engagement_3.loc[users_engagement_3["time_diff"] < timedelta(8)][
    "user_id"
].unique()

# get when the adopted users became adopted users
adopted_user_df = (
    users_engagement_3.loc[users_engagement_3["time_diff"] < timedelta(8)]
    .groupby("user_id")
    .first()
)

adopted_user_df["date_became_adopted"] = adopted_user_df["time_stamp"]
adopted_user_df.drop(columns=["time_stamp", "time_diff"], inplace=True)
adopted_user_df

Unnamed: 0_level_0,date_became_adopted
user_id,Unnamed: 1_level_1
2,2014-02-09 03:45:04
10,2013-02-06 22:08:03
20,2014-03-13 11:46:38
33,2014-03-23 06:29:09
42,2012-12-25 19:05:07
...,...
11965,2014-05-02 07:17:35
11967,2014-03-10 08:12:37
11969,2013-06-03 00:48:14
11975,2013-05-29 11:10:11


In [None]:
# get the users with time_diff <= 7 days for 3 consecutive logins

