In [2]:
#Load and preprocess the data from both takehome_user_engagement.csv and takehome_users.csv files
import pandas as pd
import numpy as np
from datetime import datetime

# Try different encodings
encodings = ['utf-8', 'iso-8859-1', 'cp1252']

for encoding in encodings:
    try:
        users_df = pd.read_csv('takehome_users.csv', encoding=encoding, parse_dates=['creation_time', 'last_session_creation_time'])
        engagement_df = pd.read_csv('takehome_user_engagement.csv', encoding=encoding, parse_dates=['time_stamp'])
        print(f"Successfully read files with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to read with encoding: {encoding}")

# Continue with preprocessing if files were successfully read
if 'users_df' in locals() and 'engagement_df' in locals():
    # Preprocess users dataframe
    users_df['creation_time'] = pd.to_datetime(users_df['creation_time'])
    users_df['last_session_creation_time'] = pd.to_datetime(users_df['last_session_creation_time'])
    users_df['days_since_creation'] = (datetime.now() - users_df['creation_time']).dt.days
    users_df['days_since_last_session'] = (datetime.now() - users_df['last_session_creation_time']).dt.days

    # Preprocess engagement dataframe
    engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp'])

    # Display info about the dataframes
    print(users_df.info())
    print("\n")
    print(engagement_df.info())

    # Display the first few rows of each dataframe
    print("\nUsers DataFrame:")
    print(users_df.head())
    print("\nEngagement DataFrame:")
    print(engagement_df.head())
else:
    print("Failed to read CSV files with available encodings.")

Failed to read with encoding: utf-8
Successfully read files with encoding: iso-8859-1


  users_df = pd.read_csv('takehome_users.csv', encoding=encoding, parse_dates=['creation_time', 'last_session_creation_time'])
  users_df['last_session_creation_time'] = pd.to_datetime(users_df['last_session_creation_time'])


DateParseError: year 1398138810 is out of range: 1398138810, at position 0

In [3]:
def is_user_adopted(user_data):
    # Sort the user's engagement data by timestamp
    user_data = user_data.sort_values('time_stamp')
    
    # Convert timestamps to datetime if they aren't already
    user_data['time_stamp'] = pd.to_datetime(user_data['time_stamp'])
    
    # Calculate the difference in days between consecutive logins
    user_data['days_diff'] = user_data['time_stamp'].diff().dt.days
    
    # Create a rolling 7-day window
    rolling_window = user_data['days_diff'].rolling(window=7)
    
    # Count unique days in each 7-day window
    unique_days_in_window = rolling_window.apply(lambda x: x.nunique())
    
    # Check if any 7-day window has at least 3 unique days
    return (unique_days_in_window >= 3).any()

# Apply the function to the engagement data grouped by user
engagement_df['is_adopted'] = engagement_df.groupby('user_id').apply(is_user_adopted)

print(engagement_df.head())
print(engagement_df['is_adopted'].value_counts())

           time_stamp  user_id  visited is_adopted
0 2014-04-22 03:53:30        1        1        NaN
1 2013-11-15 03:45:04        2        1      False
2 2013-11-29 03:45:04        2        1       True
3 2013-12-09 03:45:04        2        1      False
4 2013-12-25 03:45:04        2        1      False
is_adopted
False    7260
True     1563
Name: count, dtype: int64


  engagement_df['is_adopted'] = engagement_df.groupby('user_id').apply(is_user_adopted)


This data gives a good starting point for further analysis.  From the above, it can be seen that around 17.7% of users in the dataset are considered adopted based on criteria.

In [4]:
# Merge the engagement data with user information
merged_df = pd.merge(engagement_df, users_df, left_on='user_id', right_on='object_id', how='left')

# Drop duplicate columns if any
merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]

# Display info about the merged dataframe
print(merged_df.info())

# Display the first few rows of the merged dataframe
print("\nMerged DataFrame:")
print(merged_df.head())

# Check the number of rows before and after merging
print(f"\nNumber of rows in engagement_df: {len(engagement_df)}")
print(f"Number of rows in users_df: {len(users_df)}")
print(f"Number of rows in merged_df: {len(merged_df)}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   time_stamp                  207917 non-null  datetime64[ns]
 1   user_id                     207917 non-null  int64         
 2   visited                     207917 non-null  int64         
 3   is_adopted                  8823 non-null    object        
 4   object_id                   207917 non-null  int64         
 5   creation_time               207917 non-null  datetime64[ns]
 6   name                        207917 non-null  object        
 7   email                       207917 non-null  object        
 8   creation_source             207917 non-null  object        
 9   last_session_creation_time  207917 non-null  object        
 10  opted_in_to_mailing_list    207917 non-null  int64         
 11  enabled_for_marketing_drip  207917 non-

In [5]:
merged_df['creation_time'] = pd.to_datetime(merged_df['creation_time'])
merged_df['account_age_days'] = (merged_df['time_stamp'] - merged_df['creation_time']).dt.days
merged_df['is_invited'] = merged_df['invited_by_user_id'].notna().astype(int)
merged_df['creation_source_encoded'] = pd.Categorical(merged_df['creation_source']).codes
merged_df['total_logins'] = merged_df.groupby('user_id')['visited'].transform('sum')
merged_df['avg_logins_per_day'] = merged_df['total_logins'] / (merged_df['account_age_days'] + 1)
merged_df['days_since_last_login'] = (merged_df.groupby('user_id')['time_stamp'].transform('max') - merged_df['time_stamp']).dt.days

# dummy variables for creation_source
creation_source_dummies = pd.get_dummies(merged_df['creation_source'], prefix='source')
merged_df = pd.concat([merged_df, creation_source_dummies], axis=1)
print(merged_df.info())

print("\nUpdated Merged DataFrame:")
print(merged_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 25 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   time_stamp                  207917 non-null  datetime64[ns]
 1   user_id                     207917 non-null  int64         
 2   visited                     207917 non-null  int64         
 3   is_adopted                  8823 non-null    object        
 4   object_id                   207917 non-null  int64         
 5   creation_time               207917 non-null  datetime64[ns]
 6   name                        207917 non-null  object        
 7   email                       207917 non-null  object        
 8   creation_source             207917 non-null  object        
 9   last_session_creation_time  207917 non-null  object        
 10  opted_in_to_mailing_list    207917 non-null  int64         
 11  enabled_for_marketing_drip  207917 non-

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import statsmodels.api as sm

# Prepare the data
features = ['account_age_days', 'is_invited', 'creation_source_encoded', 'total_logins', 
            'avg_logins_per_day', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip']
X = merged_df[features]
y = merged_df['is_adopted'].fillna(False).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Print feature importance
feature_importance = pd.DataFrame({'feature': features, 'importance': abs(model.coef_[0])})
print(feature_importance.sort_values('importance', ascending=False))

# Fit statsmodels logistic regression for p-values
X_train_sm = sm.add_constant(X_train)
sm_model = sm.Logit(y_train, X_train_sm)
sm_results = sm_model.fit()
print(sm_results.summary())

# Evaluate the model
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

  y = merged_df['is_adopted'].fillna(False).astype(int)


                      feature  importance
6  enabled_for_marketing_drip    0.202851
1                  is_invited    0.154085
3                total_logins    0.125513
2     creation_source_encoded    0.097639
0            account_age_days    0.096125
4          avg_logins_per_day    0.051466
5    opted_in_to_mailing_list    0.004420
Optimization terminated successfully.
         Current function value: 0.044559
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:             is_adopted   No. Observations:               166333
Model:                          Logit   Df Residuals:                   166325
Method:                           MLE   Df Model:                            7
Date:                Fri, 20 Sep 2024   Pseudo R-squ.:                 0.01052
Time:                        13:31:05   Log-Likelihood:                -7411.6
converged:                       True   LL-Null:                       -7490.4
Covarian

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Feature Importance) The most important feature is 'enabled_for_marketing_drip', followed by 'is_invited', and 'total_logins'.  'account_age_days' and 'creation_source_encoded' are also of moderate importance.  'avg_logins_per_day' and 'opted_in_to_mailing_list' are less important.  For statistical significance, 'enabled_for_marketing_drip' has a very low p-value (<0.0001), indicating strong statistical significance.  The model shows high accuracy (0.99) but performs poorly on predicting the positive class (adopted users). 

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np

features = ['enabled_for_marketing_drip', 'is_invited', 'total_logins', 
            'account_age_days', 'creation_source_encoded', 
            'avg_logins_per_day', 'opted_in_to_mailing_list']
X = merged_df[features]
y = merged_df['is_adopted'].fillna(False).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate class weights
class_weights = {0: 1, 1: y_train.value_counts()[0] / y_train.value_counts()[1]}

# Train Random Forest model with class weights
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights)
rf_model.fit(X_train_scaled, y_train)

# Get feature importances
feature_importance = pd.DataFrame({'feature': features, 'importance': rf_model.feature_importances_})
print("Feature Importances:")
print(feature_importance.sort_values('importance', ascending=False))

# Make predictions
y_pred = rf_model.predict(X_test_scaled)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

  y = merged_df['is_adopted'].fillna(False).astype(int)


Feature Importances:
                      feature  importance
2                total_logins    0.484245
5          avg_logins_per_day    0.198543
3            account_age_days    0.189806
4     creation_source_encoded    0.064597
6    opted_in_to_mailing_list    0.028144
0  enabled_for_marketing_drip    0.017518
1                  is_invited    0.017146

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     41298
           1       0.02      0.03      0.03       286

    accuracy                           0.98     41584
   macro avg       0.51      0.51      0.51     41584
weighted avg       0.99      0.98      0.99     41584



'total_logins' is by far the most important feature, accounting for about 48% of the predictive power.  'avg_logins_per_day' and 'account_age_days' are the next most important features, each contributing around 19-20% to the model's decisions.  'creation_source_encoded' has moderate importance at about 6.5%.  And the remaining features ('opted_in_to_mailing_list', 'enabled_for_marketing_drip', and 'is_invited') have relatively lower importance, each contributing less than 3%.

The data analysis had an objective to identify factors that predict future user adoption, which is defined as users who log into the product on three separate days in at least one seven-day period.  The methodology used for the analysis involved data preprocessing and feature engineering; logistic regression for initial insights; and the use of a random forest classifier with class weighting to address imbalance.  

Key Findings were that user engagement metrics are the strongest predictors of adoption:  total logins (48.4%  importance), average logins per day (19.9% importance), and account age in days (19.0% importance); creation source has moderate predictive power (6.5% importance); and marketing-related features and invitation status have minimal impact on adoption prediction (each <3% importance).  The model performance had an overall accuracy of 98%, precision for adopted users 2%, and recall for adopted users 3%. 

Further analysis could be done to focus on login frequency and investigation of factors influencing total logins to develop strategies to encourage more frequent use.  Further analysis could look at the impact of different creation sources or even long-term adoption.  In conclusion, the analysis provides insight for improving user adoption rates by focusing on user behaviour patterns and key engagement metrics. 