The purpose of this file is to randomly assign each customer from the customer_churn_business_dataset.csv to an account_id from the ravenstack_accounts.csv.

The number of customers assigned to an account is constrained by that account's number of seats.
For example:

account_id = 1
seats = 3

Each of the 3 seats will be filled by a randomly selected customer_id from the customer table.

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os

# Create a seeded generator for reproducibility. The generator must be bound
# to a name: default_rng() returns a new Generator and does not seed any
# global state, so an unassigned call has no effect on later random draws.
rng = np.random.default_rng(3219087321)

# Echo the generator so the notebook cell still displays it.
rng

Generator(PCG64) at 0x1ECFF3FC200

In [None]:
## Load the two raw input tables

# Switch the working directory to the raw_data folder so the bare file
# names below resolve against it.
os.chdir('c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Glacier-Product-Release-Impact-Analysis\\raw_data')


def _read_csv_or_fail(filename, not_found_message):
    """Read *filename* with pandas, raising ValueError if the file is missing."""
    try:
        return pd.read_csv(filename)
    except FileNotFoundError:
        raise ValueError(not_found_message)


accts = _read_csv_or_fail("ravenstack_accounts.csv", "Accounts csv not found")
users = _read_csv_or_fail("customer_churn_business_dataset.csv", "Users csv not found")


### Validation

<!-- Ensure that our inputs and assumptions are correct so that the code can behave correctly -->



In [5]:
# Display the column labels of the users table for a quick visual check.
users.columns

Index(['customer_id', 'gender', 'age', 'country', 'city', 'customer_segment',
       'tenure_months', 'signup_channel', 'contract_type', 'monthly_logins',
       'weekly_active_days', 'avg_session_time', 'features_used',
       'usage_growth_rate', 'last_login_days_ago', 'monthly_fee',
       'total_revenue', 'payment_method', 'payment_failures',
       'discount_applied', 'price_increase_last_3m', 'support_tickets',
       'avg_resolution_time', 'complaint_type', 'csat_score', 'escalations',
       'email_open_rate', 'marketing_click_rate', 'nps_score',
       'survey_response', 'referral_count', 'churn'],
      dtype='object')

In [7]:
## Column checks for both input tables

# Accounts: every required column must be present. set.difference keeps
# whatever is required but absent from the accounts header.
req_cols = {'account_id', 'seats'}
missing = req_cols.difference(accts.columns)

# A non-empty result means at least one required column is absent.
if len(missing) > 0:
    raise ValueError(f"Accounts table missing: {missing}")

# Users: only the customer_id column is required.
if not ("customer_id" in users.columns):
    raise ValueError("Users table missing customer_id")


In [8]:
# Display the column labels of the accounts table for a quick visual check.
accts.columns

Index(['account_id', 'account_name', 'industry', 'country', 'signup_date',
       'referral_source', 'plan_tier', 'seats', 'is_trial', 'churn_flag'],
      dtype='object')

In [None]:
## Validate the seats column: numeric, non-negative, whole numbers

# Coerce seats to a numeric dtype; errors="raise" makes any unparseable
# value fail here. This does not by itself guarantee integers -- the
# whole-number check below covers that.
accts['seats'] = pd.to_numeric(accts['seats'], errors="raise")

# If any seats are negative, raise an error listing the offending accounts.
neg_mask = accts['seats'] < 0
if neg_mask.any():
    neg_seats = accts.loc[neg_mask, ['account_id', 'seats']]
    raise ValueError(f"{neg_seats} has negative seats")

# Check for non-integer seats (a non-zero remainder after dividing by 1).
# With a plain float column, missing values (NaN) are flagged here as well.
frac_mask = accts['seats'] % 1 != 0
if frac_mask.any():
    float_seats = accts.loc[frac_mask, ['account_id', 'seats']]
    raise ValueError(f"{float_seats} has non-integer seats")

# Compare the total seat count with the number of user rows.
num_seats = int(accts['seats'].sum())
num_users = len(users['customer_id'])

# Positive: more seats than users; negative: more users than seats.
seat_diff = num_seats - num_users

if num_seats > num_users:
    print(f"There are {seat_diff} more seats than users")
elif num_seats < num_users:
    print(f"There are {seat_diff * -1} less seats than users")
else:
    print("There are equal number of seats and number of users")



10280