The purpose of this file is to randomly assign each customer from the customer_churn_business_dataset.csv to an account_id from the ravenstack_accounts.csv.

Each account can be assigned at most as many customers as it has seats.
For example, 

account_id = 1
seats = 3

Each of the 3 seats can be randomly filled by a customer_id from the customer table. If there are more seats than customers overall, some seats stay unassigned.

In [38]:
# Import libraries

import pandas as pd
import numpy as np
import os

# Fixed-seed NumPy Generator: the same seed produces the same random stream on a fresh run
rng = np.random.default_rng(seed=3219087321)

In [39]:
## Read the input files

# NOTE(review): this switches the process-wide working directory to a
# machine-specific absolute path; the relative csv names below resolve against it.
os.chdir('c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Glacier-Product-Release-Impact-Analysis\\raw_data')

# Load the accounts table. A missing file is reported as ValueError, with the
# original FileNotFoundError attached as the explicit cause for easier debugging.
try:
    accts = pd.read_csv("ravenstack_accounts.csv")
except FileNotFoundError as err:
    raise ValueError("Accounts csv not found") from err

# Load the customers table with the same error-handling convention.
try:
    customers = pd.read_csv("customer_churn_business_dataset.csv")
except FileNotFoundError as err:
    raise ValueError("Users csv not found") from err


### Validation

<!-- Ensure that our inputs and assumptions are correct so that the code can behave correctly -->



In [40]:
## Validate the accts table

# Columns the accounts table must provide
req_cols = {"account_id", "seats"}

# Set difference: required columns that are absent from the accounts table
acct_cols = set(accts.columns)
missing = req_cols - acct_cols

# A non-empty difference means at least one required column is absent
if missing:
    raise ValueError(f"Accounts table missing: {missing}")


## Validate the users table
has_customer_id = "customer_id" in customers.columns
if not has_customer_id:
    raise ValueError("Users table missing customer_id")


In [42]:
## Ensure the seats column is an integer
# Coerce to a numeric dtype first; errors="raise" aborts on any value that cannot be parsed
accts['seats'] = pd.to_numeric(accts['seats'], errors="raise")


# If any seats are negative, raise error and report the offending account_id / seats rows
neg_mask = accts['seats'] < 0
if neg_mask.any():
    neg_seats = accts.loc[neg_mask, ['account_id', 'seats']]
    raise ValueError(f"{neg_seats} has negative seats")

# Check for non-integer seats (a missing value also fails this check, since NaN % 1 != 0)
frac_mask = accts['seats'] % 1 != 0
if frac_mask.any():
    float_seats = accts.loc[frac_mask, ['account_id', 'seats']]
    raise ValueError(f"{float_seats} has non-integer seats")

# Every value is now a whole number, but the dtype can still be float (e.g. "3.0" in the csv).
# Cast to int64 so the column is genuinely integer-typed and usable as np.repeat repeat counts.
accts['seats'] = accts['seats'].astype('int64')

# Compare the total seat capacity against the number of customers
num_seats = int(accts['seats'].sum())
num_cust = len(customers['customer_id'])
seat_diff = num_seats - num_cust

# A shortage is fatal: every customer must be assigned to an account.
# A surplus is acceptable: the extra seats simply remain unassigned.
if seat_diff < 0:
    print(f"There are {-seat_diff} less seats than users")
    raise ValueError("There are not enough seats")

if seat_diff > 0:
    print(f"There are {seat_diff} more seats than users")
else:
    print("There are equal number of seats and number of users")

There are 280 more seats than users


## Randomly assign acct_id to each customer_id

In [46]:
# Build the pool of assignable seats: every account_id appears once per seat it owns
# Ex: acct_id = A, seat = 3 -> [A,A,A]; 3 'A' seats to assign
acct_ids = accts['account_id'].to_numpy()
seat_counts = accts['seats'].to_numpy()
rng_accts = np.repeat(acct_ids, seat_counts)

# Randomise the order of the seat pool in-place using the seeded generator
rng.shuffle(rng_accts)

# Hand out only the first num_cust shuffled seats; any surplus seats stay unassigned
assign_ids = rng_accts[:num_cust]
customers["account_id"] = assign_ids

# Show the resulting assignment column
customers["account_id"]


0       A-f25509
1       A-b9eed8
2       A-ac85cd
3       A-1f7acb
4       A-3793ee
          ...   
9995    A-716841
9996    A-7f8241
9997    A-417d2f
9998    A-0354fe
9999    A-f6b2fb
Name: account_id, Length: 10000, dtype: object