The purpose of this notebook is to randomly assign each customer from customer_churn_business_dataset.csv to an account_id from ravenstack_accounts.csv.

Assignments respect each account's seat limit: an account is given at most as many customers as it has seats.
For example:

account_id = 1
seats = 3

Each of the 3 seats is filled by a randomly chosen customer_id from the customer table (seats can remain empty when there are more seats than customers).

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os
import badgers # For generating realistic data quality issues from our Kaggle Dataset
from badgers.generators.tabular_data.missingness import MissingCompletelyAtRandom
from badgers.generators.tabular_data.outliers.distribution_sampling import *
from pathlib import Path
import yaml


# Fixed seed so every random step driven by `rng` is reproducible
SEED = 3219087321
rng = np.random.default_rng(SEED)

# The working directory is switched to the MessGenerator checkout before the
# import below — presumably so the module can be found; keep this order.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
MESS_GENERATOR_DIR = 'c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\MessGenerator'
os.chdir(MESS_GENERATOR_DIR)
import MessGenerator

In [1]:
def MessUpData(X, rng, exclude_cols: list[str], n_outliers):
    """Run a table through every MessGenerator defect step and return the result.

    The steps are applied in a fixed order: duplicate rows, inconsistent
    formatting, missing values, then outliers. What each step does to the
    data is defined by ``MessGenerator.MessUpGenerator``.

    Args:
        X: Table to corrupt (the notebook passes pandas DataFrames).
        rng: Random generator handed to ``MessUpGenerator`` as ``random_generator``.
        exclude_cols: Column names forwarded to the formatting and
            missing-value steps as ``exclude_cols``.
        n_outliers: Value forwarded to the outlier step as ``n_outliers``.

    Returns:
        Whatever the final ``generate_outliers`` call returns.
    """
    mess_gen = MessGenerator.MessUpGenerator(random_generator=rng)

    # Step 1: duplicate rows
    with_dupes = mess_gen.generate_dupes(X)

    # Step 2: inconsistent formatting (receives exclude_cols)
    with_bad_format = mess_gen.generate_incon_format(with_dupes, exclude_cols=exclude_cols)

    # Step 3: missing values (receives exclude_cols)
    with_missing = mess_gen.generate_missing_vals(with_bad_format, exclude_cols=exclude_cols)

    # Step 4: outliers
    # NOTE(review): exclude_cols is not forwarded to the duplicate or outlier steps — confirm that is intended.
    return mess_gen.generate_outliers(with_missing, n_outliers=n_outliers)

## Randomly assign acct_id to each customer_id

In [5]:
## Read the file

# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
RAW_DATA_DIR = 'c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Glacier-Product-Release-Impact-Analysis\\raw_data'
os.chdir(RAW_DATA_DIR)


def _read_csv_or_raise(csv_name, not_found_msg):
    """Read `csv_name` from the working directory; raise ValueError(not_found_msg) if it is absent."""
    try:
        return pd.read_csv(csv_name)
    except FileNotFoundError:
        raise ValueError(not_found_msg)


accts = _read_csv_or_raise("accounts.csv", "Accounts csv not found")
customers = _read_csv_or_raise("customers.csv", "Users csv not found")


In [6]:
# Build the pool of assignable seats: every account_id repeated once per seat
# Ex: acct_id = A, seat = 3 -> [A,A,A]; 3 'A' seats to assign
rng_accts = np.repeat(accts['account_id'].to_numpy(),
                      accts['seats'].to_numpy()
)
# Shuffle all the acct_ids in-place so the seat order is random
rng.shuffle(rng_accts)


num_cust = len(customers)

# Every customer needs a seat. Without this guard a shortfall surfaces as a
# pandas length-mismatch error on the column assignment below.
if len(rng_accts) < num_cust:
    raise ValueError(
        f"Not enough seats to assign every customer: "
        f"{len(rng_accts)} seats for {num_cust} customers"
    )

# If there are more seats than users, only the first num_cust shuffled seats are used;
# the remaining seats stay unassigned
assign_ids = rng_accts[:num_cust]
customers["account_id"] = assign_ids

customers["account_id"]

0       A-bd4708
1       A-068fc6
2       A-4f18f0
3       A-ac85cd
4       A-66224b
          ...   
9995    A-1b707d
9996    A-1619f8
9997    A-b48f73
9998    A-2e3bad
9999    A-7f8241
Name: account_id, Length: 10000, dtype: object

### Introduce Data Quality Defects 

In [None]:
# Exit raw_data: move the working directory up one level.
# The move is relative to the current directory, so re-running this cell moves up again.
os.chdir(os.pardir)

In [10]:
# config.yaml is resolved against the current working directory
config_path = Path("config.yaml")

# safe_load parses the YAML text without constructing arbitrary Python objects
cfg = yaml.safe_load(config_path.read_text(encoding="utf-8"))

In [None]:
# Output directory comes from the config's 'Output_dir' key, defaulting to "Dataset"
output_dir_name = cfg.get('Output_dir', "Dataset")
output_dir = Path(output_dir_name)

# Create the directory (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

WindowsPath('Dataset')

In [None]:
# Show the settings configured for each dataset.
# The table names are keys of cfg["datasets"], so their settings are read from that
# section; looking the names up on the top-level cfg printed None for every table.
# `or {}` keeps the loop a no-op when the section is missing or empty.
datasets_cfg = cfg.get("datasets") or {}
for table, table_cfg in datasets_cfg.items():
    print(table)
    print(table_cfg)

customers
None
accounts
None
churn_events
None
feat_usage
None
subs
None
sup_tix
None


In [None]:
# Churn events table, read from the current working directory
churn_events_path = Path("ravenstack_churn_events.csv")
churn_events = pd.read_csv(churn_events_path)

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so running
# this cell raises NameError. Presumably a deliberate tripwire that halts "Run All"
# before the cells below — confirm, and consider removing it once the notebook is final.
STOP

In [None]:
# Inject data-quality defects into every table.
# All calls share one `rng`, so the order of the MessUpData calls fixes the random
# stream — keep the order stable for reproducible output.

# account_id is the join key assigned to each customer earlier in this notebook.
# It is excluded here, as it is for every other table that carries it, so the
# formatting and missing-value steps do not corrupt the customer -> account link.
customers = MessUpData(customers, rng, exclude_cols=["customer_id", "account_id"], n_outliers=17)

accts = MessUpData(accts, rng, exclude_cols=["account_id", "account_name", "seats"], n_outliers=34)

churn_events = MessUpData(churn_events, rng, exclude_cols=["account_id"], n_outliers=13)

# The remaining tables are read from the current working directory right before use
feat_usage = pd.read_csv(Path(".") / "ravenstack_feature_usage.csv")

# NOTE(review): no columns are protected for feat_usage — confirm it has no key column to exclude.
feat_usage = MessUpData(feat_usage, rng, exclude_cols=[], n_outliers=13)

subs = pd.read_csv(Path(".") / "ravenstack_subscriptions.csv")

subs = MessUpData(subs, rng, exclude_cols=["subscription_id", "account_id"], n_outliers=13)

sup_tix = pd.read_csv(Path(".") / "ravenstack_support_tickets.csv")

sup_tix = MessUpData(sup_tix, rng, exclude_cols=["account_id"], n_outliers=13)

## Validation 

### Validate required columns and seat counts

In [None]:
## Validate the accts table

# Columns the accounts table must provide
req_cols = {'account_id', 'seats'}

# Required columns that are absent from accts (empty when everything is present)
missing = {col for col in req_cols if col not in accts.columns}

if len(missing) > 0:
    raise ValueError(f"Accounts table missing: {missing}")

## Validate the users table
has_customer_id = "customer_id" in customers.columns
if not has_customer_id:
    raise ValueError("Users table missing customer_id")

In [None]:
## Ensure the seats column is numeric (raises if a value cannot be parsed as a number)
accts['seats'] = pd.to_numeric(accts['seats'], errors="raise")


# If any seats are negative, raise error and report the offending acct_id / seats rows
neg_mask = accts['seats'] < 0
if neg_mask.any():
    neg_seats = accts.loc[neg_mask, ['account_id', 'seats']]
    raise ValueError(f"{neg_seats} has negative seats")

# Check for non-integer seats.
# Missing values are excluded from the mask: NaN % 1 != 0 evaluates to True, so
# without the notna() guard rows with missing seats would be reported as non-integer.
# The same mask drives both the check and the report so the two cannot disagree.
frac_mask = accts['seats'].notna() & (accts['seats'] % 1 != 0)
if frac_mask.any():
    float_seats = accts.loc[frac_mask, ['account_id', 'seats']]
    raise ValueError(f"{float_seats} has non-integer seats")

# Check to see if # seats > # users (sum() skips missing seat values)
num_seats = int(accts['seats'].sum())
num_cust = len(customers['customer_id'])

seat_diff = num_seats - num_cust

# Seats > num_cust is allowed since we can have unassigned seats
# Seats < num_cust is not allowed since each customer must have a company assigned
if num_seats > num_cust:
    print(f"There are {seat_diff} more seats than users")
elif num_seats < num_cust:
    print(f"There are {seat_diff * -1} less seats than users")
    raise ValueError("There are not enough seats")
else:
    print("There are equal number of seats and number of users")

There are 9233 less seats than users


ValueError: There are not enough seats

In [None]:
# Output directory, one level above the current working directory
data_dir = Path("../Data")
data_dir.mkdir(exist_ok=True)

# Subsequent relative paths resolve inside the output directory
os.chdir(data_dir)

# Each table paired with the file it is written to
# NOTE(review): to_csv is called with defaults, so the DataFrame index is written as
# an extra column; `output_dir` from the config is not used here — confirm both.
exports = [
    (customers, "customers.csv"),
    (accts, "accts.csv"),
    (churn_events, "churn_events.csv"),
    (feat_usage, "feat_usage.csv"),
    (subs, "subs.csv"),
    (sup_tix, "sup_tix.csv"),
]
for frame, file_name in exports:
    frame.to_csv(Path(".") / file_name)