The purpose of this file is to randomly assign each customer from the customer_churn_business_dataset.csv to an account_id from the ravenstack_accounts.csv.

Each account can be assigned at most as many customers as it has seats.
For example:

account_id = 1
seats = 3

Each of the 3 seats can be randomly filled by a customer_id from the customer table. Seats may stay unassigned when there are more seats than customers.

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import badgers # For generating realistic data quality issues from our Kaggle Dataset
from badgers.generators.tabular_data.missingness import MissingCompletelyAtRandom
from badgers.generators.tabular_data.outliers.distribution_sampling import *
from pathlib import Path
import yaml


# Seeded NumPy generator (seed = 3219087321) shared by every random step below
rng = np.random.default_rng(3219087321)

# Directory the notebook was started from; later cells build their paths on it
base_dir = Path.cwd()

# Switch into the sibling "MessGenerator" folder before importing the module
# (presumably so the import resolves from the working directory -- confirm)
os.chdir(base_dir.parent / "MessGenerator")
import MessGenerator 

In [2]:
def MessUpData(X, rng, exclude_cols: list[str], n_outliers):
    """Run X through every MessGenerator defect step and return the result.

    The steps are applied in a fixed order, each one receiving the output of
    the previous one: generate_dupes, generate_incon_format,
    generate_missing_vals, generate_outliers.

    Args:
        X: Table to degrade (a DataFrame read from CSV in this notebook).
        rng: Random generator handed to MessUpGenerator as random_generator.
        exclude_cols: Column names forwarded as exclude_cols to the
            inconsistent-formatting and missing-value steps.
        n_outliers: Forwarded as n_outliers to the outlier step.

    Returns:
        Whatever the final generate_outliers call returns.
    """
    messer = MessGenerator.MessUpGenerator(random_generator=rng)

    # Duplicate rows first, so the later steps also act on the duplicates
    with_dupes = messer.generate_dupes(X)

    # Inconsistent formatting, then missing values, both skipping exclude_cols
    reformatted = messer.generate_incon_format(with_dupes, exclude_cols=exclude_cols)
    with_gaps = messer.generate_missing_vals(reformatted, exclude_cols=exclude_cols)

    # Outliers go in last
    return messer.generate_outliers(with_gaps, n_outliers=n_outliers)

## Randomly assign acct_id to each customer_id

In [3]:
# Build one slot per seat, shuffle the slots, and give each customer one slot.
# Ex: acct_id = A, seat = 3 -> [A,A,A]; 3 'A' seats to assign

## Read the raw data files
raw_data_dir = base_dir / "raw_data"
os.chdir(raw_data_dir)
try:
    accts = pd.read_csv("accounts.csv")
except FileNotFoundError as err:
    # Keep ValueError as the surfaced type, but chain the original cause
    raise ValueError("Accounts csv not found") from err

try:
    customers = pd.read_csv("customers.csv")
except FileNotFoundError as err:
    raise ValueError("Users csv not found") from err


# One entry per seat: each account_id repeated `seats` times
rng_accts = np.repeat(accts['account_id'].to_numpy(), 
                      accts['seats'].to_numpy()
)

num_cust = len(customers)

# Every customer needs a seat. Without this guard a shortage only shows up as
# a pandas length-mismatch error on the column assignment below.
if len(rng_accts) < num_cust:
    raise ValueError(
        f"There are not enough seats: {len(rng_accts)} seats for {num_cust} customers"
    )

# Shuffle all the acct_ids in-place
rng.shuffle(rng_accts)

# If there are more seats than users, restrict the # of ids that will be assigned to be equal to num_cust
assign_ids = rng_accts[:num_cust]

# NOTE(review): account_id is only added to the in-memory `customers` frame.
# This cell does not write it to disk, and the export loop further down
# re-reads raw_data/customers.csv -- confirm where the assignment should be
# persisted.
customers["account_id"] = assign_ids

### Introduce Data Quality Defects 

In [4]:
# Read config.yaml from the notebook's base directory

os.chdir(base_dir)
config_path = Path("config.yaml")

# safe_load parses the YAML text without constructing arbitrary Python objects
cfg = yaml.safe_load(config_path.read_text(encoding="utf-8"))

# Per-dataset settings, looked up by CSV file stem in the export loop
data_cfg = cfg['datasets']

In [5]:
# Output folder comes from the config's 'Output_dir' key, defaulting to "Dataset"
output_dir = base_dir / cfg.get('Output_dir', "Dataset")
output_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Degrade every raw CSV with MessUpData and write it to the output folder.
os.chdir(raw_data_dir)

# Make sure the configured output folder exists before writing anything.
# (The previous `if not dataset_dir.exists:` never called the method, so its
# mkdir could never run, and it targeted a hard-coded "Dataset" folder rather
# than the configured output_dir.)
output_dir.mkdir(parents=True, exist_ok=True)

# Iterate in sorted order so the shared seeded rng is consumed in the same
# sequence on every machine; directory listing order is not guaranteed.
# Globbing from raw_data_dir yields full paths, so no chdir is needed per file.
for path in sorted(raw_data_dir.glob("*.csv")):

    # Load the Dataset
    # NOTE(review): this reads customers.csv from disk, so the account_id
    # column assigned to the in-memory `customers` frame earlier is not part
    # of the exported file -- confirm whether that is intended.
    dataset_name = path.stem
    df = pd.read_csv(path)
    params = data_cfg.get(dataset_name, {})

    # Treat a missing entry and an empty/null entry alike
    if not params:
        raise KeyError (f"Cannot find table name from datasets config for {dataset_name}")

    # Mess up the Dataset
    df = MessUpData(df, rng, exclude_cols=params['exclude_cols'], n_outliers=params['n_outliers'])

    # Save under the same file name in the output folder
    output_path = output_dir / f"{dataset_name}.csv"
    df.to_csv(output_path, index=False)

    

## Validation 

### Validate the input tables

In [7]:
## Validate the accts table

# Columns the accounts table must provide
req_cols = {'account_id', 'seats'}

# Set difference: required columns that are absent from the table
missing = req_cols.difference(set(accts.columns))

# A non-empty result means at least one required column is absent
if missing:
    raise ValueError(f"Accounts table missing: {missing}")

## Validate the users table
# Only customer_id is required from the customers table
if "customer_id" not in customers.columns:
    raise ValueError("Users table missing customer_id")

In [8]:
## Coerce the seats column to a numeric dtype; unparseable values raise here.
## This does not check for whole numbers -- that happens further down.
accts['seats'] = pd.to_numeric(accts['seats'], errors="raise")

# A missing seat count would pass the checks below and be silently dropped
# from the sum, so reject it explicitly
if accts['seats'].isna().any():
    missing_seats = accts.loc[accts['seats'].isna(), ['account_id', 'seats']]
    raise ValueError(f"{missing_seats} has missing seats")

# If any seats are negative, raise error and return df with seats and acct_id
if (accts['seats'] < 0).any(): 
    neg_seats = accts.loc[accts['seats'] < 0, ['account_id', 'seats']]
    raise ValueError(f"{neg_seats} has negative seats")

# Check for non-integer seats. The same mask drives both the test and the
# reported rows, so the error lists exactly the offending accounts.
if (accts['seats'] % 1 != 0).any():
    float_seats = accts.loc[accts['seats'] % 1 != 0, ['account_id', 'seats']]
    raise ValueError(f"{float_seats} has non-integer seats")

# Check to see if # seats > # users
num_seats = int(accts['seats'].sum())
num_cust = len(customers['customer_id'])

seat_diff = num_seats - num_cust

# Seats > num_cust is allowed since we can have unassigned seats
# Seats < num_cust is not allowed since each customer must have a company assigned
if num_seats > num_cust:
    print(f"There are {seat_diff} more seats than users")
elif num_seats < num_cust:
    print(f"There are {-seat_diff} less seats than users")
    raise ValueError("There are not enough seats")
else:
    print("There are equal number of seats and number of users")

There are 280 more seats than users
