### The purpose of this file is to clean all the .csv files in the raw_data folder (which have already been modified by the messy package) and write the results to the Dataset folder as the final dataset for analysis.

In [30]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import string
import re

# Project root on the author's machine; the input and output folders both sit directly under it.
base_dir = Path("C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis")

# Input folder: Kaggle data with data quality defects introduced by the messy package.
data_dir = base_dir.joinpath("raw_data")

# Output folder: the cleaned data is written here.
output_dir = base_dir.joinpath("Dataset")

def clean_cols(df, date_cols: str | list[str], punct_keep: str | list[str] | None=None) -> pd.DataFrame:

    # Exclude punctuation from being removed
    if punct_keep is not None:
        punctuation = "".join(set(string.punctuation) - set(punct_keep))   
    else:
        punctuation = string.punctuation

    # If only 1 column is passed
    if isinstance(date_cols, str):
        df[f"{date_cols}"] = df[f"{date_cols}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )

        return df[f"{date_cols}"]  
    
    # If multiple columns are passed
    elif isinstance(date_cols, list):
        for col in date_cols:

            df[f"{col}"] = df[f"{col}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )

            df[f"{col}"] = df[f"{col}"].str.strip()

        return df

    else:
        raise TypeError("date_cols is neither a string nor list of strings")
        return df
    
# os.getcwd()

In [31]:
# Cap DataFrame previews at 10 rows so cell output stays short.
pd.options.display.max_rows = 10

### Clean accounts.csv

In [32]:
# The CSVs are read by bare file name, so the raw-data folder has to be the
# working directory before the first read.
os.chdir(data_dir)

# Text columns: drop punctuation and surrounding whitespace first.
df = clean_cols(
    pd.read_csv("accounts.csv"),
    [
        "account_id",
        "account_name",
        "industry",
        "country",
        "referral_source",
        "plan_tier",
    ],
)

# clean_cols removed every punctuation character, so rebuild the ID as
# "<first char upper>-<rest lower>".
account_ids = df["account_id"]
df["account_id"] = account_ids.str[0].str.upper() + "-" + account_ids.str[1:].str.lower()

# Free-text labels are stored in Title Case.
for col in ("account_name", "industry"):
    df[col] = df[col].str.lower().str.title().str.strip()

# Fixed casing per column: country upper, referral source lower, plan tier Title Case.
df["country"] = df["country"].str.upper()
df["referral_source"] = df["referral_source"].str.lower()
df["plan_tier"] = df["plan_tier"].str.lower().str.title()

# Dates are only whitespace-trimmed here; they are not parsed into datetimes.
df["signup_date"] = df["signup_date"].str.strip()

# Manual check for signup_date values that are not YYYY-MM-DD:
# df[~df['signup_date'].str.match(r"^\d{4}-\d{2}-\d{2}$", na=True)]

# Nullable integer keeps missing seat counts as <NA>.
df["seats"] = df["seats"].astype("Int64")

# NOTE(review): astype('bool') turns missing values and any non-empty string
# into True - confirm these columns already hold clean booleans.
for col in ("is_trial", "churn_flag"):
    df[col] = df[col].astype("bool")

df.to_parquet(output_dir / "accounts.parquet")


### Clean churn_events

In [33]:
# Load the raw churn events (read relative to the current working directory).
df = pd.read_csv("churn_events.csv")

# Text columns: drop punctuation and surrounding whitespace first.
df = clean_cols(df, ["churn_event_id",
                     "account_id",
                     "reason_code",
                     "feedback_text"
                     ])

# clean_cols removed every punctuation character, so rebuild each ID as
# "<first char upper>-<rest lower>".
for id_col in ("churn_event_id", "account_id"):
    raw_ids = df[id_col]
    df[id_col] = raw_ids.str[0].str.upper() + "-" + raw_ids.str[1:].str.lower()

# Categorical / free text is stored lower-case.
for text_col in ("reason_code", "feedback_text"):
    df[text_col] = df[text_col].str.lower()

df['refund_amount_usd'] = df['refund_amount_usd'].astype('float')

# NOTE(review): astype('bool') turns missing values and any non-empty string
# into True - confirm these columns already hold clean booleans.
for flag_col in ("preceding_upgrade_flag", "preceding_downgrade_flag", "is_reactivation"):
    df[flag_col] = df[flag_col].astype('bool')

# Manual check for churn_date values that do not start with YYYY-MM-DD.
# Kept commented out like the equivalent checks in the other cells: as a bare
# statement in the middle of the cell its result was computed and discarded.
# df[~df['churn_date'].str.match(r"^\d{4}-\d{2}-\d{2}", na=True)]

df.to_parquet(output_dir / "churn_events.parquet")

### Clean feature_usage

In [34]:
# Load the raw feature-usage rows and strip punctuation / outer whitespace
# from the text columns.
df = clean_cols(
    pd.read_csv("feature_usage.csv"),
    ["usage_id", "subscription_id", "feature_name", "usage_pk"],
)

# clean_cols removed every punctuation character, so rebuild each ID as
# "<first char upper>-<rest lower>".
for id_col in ("usage_id", "subscription_id"):
    raw_ids = df[id_col]
    df[id_col] = raw_ids.str[0].str.upper() + "-" + raw_ids.str[1:].str.lower()

# Lower-case the feature name and put an underscore back after the leading
# "feature" prefix (the underscore was removed as punctuation above).
feature_names = df["feature_name"].str.lower().str.strip()
df["feature_name"] = feature_names.str.replace(r"^feature", "feature_", regex=True)

# Nullable integers keep missing counts as <NA>.
for count_col in ("usage_count", "usage_duration_secs", "error_count"):
    df[count_col] = df[count_col].astype("Int64")

# NOTE(review): astype('bool') turns missing values and any non-empty string
# into True - confirm this column already holds clean booleans.
df["is_beta_feature"] = df["is_beta_feature"].astype("bool")

# Manual check for usage_date values that do not start with YYYY-MM-DD:
# df[~df['usage_date'].str.match(r"^\d{4}-\d{2}-\d{2}", na=True)]

# Split usage_pk into the literal "id" prefix and a letters-only suffix, then
# rejoin with an underscore. Rows that do not match the pattern (including
# missing values, which astype(str) turns into text) become None.
pk_parts = df["usage_pk"].astype(str).str.strip().str.extract(r"^(id)([A-Za-z]+)$")
df["usage_pk"] = pk_parts.apply(
    lambda row: f"{row[0]}_{row[1]}" if row.notna().all() else None,
    axis=1,
)

df.to_parquet(output_dir / "feature_usage.parquet")

### Clean subscriptions

In [35]:
# Load the raw subscriptions and strip punctuation / outer whitespace from the
# text columns.
df = clean_cols(
    pd.read_csv("subscriptions.csv"),
    ["subscription_id", "account_id", "plan_tier", "billing_frequency"],
)

# clean_cols removed every punctuation character, so rebuild each ID as
# "<first char upper>-<rest lower>".
for id_col in ("subscription_id", "account_id"):
    raw_ids = df[id_col]
    df[id_col] = raw_ids.str[0].str.upper() + "-" + raw_ids.str[1:].str.lower()

# Category labels are stored in Title Case.
for label_col in ("plan_tier", "billing_frequency"):
    df[label_col] = df[label_col].str.title().str.strip()

# Nullable integers keep missing values as <NA>.
for int_col in ("seats", "mrr_amount", "arr_amount"):
    df[int_col] = df[int_col].astype("Int64")

# NOTE(review): astype('bool') turns missing values and any non-empty string
# into True - confirm these columns already hold clean booleans.
for flag_col in (
    "is_trial",
    "upgrade_flag",
    "downgrade_flag",
    "churn_flag",
    "auto_renew_flag",
):
    df[flag_col] = df[flag_col].astype("bool")

# Treat a textual "NA" (any casing, padded or not) as a real missing value.
end_dates = df["end_date"].str.upper().str.strip()
df["end_date"] = end_dates.replace("NA", np.nan)

# Manual checks for date values that do not start with YYYY-MM-DD:
# df[~df['start_date'].str.match(r"^\d{4}-\d{2}-\d{2}", na=True)]
# df[~df['end_date'].str.match(r"^\d{4}-\d{2}-\d{2}", na=True)]

df.to_parquet(output_dir / "subscriptions.parquet")

### Clean support_tickets

In [36]:
# Load the raw support tickets and strip punctuation / outer whitespace from
# the text columns.
df = clean_cols(
    pd.read_csv("support_tickets.csv"),
    ["ticket_id", "account_id", "priority"],
)

# clean_cols removed every punctuation character, so rebuild each ID as
# "<first char upper>-<rest lower>".
for id_col in ("ticket_id", "account_id"):
    raw_ids = df[id_col]
    df[id_col] = raw_ids.str[0].str.upper() + "-" + raw_ids.str[1:].str.lower()

df["priority"] = df["priority"].str.lower()

# Nullable integers keep missing durations as <NA>.
for duration_col in ("resolution_time_hours", "first_response_time_minutes"):
    df[duration_col] = df[duration_col].astype("Int64")

# The score is handled as text here: normalise it, map a textual "NA" to a
# real missing value, then convert to a nullable integer.
scores = df["satisfaction_score"].str.strip().str.upper()
df["satisfaction_score"] = scores.replace("NA", np.nan).astype("Int64")

# NOTE(review): astype('bool') turns missing values and any non-empty string
# into True - confirm this column already holds clean booleans.
df["escalation_flag"] = df["escalation_flag"].astype("bool")

# Manual checks for timestamps that do not start with YYYY-MM-DD:
# df[~df['submitted_at'].str.match(r"^\d{4}-\d{2}-\d{2}", na=True)]
# df[~df['closed_at'].str.match(r"^\d{4}-\d{2}-\d{2}", na=True)]
df.to_parquet(output_dir / "support_tickets.parquet")