In [15]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import string
import re

# Project root. Defaults to the author's local checkout; set the
# REVENUE_ANALYSIS_DIR environment variable to the absolute path of your own
# checkout to run the notebook on another machine.
base_dir = Path(
    os.environ.get(
        "REVENUE_ANALYSIS_DIR",
        "C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis",
    )
)
data_dir = base_dir / "raw_data"  # Kaggle data with data-quality defects from the messy package
output_dir = base_dir / "Dataset"  # cleaned data is written here

In [16]:
def clean_str_cols(df, exclude=None):
    """Standardize the text columns of ``df`` and return the result as a new frame.

    Every object-dtype column that is not listed in ``exclude`` has all ASCII
    punctuation (``string.punctuation``) removed, is title-cased, and is
    stripped of leading/trailing whitespace. The frame passed in is not
    modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean.
    exclude : str | list | None, default None
        Column label(s) to leave untouched. ``None`` means no column is
        excluded.

    Returns
    -------
    pd.DataFrame
        The cleaned columns followed by the excluded columns; excluded
        columns are re-attached with ``join`` and therefore end up last.

    Raises
    ------
    KeyError
        If a label in ``exclude`` is not a column of ``df``.
    """
    # Normalize `exclude` to a fresh list; a mutable default argument would be
    # shared between calls.
    if exclude is None:
        exclude = []
    elif isinstance(exclude, str):
        exclude = [exclude]
    else:
        exclude = list(exclude)

    held_out = df[exclude].copy()
    cleaned = df.drop(columns=exclude)

    # The pattern is identical for every column, so build it once.
    punct_pattern = f"[{re.escape(string.punctuation)}]"

    for column in cleaned.select_dtypes(include=["object"]).columns:
        # Index with the label itself: formatting it into a string breaks
        # lookups for non-string labels (e.g. integer column names).
        cleaned[column] = (
            cleaned[column]
            .str.replace(punct_pattern, "", regex=True)  # remove punctuation
            .str.title()                                 # standardize capitalization
            .str.strip()                                 # remove leading/trailing spaces
        )

    return cleaned.join(held_out)

def clean_date_cols(df, date_cols: str | list[str], punct_keep: str | list[str] | None=None) -> pd.DataFrame:

    # Exclude punctuation from being removed
    if punct_keep is not None:
        punctuation = "".join(set(string.punctuation) - set(punct_keep))   
    else:
        punctuation = string.punctuation

    # If only 1 column is passed
    if isinstance(date_cols, str):
        df[f"{date_cols}"] = df[f"{date_cols}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )
    
    # If multiple columns are passed
    elif isinstance(date_cols, list):
        for col in date_cols:

            df[f"{col}"] = df[f"{col}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )

            df[f"{col}"] = df[f"{col}"].str.strip()

    else:
        raise TypeError("date_cols is neither a string nor list of strings")

    return df

In [17]:
# Disable row truncation: displayed DataFrames show every row.
pd.options.display.max_rows = None

### Clean accounts.csv

In [18]:
# os.chdir(data_dir) # Ensure we are in the raw_data dir

# df = pd.read_csv("accounts.csv") 

# df = clean_str_cols(df, exclude=["created_at"]) # Clean str columns

# # Cast float columns to nullable Int64
# df['account_id'] = df['account_id'].astype('Int64')
# df['mrr'] = df['mrr'].astype('Int64')

# # Save the cleaned accounts to Dataset dir
# os.chdir(output_dir)
# df.to_parquet(output_dir / "accounts.parquet", index=False)

# # Switch back to raw_data for the next table cleaning
# os.chdir(data_dir)

### Clean customers.csv

In [19]:
# df = pd.read_csv("customers.csv")

# df['customer_id'] = df['customer_id'].astype('Int64')

# df['signup_date'] = df['signup_date'].str.strip()

# df['monthly_fee'] = df['monthly_fee'].astype('Int64')

# df['acquisition_cost'] = df['acquisition_cost'].astype('Int64')

# df['churn_date'] = df['churn_date'].str.strip()
# df['churn_date'] = df['churn_date'].str.lower()

# df.to_parquet(output_dir / "customers.parquet", index=False)

### Clean events.csv

In [20]:
# os.chdir(data_dir)
# df = pd.read_csv("events.csv")

# df['event_id'] = df['event_id'].astype('Int64')
# df['user_id'] = df['user_id'].astype('Int64')


# df = df.drop(columns='...6')

# df = clean_str_cols(df, exclude=['occurred_at'])

# df['occurred_at'] = df['occurred_at'].str.strip()

# df.to_parquet(output_dir / "events.parquet", index=False)

In [21]:
# List the CSV files found in the raw-data directory, one name per line.
csv_names = (path.name for path in data_dir.glob("*.csv"))
for name in csv_names:
    print(name)

accounts.csv
customers.csv
events.csv
experiments.csv
googleplaystore_user_reviews.csv
revenue.csv
subscriptions.csv
users.csv
variants.csv
variant_exposures.csv


### Clean experiments.csv

In [22]:
# os.chdir(data_dir)

# df = pd.read_csv("experiments.csv")

# # df.head(5)

# # df.dtypes

# df['experiment_id'] = df['experiment_id'].astype("Int64")

# df = clean_str_cols(df, exclude=['start_date', 'end_date'])

# df = clean_date_cols(df, date_cols=['start_date', 'end_date'])

# df.to_parquet(output_dir / "experiments.parquet")


### Clean googleplaystore_user_reviews.csv


In [23]:
# os.chdir(data_dir)

# df = pd.read_csv("googleplaystore_user_reviews.csv")

In [24]:
# # pd.options.display.float_format = "{:.2f}".format

# # df.head()

# # df.dtypes

# df = clean_str_cols(df)

# df['Sentiment'] = (df['Sentiment'].str.lower()).astype('str')

# df['Sentiment_Polarity'] = df['Sentiment_Polarity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [25]:
# df['Sentiment_Subjectivity'] = df['Sentiment_Subjectivity'] \
# .astype("str") \
# .where(df['Sentiment_Subjectivity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [26]:
# df = df.drop(columns=['App', 'Translated_Review'])

# df.to_parquet(output_dir / "googleplaystore_user_reviews.parquet")

### Clean revenue.csv

In [27]:
# os.chdir(data_dir)

# df = pd.read_csv("revenue.csv")

# df['customer_id'] = df['customer_id'].astype("Int64")
# df['monthly_fee'] = df['monthly_fee'].astype("Int64")

# df = clean_str_cols(df, exclude=['subscription_id', 'month'])

# df['revenue_type'] = df['revenue_type'].str.lower()

# # Format Month
# df = clean_date_cols(df, ['month'], "-")
# df['month'] = df['month'].replace(r"--+", "-", regex=True).str.lower()



# # Format subscription_id
# df = clean_date_cols(df, 'subscription_id')

# df['subscription_id'] = df['subscription_id'].str.extract(r"(S)(\d{4})(\d{6})") \
#                     .apply(lambda x: f"{x[0]}-{x[1]}-{x[2]}" if x.notna().all() else None, axis=1)

# df.to_parquet(output_dir / "revenue.parquet")
    

### Clean subscriptions.csv

In [28]:
# os.chdir(data_dir)

# df = pd.read_csv("subscriptions.csv")

# df.head()

# df['subscription_id'] = (df['subscription_id'].str.upper()).str.replace(r"--+", "", regex=True)

# df['subscription_id'] = df['subscription_id'].str.extract(r"(S-\d{4}).*(\d{6})") \
#                     .apply(lambda x: f"{x[0]}-{x[1]}" if x.notna().all() else None, axis=1)

# df['customer_id'] = df['customer_id'].astype("Int64")

# df = clean_date_cols(df, 'month')

# df['month'] = df['month'].str.extract(r"(\d{4})(\d{2})") \
#                 .apply(lambda x: f"{x[0]}-{x[1]}" if x.notna().all() else None,axis=1)

# df['monthly_fee'] = df['monthly_fee'].astype("Int64")

# df.to_parquet(output_dir / "subscriptions.parquet")


### Clean users.csv

In [29]:
# Kept so that any cell relying on relative paths still resolves against raw_data.
os.chdir(data_dir)

# Read through an explicit path so loading does not depend on the current
# working directory (i.e. on which cells happened to run before this one).
df = pd.read_csv(data_dir / "users.csv")


In [30]:
# Cast both ID columns to pandas' nullable integer dtype so missing IDs
# stay missing instead of forcing the column to float.
for id_col in ("user_id", "account_id"):
    df[id_col] = df[id_col].astype("Int64")

In [31]:
# Inspect the column dtypes (user_id and account_id should now be Int64).
df.dtypes

user_id        Int64
account_id     Int64
email         object
role          object
country       object
created_at    object
is_active     object
dtype: object

In [32]:
df.head()  # NOTE: not the cell's last expression, so this preview is never rendered

# Remove all punctuation from the email column except "@" and ".".
# clean_date_cols assigns into the frame it is given and returns it, so `xd`
# and `df` refer to the same object after this call.
xd = clean_date_cols(df, date_cols="email", punct_keep=["@", "."])

In [51]:
# Split each email into four capture groups: two dot-separated alphabetic
# parts before the "@", then the domain, then the final suffix. Rows whose
# email is missing or does not match get NaN in all four columns.
# The classes are [a-zA-Z]: the previous "A-z" range also matched the
# characters [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
xd[['one','two','three','four']] = xd['email'].str.extract(
    r"^([a-zA-Z]+)\.([a-zA-Z]+)\.?@(.+)\.([a-zA-Z]+)$"
)

In [52]:
# Preview the first rows only; with display.max_rows set to None a bare `xd`
# would render every row of the frame.
xd.head(10)

Unnamed: 0,user_id,account_id,email,role,country,created_at,is_active,one,two,three,four
0,1.0,1.0,alex.white@acct1.io,member,NL,,True,alex,white,acct1,io
1,2.0,2.0,skyler.wilson@acct2.io,,.MX,2025-11-26 14:43:54,True,skyler,wilson,acct2,io
2,3.0,3.0,peyton.miller@acct3.io,member,AR,2025-12-12 10:54:10,True,peyton,miller,acct3,io
3,4.0,4.0,jamie.moore@acc@t4.io,membe(r,CO,2025-12-11 13:54:30,True,jamie,moore,acc@t4,io
4,5.0,4.0,marley.walker@acct4.io,mem&ber,CA,2025-12-21 01:28:11,True,marley,walker,acct4,io
5,6.0,4.0,casey.perry@acct4.io,m@ember,J@P,2025-11-30 05:39:39,True,casey,perry,acct4,io
6,7.0,4.0,marley.perez@acct4.io,mem(be@r,U+S,2025-11-16 00:09:46,,marley,perez,acct4,io
7,,5.0,alex.harris@acct5.io,a#dmin,FR,2025-12-07 05:15:41,True,alex,harris,acct5,io
8,9.0,5.0,,,FI,2025-12-26 18:36:39,True,,,,
9,10.0,5.0,,&membe(r,US,2025-12-15 00:31:30,True,,,,


In [34]:
# Number of rows in the users frame.
len(df)

708

In [35]:
# Print the name of every CSV file in data_dir, one per line
# (same listing as the In [21] cell above).
csv_paths = data_dir.glob("*.csv")
print("".join(f"{csv_path.name}\n" for csv_path in csv_paths), end="")

accounts.csv
customers.csv
events.csv
experiments.csv
googleplaystore_user_reviews.csv
revenue.csv
subscriptions.csv
users.csv
variants.csv
variant_exposures.csv
