In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import string
import re

# Project root. Defaults to the original local checkout; set the
# REVENUE_ANALYSIS_BASE_DIR environment variable to run the notebook from a
# different machine or folder without editing this cell.
base_dir = Path(
    os.environ.get(
        "REVENUE_ANALYSIS_BASE_DIR",
        "C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis",
    )
)
data_dir = base_dir / "raw_data" # raw_data contains kaggle data with data quality defects from messy package
output_dir = base_dir / "Dataset" # Dataset contains cleaned data

In [2]:
def clean_str_cols(df, exclude=None):
    """
    Standardize the object-dtype (text) columns of ``df``, skipping ``exclude``.

    Each cleaned column has ASCII punctuation (``string.punctuation``) removed,
    is title-cased, and then has leading/trailing whitespace stripped, in that
    order. Missing values stay missing.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. The caller's frame is not modified.
    exclude : str | list[str] | None
        Column name(s) to pass through untouched. ``None`` (the default)
        means no column is excluded.

    Returns
    -------
    pd.DataFrame
        New frame with the non-excluded columns first (in their original
        order) followed by the excluded columns in the order given.
    """
    # None instead of a mutable [] default; a bare string is still accepted.
    if exclude is None:
        exclude = []
    elif not isinstance(exclude, str):
        exclude = list(exclude)

    untouched = df[exclude].copy()
    cleaned = df.drop(columns=exclude)

    # Compile-once pattern: a character class of every ASCII punctuation mark.
    punct_pattern = f"[{re.escape(string.punctuation)}]"

    for column in cleaned.select_dtypes(include=["object"]).columns:
        # Index with the label itself (not its string form) so non-string
        # column labels are addressed correctly.
        cleaned[column] = (
            cleaned[column]
            .str.replace(punct_pattern, "", regex=True)  # Remove punctuation
            .str.title()                                 # Standardize capitalization
            .str.strip()                                 # Remove leading/trailing spaces
        )

    # Re-attach the excluded columns (index-aligned) at the end of the frame.
    return cleaned.join(untouched)

def clean_date_cols(df, date_cols: str | list[str], punct_keep: str | list[str] | None=None) -> pd.DataFrame:

    # Exclude punctuation from being removed
    if punct_keep is not None:
        punctuation = "".join(set(string.punctuation) - set(punct_keep))   
    else:
        punctuation = string.punctuation

    # If only 1 column is passed
    if isinstance(date_cols, str):
        df[f"{date_cols}"] = df[f"{date_cols}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )
    
    # If multiple columns are passed
    elif isinstance(date_cols, list):
        for col in date_cols:

            df[f"{col}"] = df[f"{col}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )

            df[f"{col}"] = df[f"{col}"].str.strip()

    else:
        raise TypeError("date_cols is neither a string nor list of strings")

    return df

In [3]:
# Lift pandas' row-display limit so displayed frames are not truncated.
# NOTE(review): displaying a large frame will now print every row.
pd.set_option("display.max_rows", None)

### Clean Accounts.csv

In [4]:
# os.chdir(data_dir) # Ensure we are in the raw_data dir

# df = pd.read_csv("accounts.csv") 

# df = clean_str_cols(df, exclude=["created_at"]) # Clean str columns

# # Change floating cols to nullable integers (Int64)
# df['account_id'] = df['account_id'].astype('Int64')
# df['mrr'] = df['mrr'].astype('Int64')

# # Save the cleaned accounts to Dataset dir
# os.chdir(output_dir)
# df.to_parquet(output_dir / "accounts.parquet", index=False)

# # Switch back to raw_data for the next table cleaning
# os.chdir(data_dir)

### Clean customers.csv

In [5]:
# df = pd.read_csv("customers.csv")

# df['customer_id'] = df['customer_id'].astype('Int64')

# df['signup_date'] = df['signup_date'].str.strip()

# df['monthly_fee'] = df['monthly_fee'].astype('Int64')

# df['acquisition_cost'] = df['acquisition_cost'].astype('Int64')

# df['churn_date'] = df['churn_date'].str.strip()
# df['churn_date'] = df['churn_date'].str.lower()

# df.to_parquet(output_dir / "customers.parquet", index=False)

### Clean events.csv

In [6]:
# Read events.csv through an explicit path instead of os.chdir + a relative
# name, so this cell neither depends on nor changes the working directory.
events = (
    pd.read_csv(data_dir / "events.csv")
    # Nullable integers keep ids integral even when some values are missing.
    .astype({"event_id": "Int64", "user_id": "Int64"})
    # '...6' is dropped outright; presumably an unnamed junk column in the
    # raw file -- TODO confirm against events.csv.
    .drop(columns="...6")
    # Standardize the text columns; the timestamp is excluded so its
    # punctuation (date/time separators) survives.
    .pipe(clean_str_cols, exclude=["occurred_at"])
    # The timestamp only needs surrounding whitespace removed here.
    .assign(occurred_at=lambda frame: frame["occurred_at"].str.strip())
)



### Clean experiments.csv

In [7]:
# os.chdir(data_dir)

# df = pd.read_csv("experiments.csv")

# # df.head(5)

# # df.dtypes

# df['experiment_id'] = df['experiment_id'].astype("Int64")

# df = clean_str_cols(df, exclude=['start_date', 'end_date'])

# df = clean_date_cols(df, date_cols=['start_date', 'end_date'])

# df.to_parquet(output_dir / "experiments.parquet")


### Clean googleplaystore_user_reviews.csv


In [8]:
# os.chdir(data_dir)

# df = pd.read_csv("googleplaystore_user_reviews.csv")

In [9]:
# # pd.options.display.float_format = "{:.2f}".format

# # df.head()

# # df.dtypes

# df = clean_str_cols(df)

# df['Sentiment'] = (df['Sentiment'].str.lower()).astype('str')

# df['Sentiment_Polarity'] = df['Sentiment_Polarity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [10]:
# df['Sentiment_Subjectivity'] = df['Sentiment_Subjectivity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [11]:
# df = df.drop(columns=['App', 'Translated_Review'])

# df.to_parquet(output_dir / "googleplaystore_user_reviews.parquet")

### Clean revenue.csv

In [12]:
# Read revenue.csv through an explicit path instead of os.chdir + a relative
# name, so this cell neither depends on nor changes the working directory.
revenue = pd.read_csv(data_dir / "revenue.csv")

# Nullable integers so missing values do not force a float dtype.
revenue['customer_id'] = revenue['customer_id'].astype("Int64")
revenue['monthly_fee'] = revenue['monthly_fee'].astype("Int64")

# Standardize the text columns; the id and the month are formatted separately below.
revenue = clean_str_cols(revenue, exclude=['subscription_id', 'month'])

revenue['revenue_type'] = revenue['revenue_type'].str.lower()

# Format subscription_id as S-<4 digits>-<6 digits>.
# All punctuation is removed first, then the three parts are re-joined with "-".
# Upper-casing first lets a lower-case "s" prefix match the literal "S", in line
# with how the subscriptions table is cleaned.
revenue = clean_date_cols(revenue, 'subscription_id')
sub_id_parts = revenue['subscription_id'].str.upper().str.extract(r"(S)(\d{4})(\d{6})")
# str.cat yields a missing value whenever any part is missing, i.e. for ids
# that did not match the pattern.
revenue['subscription_id'] = sub_id_parts[0].str.cat(
    [sub_id_parts[1], sub_id_parts[2]], sep="-"
)

# Format month as <4 digits>-<2 digits>.
# Every punctuation mark (hyphens included) is removed, then the first run of
# four digits followed by two digits is re-joined with a single "-".
revenue = clean_date_cols(revenue, ['month'])
month_parts = revenue['month'].str.extract(r"(\d{4})(\d{2})")
revenue['month'] = month_parts[0].str.cat(month_parts[1], sep="-")


### Clean subscriptions.csv

In [13]:
# Read subscriptions.csv through an explicit path instead of os.chdir + a
# relative name, so this cell neither depends on nor changes the working directory.
subscriptions = pd.read_csv(data_dir / "subscriptions.csv")

# Format subscription_id as S-<4 digits>-<6 digits>:
# upper-case, delete runs of two or more hyphens, then keep the "S-dddd"
# prefix and the last six consecutive digits.
sub_id_parts = (
    subscriptions['subscription_id']
    .str.upper()
    .str.replace(r"--+", "", regex=True)
    .str.extract(r"(S-\d{4}).*(\d{6})")
)
# str.cat yields a missing value for ids that did not match the pattern.
subscriptions['subscription_id'] = sub_id_parts[0].str.cat(sub_id_parts[1], sep="-")

# Nullable integers so missing values do not force a float dtype.
subscriptions['customer_id'] = subscriptions['customer_id'].astype("Int64")

# Format month as <4 digits>-<2 digits>: remove all punctuation, then re-join
# the first run of four digits followed by two digits with "-".
subscriptions = clean_date_cols(subscriptions, 'month')
month_parts = subscriptions['month'].str.extract(r"(\d{4})(\d{2})")
subscriptions['month'] = month_parts[0].str.cat(month_parts[1], sep="-")

subscriptions['monthly_fee'] = subscriptions['monthly_fee'].astype("Int64")




### Clean users.csv

In [14]:
# os.chdir(data_dir)

# df = pd.read_csv("users.csv")

# df['user_id'] = df['user_id'].astype("Int64")

# df['account_id'] = df['account_id'].astype("Int64")

# # Keep email messy. It's a bigger problem, but not necessary to fix currently

# # xd = df.copy()

# # xd['email'] = (df['email'].str.lower()).str.strip()
# # xd = clean_date_cols(xd, 'email', '.')

# # s = xd['email'].dropna()
# # s = s.str.replace(".",'', regex=False)

# # acct = s.str.extract(r'([a-z]+)(acct\d+io)')

# df = clean_date_cols(df, ['role', 'country'])

# df['role'] = df['role'].str.lower()
# df['country'] = df['country'].str.upper()

# df['is_active'] = df['is_active'].astype('bool')

# df.to_parquet(output_dir / "users.parquet")

### Clean variants.csv

In [15]:
# os.chdir(data_dir)

# df = pd.read_csv('variants.csv')

# df = clean_date_cols(df, ['variant_key', 'name', 'description', 'creative_type'])

# df['name'] = df['name'].str.title()
# df['variant_key'] = df['variant_key'].str.lower()
# df['creative_type'] = df['creative_type'].str.lower()

# df['variant_id'] = df['variant_id'].astype('Int64') 

# df['is_control'] = df['is_control'].astype('bool')

# df.to_parquet(output_dir / 'variants.parquet')

### Clean variant_exposures.csv

In [16]:
# os.chdir(data_dir)

# df = pd.read_csv('variant_exposures.csv')

# # df.head()

# df['exposure_id'] = df['exposure_id'].astype('Int64')
# df['experiment_id'] = df['experiment_id'].astype('Int64')
# df['variant_id'] = df['variant_id'].astype('Int64')
# df['user_id'] = df['user_id'].astype('Int64')
# df['clicks'] = df['clicks'].astype('Int64')
# df['conversions'] = df['conversions'].astype('Int64')

# df = df.drop(columns='...9')

# df.to_parquet(output_dir / 'variant_exposures.parquet')

### Change dates
To make the data look current, we randomly reassign about 5% of the rows in each table to November 2025 (2025-11).

In [17]:
def force_some_rows_to_nov_2025(
    df: pd.DataFrame,
    date_col: str,
    pct: float = 0.05,
    seed: int = 42
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a random subset of ``date_col`` moved to November 2025.

    ``date_col`` is first parsed with ``pd.to_datetime(..., errors="coerce")``,
    so unparseable values become NaT. Each row is then picked independently
    with probability ``pct`` using a generator seeded with ``seed``. Picked
    rows keep their day of month (capped at 30) and their hour, minute and
    second; only year and month are replaced. The input frame is not modified.
    """
    result = df.copy()
    result[date_col] = pd.to_datetime(result[date_col], errors="coerce")

    # One uniform draw per row; rows whose draw falls below pct are picked.
    picked = np.random.default_rng(seed).random(len(result)) < pct
    chosen = result.loc[picked, date_col]

    def _whole(component):
        # Missing time components default to 0 so they can be cast to int.
        return component.fillna(0).astype(int)

    # Rebuild each picked timestamp from its parts with year/month overridden.
    # The day is left un-filled, so a picked NaT row has no day to assemble from.
    replacement_parts = {
        "year": 2025,
        "month": 11,
        "day": chosen.dt.day.clip(upper=30),
        "hour": _whole(chosen.dt.hour),
        "minute": _whole(chosen.dt.minute),
        "second": _whole(chosen.dt.second),
    }
    result.loc[picked, date_col] = pd.to_datetime(replacement_parts, errors="coerce")

    return result

# Apply to your tables (a different seed per table so each gets its own random subset)
revenue = force_some_rows_to_nov_2025(
    revenue,
    date_col="month",
    pct=0.05,
    seed=1
)

events = force_some_rows_to_nov_2025(
    events,
    date_col="occurred_at",
    pct=0.05,
    seed=2
)

subscriptions = force_some_rows_to_nov_2025(
    subscriptions,
    date_col="month",
    pct=0.05,
    seed=3
)

# Save to parquet
# Create the output folder if it is missing so the writes cannot fail on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)
# index=False on every table for consistency: the frames carry only the default
# positional index from read_csv, so there is nothing worth storing.
revenue.to_parquet(output_dir / "revenue.parquet", index=False)
events.to_parquet(output_dir / "events.parquet", index=False)
subscriptions.to_parquet(output_dir / "subscriptions.parquet", index=False)


# Verify min/max dates

print("revenue:", revenue["month"].min(), revenue["month"].max())
print("events:", events["occurred_at"].min(), events["occurred_at"].max())
print("subscriptions:", subscriptions["month"].min(), subscriptions["month"].max())


revenue: 2024-01-01 00:00:00 2025-11-01 00:00:00
events: 2025-05-20 18:51:40 2025-11-30 06:46:56
subscriptions: 2024-01-01 00:00:00 2025-11-01 00:00:00
