In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import string
import re

# Project root. Defaults to the original author's local checkout; set the
# REVENUE_ANALYSIS_BASE_DIR environment variable to run the notebook from a
# different machine or folder without editing this cell.
base_dir = Path(
    os.environ.get(
        "REVENUE_ANALYSIS_BASE_DIR",
        "C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis",
    )
)
data_dir = base_dir / "raw_data"  # raw Kaggle data with data-quality defects introduced by the messy package
output_dir = base_dir / "Dataset"  # destination for the cleaned data

In [None]:
def clean_str_cols(df, exclude=None):
    """Return a copy of ``df`` with its object-dtype columns normalised.

    Every object-dtype column that is not listed in ``exclude`` has all
    ``string.punctuation`` characters removed, is title-cased, and is then
    stripped of leading/trailing whitespace. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    exclude : str | list[str] | None, default None
        Column label(s) to leave untouched. ``None`` or an empty list means
        every object-dtype column is cleaned.

    Returns
    -------
    pandas.DataFrame
        The non-excluded columns in their original order, followed by the
        excluded columns in the order given in ``exclude``.

    Raises
    ------
    KeyError
        If a label in ``exclude`` is not a column of ``df``.
    """
    if exclude is None:
        excluded = []
    elif isinstance(exclude, str):
        excluded = [exclude]
    else:
        excluded = list(exclude)

    kept = [label for label in df.columns if label not in excluded]

    # Select-and-copy instead of drop + join: a join re-aligns on the index
    # and would multiply rows whenever the index holds duplicate labels.
    result = df[kept + excluded].copy()

    punct_pattern = f"[{re.escape(string.punctuation)}]"

    for column in result[kept].select_dtypes(include=["object"]).columns:
        # Index with the label itself (not its string form) so non-string
        # column labels are handled too.
        result[column] = (
            result[column]
            .str.replace(punct_pattern, "", regex=True)  # remove punctuation
            .str.title()                                 # standardize capitalization
            .str.strip()                                 # trim surrounding whitespace
        )

    return result

def clean_date_cols(df, date_cols: str | list[str], punct_keep: str | list[str] | None=None) -> pd.DataFrame:

    # Exclude punctuation from being removed
    if punct_keep is not None:
        punctuation = "".join(set(string.punctuation) - set(punct_keep))   
    else:
        punctuation = string.punctuation

    # If only 1 column is passed
    if isinstance(date_cols, str):
        df[f"{date_cols}"] = df[f"{date_cols}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )
    
    # If multiple columns are passed
    elif isinstance(date_cols, list):
        for col in date_cols:

            df[f"{col}"] = df[f"{col}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )

            df[f"{col}"] = df[f"{col}"].str.strip()

    else:
        raise TypeError("date_cols is neither a string nor list of strings")

    return df

In [None]:
# Lift the row limit so displayed frames/series are never truncated.
pd.options.display.max_rows = None

### Clean accounts.csv

In [None]:
# os.chdir(data_dir) # Ensure we are in the raw_data dir

# df = pd.read_csv("accounts.csv") 

# df = clean_str_cols(df, exclude=["created_at"]) # Clean str columns

# # Change floating cols to nullable integers (Int64)
# df['account_id'] = df['account_id'].astype('Int64')
# df['mrr'] = df['mrr'].astype('Int64')

# # Save the cleaned accounts to Dataset dir
# os.chdir(output_dir)
# df.to_parquet(output_dir / "accounts.parquet", index=False)

# # Switch back to raw_data for the next table cleaning
# os.chdir(data_dir)

### Clean customers.csv

In [None]:
# df = pd.read_csv("customers.csv")

# df['customer_id'] = df['customer_id'].astype('Int64')

# df['signup_date'] = df['signup_date'].str.strip()

# df['monthly_fee'] = df['monthly_fee'].astype('Int64')

# df['acquisition_cost'] = df['acquisition_cost'].astype('Int64')

# df['churn_date'] = df['churn_date'].str.strip()
# df['churn_date'] = df['churn_date'].str.lower()

# df.to_parquet(output_dir / "customers.parquet", index=False)

### Clean events.csv

In [None]:
# os.chdir(data_dir)
# df = pd.read_csv("events.csv")

# df['event_id'] = df['event_id'].astype('Int64')
# df['user_id'] = df['user_id'].astype('Int64')


# df = df.drop(columns='...6')

# df = clean_str_cols(df, exclude=['occurred_at'])

# df['occurred_at'] = df['occurred_at'].str.strip()

# df.to_parquet(output_dir / "events.parquet", index=False)

In [None]:
# Print the file name of every CSV found directly inside data_dir.
for csv_path in data_dir.glob("*.csv"):
    print(csv_path.name)

### Clean experiments.csv

In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("experiments.csv")

# # df.head(5)

# # df.dtypes

# df['experiment_id'] = df['experiment_id'].astype("Int64")

# df = clean_str_cols(df, exclude=['start_date', 'end_date'])

# df = clean_date_cols(df, date_cols=['start_date', 'end_date'])

# df.to_parquet(output_dir / "experiments.parquet")


### Clean googleplaystore_user_reviews.csv


In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("googleplaystore_user_reviews.csv")

In [None]:
# # pd.options.display.float_format = "{:.2f}".format

# # df.head()

# # df.dtypes

# df = clean_str_cols(df)

# df['Sentiment'] = (df['Sentiment'].str.lower()).astype('str')

# df['Sentiment_Polarity'] = df['Sentiment_Polarity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [None]:
# df['Sentiment_Subjectivity'] = df['Sentiment_Subjectivity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [None]:
# df = df.drop(columns=['App', 'Translated_Review'])

# df.to_parquet(output_dir / "googleplaystore_user_reviews.parquet")

### Clean revenue.csv

In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("revenue.csv")

# df['customer_id'] = df['customer_id'].astype("Int64")
# df['monthly_fee'] = df['monthly_fee'].astype("Int64")

# df = clean_str_cols(df, exclude=['subscription_id', 'month'])

# df['revenue_type'] = df['revenue_type'].str.lower()

# # Format Month
# df = clean_date_cols(df, ['month'], "-")
# df['month'] = df['month'].replace(r"--+", "-", regex=True).str.lower()



# # Format subscription_id
# df = clean_date_cols(df, 'subscription_id')

# df['subscription_id'] = df['subscription_id'].str.extract(r"(S)(\d{4})(\d{6})") \
#                     .apply(lambda x: f"{x[0]}-{x[1]}-{x[2]}" if x.notna().all() else None, axis=1)

# df.to_parquet(output_dir / "revenue.parquet")
    

### Clean subscriptions.csv

In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("subscriptions.csv")

# df.head()

# df['subscription_id'] = (df['subscription_id'].str.upper()).str.replace(r"--+", "", regex=True)

# df['subscription_id'] = df['subscription_id'].str.extract(r"(S-\d{4}).*(\d{6})") \
#                     .apply(lambda x: f"{x[0]}-{x[1]}" if x.notna().all() else None, axis=1)

# df['customer_id'] = df['customer_id'].astype("Int64")

# df = clean_date_cols(df, 'month')

# df['month'] = df['month'].str.extract(r"(\d{4})(\d{2})") \
#                 .apply(lambda x: f"{x[0]}-{x[1]}" if x.notna().all() else None,axis=1)

# df['monthly_fee'] = df['monthly_fee'].astype("Int64")

# df.to_parquet(output_dir / "subscriptions.parquet")


### Clean users.csv

In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("users.csv")

# df['user_id'] = df['user_id'].astype("Int64")

# df['account_id'] = df['account_id'].astype("Int64")

# # Keep email messy. It's a bigger problem, but not necessary to fix currently

# # xd = df.copy()

# # xd['email'] = (df['email'].str.lower()).str.strip()
# # xd = clean_date_cols(xd, 'email', '.')

# # s = xd['email'].dropna()
# # s = s.str.replace(".",'', regex=False)

# # acct = s.str.extract(r'([a-z]+)(acct\d+io)')

# df = clean_date_cols(df, ['role', 'country'])

# df['role'] = df['role'].str.lower()
# df['country'] = df['country'].str.upper()

# df['is_active'] = df['country'].astype('bool')

# df.to_parquet(output_dir / "users.parquet")

### Clean variants.csv

In [111]:
# List the CSV file names in data_dir, one per line.
csv_names = (entry.name for entry in data_dir.glob("*.csv"))
for csv_name in csv_names:
    print(csv_name)

accounts.csv
customers.csv
events.csv
experiments.csv
googleplaystore_user_reviews.csv
revenue.csv
subscriptions.csv
users.csv
variants.csv
variant_exposures.csv


In [None]:
# os.chdir(data_dir)

# df = pd.read_csv('variants.csv')

# df = clean_date_cols(df, ['variant_key', 'name', 'description', 'creative_type'])

# df['name'] = df['name'].str.title()
# df['variant_key'] = df['variant_key'].str.lower()
# df['creative_type'] = df['creative_type'].str.lower()

# df['variant_id'] = df['variant_id'].astype('Int64') 

# df['is_control'] = df['is_control'].astype('bool')

# df.to_parquet(output_dir / 'variants.parquet')

### Clean variant_exposures.csv

In [130]:
# Make the relative CSV path below resolve against data_dir.
os.chdir(data_dir)

# Load the exposures table, cast the id/count columns to the nullable Int64
# dtype in one pass, then discard the '...9' column.
df = (
    pd.read_csv('variant_exposures.csv')
    .astype(
        dict.fromkeys(
            ['exposure_id', 'experiment_id', 'variant_id', 'user_id', 'clicks', 'conversions'],
            'Int64',
        )
    )
    .drop(columns='...9')
)

# Persist the cleaned table alongside the other cleaned datasets.
df.to_parquet(output_dir / 'variant_exposures.parquet')