In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import string
import re

# Project root. Defaults to the original author's local checkout; set the
# REVENUE_ANALYSIS_DIR environment variable to run the notebook from another
# machine without editing this cell.
base_dir = Path(
    os.environ.get(
        "REVENUE_ANALYSIS_DIR",
        "C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis",
    )
)
data_dir = base_dir / "raw_data"  # raw_data contains kaggle data with data quality defects from messy package
output_dir = base_dir / "Dataset"  # Dataset contains cleaned data

In [80]:
def clean_str_cols(df, exclude=None):
    """Return a copy of ``df`` with its object-dtype columns normalised.

    For every object-dtype column that is not listed in ``exclude`` the
    following steps run in order: all ``string.punctuation`` characters are
    removed, the text is title-cased, and leading/trailing whitespace is
    stripped.

    Args:
        df: Input DataFrame. It is not modified.
        exclude: A column name or list of column names to leave untouched.
            ``None`` (the default) or an empty list excludes nothing.

    Returns:
        A new DataFrame with the same rows, index and column order as ``df``.

    Raises:
        KeyError: If a name in ``exclude`` is not a column of ``df``.
    """
    # Normalise ``exclude`` without relying on a shared mutable default.
    if exclude is None:
        skipped = []
    elif isinstance(exclude, str):
        skipped = [exclude]
    else:
        skipped = list(exclude)

    missing = [name for name in skipped if name not in df.columns]
    if missing:
        raise KeyError(f"{missing} not found in columns")

    # The pattern does not depend on the column, so build it once.
    punct_pattern = f"[{re.escape(string.punctuation)}]"

    # Work on a copy and simply skip the excluded columns. Dropping them and
    # joining them back would move them to the end of the frame and would
    # duplicate rows whenever the index contains repeated labels.
    cleaned = df.copy()
    for column in cleaned.select_dtypes(include=["object"]).columns:
        if column in skipped:
            continue
        cleaned[column] = (
            cleaned[column]
            .str.replace(punct_pattern, "", regex=True)  # Remove punctuation
            .str.title()                                 # Standardize capitalization
            .str.strip()                                 # Remove leading and trailing spaces
        )

    return cleaned

def clean_date_cols(df, date_cols: str | list[str], punct_keep: str | list[str] | None=None) -> pd.DataFrame:

    # Exclude punctuation from being removed
    if punct_keep is not None:
        punctuation = "".join(set(string.punctuation) - set(punct_keep))   
    else:
        punctuation = string.punctuation

    # If only 1 column is passed
    if isinstance(date_cols, str):
        df[f"{date_cols}"] = df[f"{date_cols}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )
    
    # If multiple columns are passed
    elif isinstance(date_cols, list):
        for col in date_cols:

            df[f"{col}"] = df[f"{col}"].str.replace(
                f"[{re.escape(punctuation)}]",
                "",
                regex=True
            )

            df[f"{col}"] = df[f"{col}"].str.strip()

    else:
        raise TypeError("date_cols is neither a string nor list of strings")

    return df

In [None]:
# Remove pandas' row-display limit (None = no limit) so whole frames/series
# are rendered when a cell displays them.
pd.set_option("display.max_rows", None)

### Clean accounts.csv

In [None]:
# os.chdir(data_dir) # Ensure we are in the raw_data dir

# df = pd.read_csv("accounts.csv") 

# df = clean_str_cols(df, exclude=["created_at"]) # Clean str columns

# # Change floating cols to nullable Int64
# df['account_id'] = df['account_id'].astype('Int64')
# df['mrr'] = df['mrr'].astype('Int64')

# # Save the cleaned accounts to Dataset dir
# os.chdir(output_dir)
# df.to_parquet(output_dir / "accounts.parquet", index=False)

# # Switch back to raw_data for the next table cleaning
# os.chdir(data_dir)

### Clean customers.csv

In [None]:
# df = pd.read_csv("customers.csv")

# df['customer_id'] = df['customer_id'].astype('Int64')

# df['signup_date'] = df['signup_date'].str.strip()

# df['monthly_fee'] = df['monthly_fee'].astype('Int64')

# df['acquisition_cost'] = df['acquisition_cost'].astype('Int64')

# df['churn_date'] = df['churn_date'].str.strip()
# df['churn_date'] = df['churn_date'].str.lower()

# df.to_parquet(output_dir / "customers.parquet", index=False)

### Clean events.csv

In [None]:
# os.chdir(data_dir)
# df = pd.read_csv("events.csv")

# df['event_id'] = df['event_id'].astype('Int64')
# df['user_id'] = df['user_id'].astype('Int64')


# df = df.drop(columns='...6')

# df = clean_str_cols(df, exclude=['occurred_at'])

# df['occurred_at'] = df['occurred_at'].str.strip()

# df.to_parquet(output_dir / "events.parquet", index=False)

In [None]:
# Print the file name of every CSV found directly inside data_dir.
for csv_path in data_dir.glob("*.csv"):
    print(csv_path.name)

### Clean experiments.csv

In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("experiments.csv")

# # df.head(5)

# # df.dtypes

# df['experiment_id'] = df['experiment_id'].astype("Int64")

# df = clean_str_cols(df, exclude=['start_date', 'end_date'])

# df = clean_date_cols(df, date_cols=['start_date', 'end_date'])

# df.to_parquet(output_dir / "experiments.parquet")


### Clean googleplaystore_user_reviews.csv


In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("googleplaystore_user_reviews.csv")

In [None]:
# # pd.options.display.float_format = "{:.2f}".format

# # df.head()

# # df.dtypes

# df = clean_str_cols(df)

# df['Sentiment'] = (df['Sentiment'].str.lower()).astype('str')

# df['Sentiment_Polarity'] = df['Sentiment_Polarity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [None]:
# df['Sentiment_Subjectivity'] = df['Sentiment_Subjectivity'] \
# .astype("str") \
# .where(df['Sentiment_Polarity'].notna()) \
# .str.replace(r"^0", "0.", regex=True) \
# .astype("float")

In [None]:
# df = df.drop(columns=['App', 'Translated_Review'])

# df.to_parquet(output_dir / "googleplaystore_user_reviews.parquet")

### Clean revenue.csv

In [None]:
# os.chdir(data_dir)

# df = pd.read_csv("revenue.csv")

# df['customer_id'] = df['customer_id'].astype("Int64")
# df['monthly_fee'] = df['monthly_fee'].astype("Int64")

# df = clean_str_cols(df, exclude=['subscription_id', 'month'])

# df['revenue_type'] = df['revenue_type'].str.lower()

# # Format Month
# df = clean_date_cols(df, ['month'], "-")
# df['month'] = df['month'].replace(r"--+", "-", regex=True).str.lower()



# # Format subscription_id
# df = clean_date_cols(df, 'subscription_id')

# df['subscription_id'] = df['subscription_id'].str.extract(r"(S)(\d{4})(\d{6})") \
#                     .apply(lambda x: f"{x[0]}-{x[1]}-{x[2]}" if x.notna().all() else None, axis=1)

# df.to_parquet(output_dir / "revenue.parquet")
    

In [101]:
# Display the subscription_id column of the current frame for inspection.
df['subscription_id']

0      S-1020-202410
1      S-1020-202411
2      S-1020-202412
3      S-1020-202501
4      S-1020-202502
5      S-1020-202503
6      S-1020-202504
7      S-1021-202404
8      S-1021-202405
9               None
10     S-1023-202501
11     S-1023-202502
12     S-1023-202503
13              None
14              None
15     S-1023-202506
16     S-1026-202503
17     S-1026-202504
18     S-1026-202505
19     S-1031-202404
20     S-1031-202405
21     S-1031-202406
22     S-1031-202407
23     S-1031-202408
24     S-1031-202409
25     S-1031-202410
26     S-1031-202411
27     S-1031-202412
28     S-1034-202404
29              None
30     S-1034-202406
31              None
32     S-1034-202408
33     S-1034-202409
34     S-1034-202410
35     S-1034-202411
36     S-1034-202412
37     S-1034-202501
38     S-1034-202502
39     S-1034-202503
40     S-1034-202504
41     S-1035-202407
42              None
43     S-1035-202409
44     S-1035-202410
45     S-1035-202411
46              None
47     S-1035

### Clean subscriptions.csv

In [105]:
# os.chdir(data_dir)

# df = pd.read_csv("subscriptions.csv")

# df.head()

# df['subscription_id'] = (df['subscription_id'].str.upper()).str.replace(r"--+", "", regex=True)

# df['subscription_id'] = df['subscription_id'].str.extract(r"(S-\d{4}).*(\d{6})") \
#                     .apply(lambda x: f"{x[0]}-{x[1]}" if x.notna().all() else None, axis=1)

# df['customer_id'] = df['customer_id'].astype("Int64")

# df = clean_date_cols(df, 'month')

# df['month'] = df['month'].str.extract(r"(\d{4})(\d{2})") \
#                 .apply(lambda x: f"{x[0]}-{x[1]}" if x.notna().all() else None,axis=1)

# df['monthly_fee'] = df['monthly_fee'].astype("Int64")

# df.to_parquet(output_dir / "subscriptions.parquet")


### Clean users.csv

In [None]:
# Switch the working directory to data_dir so the relative file name below
# resolves there.
os.chdir(data_dir)

# Load the users table; this rebinds the shared `df` name.
df = pd.read_csv("users.csv")


In [106]:
# Cast both ID columns to the "Int64" dtype.
for id_col in ("user_id", "account_id"):
    df[id_col] = df[id_col].astype("Int64")

In [109]:
# Show the dtype of every column in the current frame.
df.dtypes

user_id        Int64
account_id     Int64
email         object
role          object
country       object
created_at    object
is_active     object
dtype: object

In [110]:
# Not the last expression in the cell, so this preview is not rendered.
df.head()

# Remove punctuation from the email column while keeping "@" and ".".
xd = clean_date_cols(df, date_cols="email", punct_keep=["@", "."])

In [None]:
# Capture the leading run of characters before the first "." in each email.
xd['email'].str.extract(r"^([^\.]+)")

0                alex.white@acct1.io
1             skyler.wilson@acct2.io
2             peyton.miller@acct3.io
3              jamie.moore@acc@t4.io
4             marley.walker@acct4.io
5               casey.perry@acct4.io
6              marley.perez@acct4.io
7               alex.harris@acct5.io
8                                NaN
9                                NaN
10             skyler.clark@acct5.io
11              logan.perez@acct6.io
12             hayden.moore@acct6.io
13                               NaN
14            morgan.miller@acct6.io
15        peyton.hernandez.@acct7.io
16              riley.smith@acct7.io
17          quinn.hernandez@acct7.io
18             elliot.lewis@acct7.io
19              quinn.white@acct7.io
20          peyton.johnson@acct7..io
21          avery.thomp.son@acct7.io
22            leslie.harris@acct7.io
23             marley.smith@acct8.io
24          finley.martinez@acct8.io
25            peyton.martin@acct8.io
26              quinn.clark@acct8.io
2

In [104]:
# Number of rows in the current frame.
len(df)

708

In [47]:
# Print the file name of every CSV found directly inside data_dir.
for raw_csv in data_dir.glob("*.csv"):
    print(raw_csv.name)

accounts.csv
customers.csv
events.csv
experiments.csv
googleplaystore_user_reviews.csv
revenue.csv
subscriptions.csv
users.csv
variants.csv
variant_exposures.csv
