In [1]:
import os
import pandas as pd
import numpy as np

## suppliers dataframe

In [2]:
suppliers_df = pd.read_csv("./resources/sales_suppliers.csv")
missing_suppliers = pd.read_csv("./resources/sales_suppliers_missing_suppliers.csv")


In [3]:
suppliers_df.head()

Unnamed: 0,supplierID,name,ingredient,continent,city,district,size,longitude,latitude,approved
0,4000000,Cacao Wonders,cacao,South America,Guayaquil,Las Peñas,M,-79.8974,-2.1791,Y
1,4000001,Coconut Grove,coconut,Asia,Manila,Intramuros,S,121.0221,14.6042,Y
2,4000002,Almond Delights,almonds,Europe,Valencia,Ruzafa,L,-0.3762,39.4699,Y
3,4000003,Sugar Cane Harvest,cane sugar,South America,Sao Paulo,Vila Madalena,XL,-46.6333,-23.5489,Y
4,4000004,Vanilla Valley,vanilla,North America,Mexico City,Roma Norte,M,-99.1332,19.4326,Y


In [4]:
missing_suppliers.head()

Unnamed: 0,supplierID,Name,Ingredient,City,Continent
0,4000027,Wheat Flour Co.,flour,Tokyo,Asia
1,4000028,Sweet Sugar Co.,sugar,New York,North America
2,4000029,Dairy Butter Co.,butter,Nagoya,Asia
3,4000030,Poultry Eggs Co.,eggs,New York,North America
4,4000031,Extract Vanilla Co.,vanilla,Nagoya,Asia


In [5]:
## merger suppliers
suppliers_df.columns = suppliers_df.columns.str.lower()
missing_suppliers.columns = missing_suppliers.columns.str.lower()

merge_suppliers = pd.concat([suppliers_df, missing_suppliers])

In [6]:
merge_suppliers.sample

<bound method NDFrame.sample of     supplierid                     name    ingredient        continent  \
0      4000000            Cacao Wonders         cacao    South America   
1      4000001            Coconut Grove       coconut             Asia   
2      4000002          Almond Delights       almonds           Europe   
3      4000003       Sugar Cane Harvest    cane sugar    South America   
4      4000004           Vanilla Valley       vanilla    North America   
5      4000005          Pecan Pleasures        pecans    North America   
6      4000006           Hazelnut Haven     hazelnuts           Europe   
7      4000007           Cinnamon Spice      cinnamon             Asia   
8      4000008            Cashew Corner       cashews             Asia   
9      4000009            Maple Monarch   maple syrup    North America   
10     4000010         Pistachio Palace    pistachios             Asia   
11     4000011                Oat Oasis          oats           Europe   
12    

In [7]:
merge_suppliers= merge_suppliers.rename(columns={'supplierID': 'supplier_id'})
merge_suppliers.columns = merge_suppliers.columns.str.lower()

merge_suppliers["approved"] = merge_suppliers["approved"].map({"Y": True, "N": False})
merge_suppliers = merge_suppliers.astype({
    col: "string" for col in suppliers_df.columns
    if col != "supplier_id" and col != "latitude" and col != "longitude" and col != "approved"
})
merge_suppliers = merge_suppliers.astype({"latitude": "float64", "longitude": "float64", "approved": "boolean"})

merge_suppliers.info()
merge_suppliers.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 48 entries, 0 to 20
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   supplierid  48 non-null     string 
 1   name        48 non-null     string 
 2   ingredient  48 non-null     string 
 3   continent   48 non-null     string 
 4   city        48 non-null     string 
 5   district    27 non-null     string 
 6   size        27 non-null     string 
 7   longitude   27 non-null     float64
 8   latitude    27 non-null     float64
 9   approved    27 non-null     boolean
dtypes: boolean(1), float64(2), string(7)
memory usage: 3.8 KB


supplierid     0
name           0
ingredient     0
continent      0
city           0
district      21
size          21
longitude     21
latitude      21
approved      21
dtype: int64

## transactions dataframe

In [8]:
transactions_df = pd.read_csv("./resources/sales_transactions.csv")
transactions_df.head()

Unnamed: 0,transactionID,customerID,franchiseID,dateTime,product,quantity,unitPrice,totalPrice,paymentMethod,cardNumber
0,2002961,1000253,3000047,2024-05-14T12:17:01.495Z,Golden Gate Ginger,8,3,24,amex,378154478982993
1,2003007,1000226,3000047,2024-05-10T23:10:10.239Z,Austin Almond Biscotti,36,3,108,mastercard,2244626981238094
2,2003017,1000108,3000047,2024-05-16T16:34:10.613Z,Austin Almond Biscotti,40,3,120,mastercard,2490570234487424
3,2003068,1000173,3000047,2024-05-02T04:31:51.612Z,Pearly Pies,28,3,84,amex,343808569426192
4,2003103,1000075,3000047,2024-05-04T23:44:26.902Z,Pearly Pies,28,3,84,visa,4377080942201798


In [9]:
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   transactionID  3333 non-null   int64 
 1   customerID     3333 non-null   int64 
 2   franchiseID    3333 non-null   int64 
 3   dateTime       3333 non-null   object
 4   product        3333 non-null   object
 5   quantity       3333 non-null   int64 
 6   unitPrice      3333 non-null   int64 
 7   totalPrice     3333 non-null   int64 
 8   paymentMethod  3333 non-null   object
 9   cardNumber     3333 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 260.5+ KB


In [10]:
transactions_df = transactions_df.rename(columns={
    "transactionID": "transaction_id",
    "customerID": "customer_id",
    "franchiseID": "franchise_id",
    "dateTime": "date_time",
    "unitPrice": "unit_price",
    "totalPrice": "total_price",
    "paymentMethod": "payment_method",
    "cardNumber": "card_number",
})
transactions_df.columns = transactions_df.columns.str.lower()

transactions_df['date_time'] = pd.to_datetime(transactions_df['date_time'])

transactions_df = transactions_df.astype({
    "transaction_id": "int64",
    "customer_id": "int64",
    "franchise_id": "int64",
    "product": "string",
    "quantity": "int64",
    "unit_price": "float64",
    "total_price": "float64",
    "payment_method": "string",
    "card_number": "string",
})

transactions_df.info()
transactions_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   transaction_id  3333 non-null   int64              
 1   customer_id     3333 non-null   int64              
 2   franchise_id    3333 non-null   int64              
 3   date_time       3333 non-null   datetime64[ns, UTC]
 4   product         3333 non-null   string             
 5   quantity        3333 non-null   int64              
 6   unit_price      3333 non-null   float64            
 7   total_price     3333 non-null   float64            
 8   payment_method  3333 non-null   string             
 9   card_number     3333 non-null   string             
dtypes: datetime64[ns, UTC](1), float64(2), int64(4), string(3)
memory usage: 260.5 KB


transaction_id    0
customer_id       0
franchise_id      0
date_time         0
product           0
quantity          0
unit_price        0
total_price       0
payment_method    0
card_number       0
dtype: int64

## franchises dataframe

In [11]:
franchises_df = pd.read_csv("./resources/sales_franchises.csv")
franchises_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   franchiseID  48 non-null     int64  
 1   name         48 non-null     object 
 2   city         48 non-null     object 
 3   district     48 non-null     object 
 4   zipcode      48 non-null     object 
 5   country      48 non-null     object 
 6   size         48 non-null     object 
 7   longitude    48 non-null     float64
 8   latitude     48 non-null     float64
 9   supplierID   48 non-null     int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 3.9+ KB


In [12]:
franchises_df = franchises_df.rename(columns={"franchiseID": "franchise_id", "supplierID": "supplier_id"})
franchises_df.columns = franchises_df.columns.str.lower()

franchises_df = franchises_df.astype({
    col: "string" for col in franchises_df.columns
    if col != "franchise_id" and col != "supplier_id" and col != "longitude" and col != "latitude"
})


franchises_df.info()
franchises_df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   franchise_id  48 non-null     int64  
 1   name          48 non-null     string 
 2   city          48 non-null     string 
 3   district      48 non-null     string 
 4   zipcode       48 non-null     string 
 5   country       48 non-null     string 
 6   size          48 non-null     string 
 7   longitude     48 non-null     float64
 8   latitude      48 non-null     float64
 9   supplier_id   48 non-null     int64  
dtypes: float64(2), int64(2), string(6)
memory usage: 3.9 KB


franchise_id    0
name            0
city            0
district        0
zipcode         0
country         0
size            0
longitude       0
latitude        0
supplier_id     0
dtype: int64

## customer dataframe

In [13]:
customer_df = pd.read_csv("./resources/sales_customers.csv")
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   customerID       300 non-null    int64 
 1   first_name       300 non-null    object
 2   last_name        300 non-null    object
 3   email_address    300 non-null    object
 4   phone_number     300 non-null    object
 5   address          300 non-null    object
 6   city             300 non-null    object
 7   state            300 non-null    object
 8   country          300 non-null    object
 9   continent        300 non-null    object
 10  postal_zip_code  300 non-null    int64 
 11  gender           300 non-null    object
dtypes: int64(2), object(10)
memory usage: 28.3+ KB


In [14]:
customer_df.head()

Unnamed: 0,customerID,first_name,last_name,email_address,phone_number,address,city,state,country,continent,postal_zip_code,gender
0,2000259,Kayla,Barrett,brittanyramos@example.org,349-683-9514x73065,717 Whitney Roads,Kathrynborough,Massachusetts,Japan,Asia,81587,female
1,2000260,Amanda,Reed,scollier@example.org,+1-999-308-9110,69075 Logan Circles Apt. 540,East Catherine,Rhode Island,Japan,Asia,6657,female
2,2000261,Steven,Tanner,haileysanchez@example.net,859-946-4140x24086,08560 Thomas Land,Williamshire,Missouri,Japan,Asia,20642,female
3,2000262,Jennifer,Forbes,belldonna@example.com,633-427-4977,5840 Warren Garden Suite 901,Delacruzville,Nevada,Australia,Oceania,21440,male
4,2000263,Kenneth,Berger,bdalton@example.net,(831)220-1833x906,693 Baker Dale,West Wendy,Colorado,Australia,Oceania,32756,female


In [15]:
customer_df = customer_df.rename(columns={"customerID": "customer_id"})

In [16]:
customer_df["customer_id"] = customer_df["customer_id"].astype(str).replace("^2", "1", regex=True).astype(int)
customer_df.head()

Unnamed: 0,customer_id,first_name,last_name,email_address,phone_number,address,city,state,country,continent,postal_zip_code,gender
0,1000259,Kayla,Barrett,brittanyramos@example.org,349-683-9514x73065,717 Whitney Roads,Kathrynborough,Massachusetts,Japan,Asia,81587,female
1,1000260,Amanda,Reed,scollier@example.org,+1-999-308-9110,69075 Logan Circles Apt. 540,East Catherine,Rhode Island,Japan,Asia,6657,female
2,1000261,Steven,Tanner,haileysanchez@example.net,859-946-4140x24086,08560 Thomas Land,Williamshire,Missouri,Japan,Asia,20642,female
3,1000262,Jennifer,Forbes,belldonna@example.com,633-427-4977,5840 Warren Garden Suite 901,Delacruzville,Nevada,Australia,Oceania,21440,male
4,1000263,Kenneth,Berger,bdalton@example.net,(831)220-1833x906,693 Baker Dale,West Wendy,Colorado,Australia,Oceania,32756,female


In [17]:
customer_df.columns = customer_df.columns.str.lower()
customer_df = customer_df.astype({
    col: "string" for col in customer_df.columns
    if col != "customer_id"
})

customer_df.info()
# check null
customer_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   customer_id      300 non-null    int64 
 1   first_name       300 non-null    string
 2   last_name        300 non-null    string
 3   email_address    300 non-null    string
 4   phone_number     300 non-null    string
 5   address          300 non-null    string
 6   city             300 non-null    string
 7   state            300 non-null    string
 8   country          300 non-null    string
 9   continent        300 non-null    string
 10  postal_zip_code  300 non-null    string
 11  gender           300 non-null    string
dtypes: int64(1), string(11)
memory usage: 28.3 KB


customer_id        0
first_name         0
last_name          0
email_address      0
phone_number       0
address            0
city               0
state              0
country            0
continent          0
postal_zip_code    0
gender             0
dtype: int64

## gold review dataframe

In [18]:
gold_review_df = pd.read_csv("./resources/media_gold_reviews_chunked.csv")
gold_review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   franchiseID   196 non-null    int64 
 1   review_date   196 non-null    object
 2   chunked_text  196 non-null    object
 3   chunk_id      196 non-null    object
 4   review_uri    196 non-null    object
dtypes: int64(1), object(4)
memory usage: 7.8+ KB


In [19]:
# change column name
gold_review_df = gold_review_df.rename(columns={'franchiseID': 'franchise_id'})
# change column to lower case
gold_review_df.columns = gold_review_df.columns.str.lower()
# change column type
gold_review_df = gold_review_df.astype({'chunked_text': 'string', 'chunk_id': 'string', 'review_uri': 'string'})
gold_review_df['review_date'] = pd.to_datetime(gold_review_df['review_date'])
# check info
gold_review_df.info()
# check null
gold_review_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   franchise_id  196 non-null    int64              
 1   review_date   196 non-null    datetime64[ns, UTC]
 2   chunked_text  196 non-null    string             
 3   chunk_id      196 non-null    string             
 4   review_uri    196 non-null    string             
dtypes: datetime64[ns, UTC](1), int64(1), string(3)
memory usage: 7.8 KB


franchise_id    0
review_date     0
chunked_text    0
chunk_id        0
review_uri      0
dtype: int64

## review dataframe

In [20]:
review_df = pd.read_csv("./resources/media_customer_reviews.csv")
review_df.head()

Unnamed: 0,review,franchiseID,review_date,new_id
0,Title: A Delightful Cookie Experience at Bakeh...,3000037,2024-05-20T17:24:06.591Z,1
1,"""Sweet tooth heaven on East 6th Street! I'm ob...",3000017,2024-05-20T17:17:03.052Z,2
2,**4.5/5 stars**\n\nI stumbled upon Bakehouse i...,3000007,2024-05-20T17:17:03.052Z,3
3,"Bakehouse in Fitzroy, Melbourne, has disappoin...",3000003,2024-05-31T15:13:36.962Z,4
4,Title: A Sweet Escape in Las Vegas' Arts Distr...,3000034,2024-05-20T17:24:06.591Z,5


In [21]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review       204 non-null    object
 1   franchiseID  204 non-null    int64 
 2   review_date  204 non-null    object
 3   new_id       204 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 6.5+ KB


In [22]:
# change column name
review_df = review_df.rename(columns={'franchiseID': 'franchise_id'})
# change column to lower case
review_df.columns = review_df.columns.str.lower()
# change column type
review_df = review_df.astype({'review': 'string'})
review_df['review_date'] = pd.to_datetime(review_df['review_date'])
# organise column order
review_df = review_df[['new_id', 'franchise_id', 'review_date', 'review']]
# check info
review_df.info()
# check null
review_df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   new_id        204 non-null    int64              
 1   franchise_id  204 non-null    int64              
 2   review_date   204 non-null    datetime64[ns, UTC]
 3   review        204 non-null    string             
dtypes: datetime64[ns, UTC](1), int64(2), string(1)
memory usage: 6.5 KB


new_id          0
franchise_id    0
review_date     0
review          0
dtype: int64

## save file

In [24]:
folder = "./cleaned_resources/"
review_df = review_df.to_csv(f"{folder}media_customer_reviews.csv", index=False)
gold_review_df = gold_review_df.to_csv(f"{folder}media_gold_reviews_chunked.csv", index=False)
customer_df = customer_df.to_csv(f"{folder}sales_customers.csv", index=False)
franchises_df = franchises_df.to_csv(f"{folder}sales_franchises.csv", index=False)
merge_suppliers= merge_suppliers.to_csv(f"{folder}sales_suppliers.csv", index=False)
transactions_df = transactions_df.to_csv(f"{folder}sales_transactions.csv", index=False)