# Preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Gathering Data

In [2]:
customers_df = pd.read_csv("https://raw.githubusercontent.com/dicodingacademy/dicoding_dataset/main/DicodingCollection/customers.csv")
orders_df = pd.read_csv("https://raw.githubusercontent.com/dicodingacademy/dicoding_dataset/main/DicodingCollection/orders.csv")
products_df = pd.read_csv("https://raw.githubusercontent.com/dicodingacademy/dicoding_dataset/main/DicodingCollection/products.csv")
sales_df = pd.read_csv("https://raw.githubusercontent.com/dicodingacademy/dicoding_dataset/main/DicodingCollection/sales.csv")


In [3]:
customers_df.tail()

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
1002,996,fulan 996,Prefer not to say,59,0433 Armstrong HillSuite 974,7613,Lake Danielland,Tasmania,Australia
1003,997,fulan 997,Prefer not to say,30,04 Howell PassSuite 209,6950,Ellaborough,Tasmania,Australia
1004,998,fulan 998,Prefer not to say,32,72 Annabelle PassApt. 446,52,Kohlerberg,Queensland,Australia
1005,999,fulan 999,Prefer not to say,30,170 Wilson AvenueApt. 577,7849,East Oscarfurt,Western Australia,Australia
1006,1000,fulan 1000,Male,71,1671 Lauren KnollSuite 945,9012,Lake Audreyborough,Tasmania,Australia


In [4]:
orders_df.tail()

Unnamed: 0,order_id,customer_id,payment,order_date,delivery_date
995,996,345,37843,2021-1-13,2021-02-02
996,997,346,53831,2021-1-18,2021-01-31
997,998,407,53308,2021-5-5,2021-05-21
998,999,428,31643,2021-6-15,2021-07-12
999,1000,896,27836,2021-4-7,2021-04-24


In [5]:
products_df.tail()

Unnamed: 0,product_id,product_type,product_name,size,colour,price,quantity,description
1261,1255,Trousers,Tracksuit Bottoms,XS,violet,91,67,"A violet coloured, XS sized, Tracksuit Bottoms..."
1262,1256,Trousers,Tracksuit Bottoms,S,violet,91,48,"A violet coloured, S sized, Tracksuit Bottoms ..."
1263,1257,Trousers,Tracksuit Bottoms,M,violet,91,73,"A violet coloured, M sized, Tracksuit Bottoms ..."
1264,1258,Trousers,Tracksuit Bottoms,L,violet,91,45,"A violet coloured, L sized, Tracksuit Bottoms ..."
1265,1259,Trousers,Tracksuit Bottoms,XL,violet,91,60,"A violet coloured, XL sized, Tracksuit Bottoms..."


In [6]:
sales_df.tail()

Unnamed: 0,sales_id,order_id,product_id,price_per_unit,quantity,total_price
4995,4995,998,321,109,2,218.0
4996,4996,998,251,95,3,285.0
4997,4997,999,872,113,1,113.0
4998,4998,999,998,106,2,212.0
4999,4999,999,1105,115,1,115.0


# Assesing Data

### Customers table

In [7]:
customers_df.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    1007 non-null   int64 
 1   customer_name  1007 non-null   object
 2   gender         989 non-null    object
 3   age            1007 non-null   int64 
 4   home_address   1007 non-null   object
 5   zip_code       1007 non-null   int64 
 6   city           1007 non-null   object
 7   state          1007 non-null   object
 8   country        1007 non-null   object
dtypes: int64(3), object(6)
memory usage: 70.9+ KB


There is a slight difference in the gender's amount. it shows that there are missing values in it. let's check

In [8]:
customers_df.isna().sum()

customer_id       0
customer_name     0
gender           18
age               0
home_address      0
zip_code          0
city              0
state             0
country           0
dtype: int64

Checking the data duplication

In [9]:
print("Total data duplication:",customers_df.duplicated().sum())

Total data duplication: 6


Checking all of the statistics parameters using describe() function

In [10]:
customers_df.describe()

Unnamed: 0,customer_id,age,zip_code
count,1007.0,1007.0,1007.0
mean,501.726912,50.929494,5012.538232
std,288.673238,30.516299,2885.836112
min,1.0,20.0,2.0
25%,252.5,34.0,2403.5
50%,502.0,50.0,5087.0
75%,751.5,65.0,7493.5
max,1000.0,700.0,9998.0


There is an oddity in the max value of age column. 700 years old, whatt??

### Orders table

In [11]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   order_id       1000 non-null   int64 
 1   customer_id    1000 non-null   int64 
 2   payment        1000 non-null   int64 
 3   order_date     1000 non-null   object
 4   delivery_date  1000 non-null   object
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


the amount of data is complete, no missing values in it. But there's a type error on delivery_date. It represents the date as an object instead of datetime datatype

Let's check the data duplication

In [12]:
print("Total data duplication:", orders_df.duplicated().sum())

Total data duplication: 0


Now we check the statistics parameters of the data

In [13]:
orders_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
order_id,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
customer_id,1000.0,506.64,277.115502,1.0,275.25,515.0,737.25,1000.0
payment,1000.0,33972.936,14451.609047,10043.0,21329.25,33697.5,46249.0,59910.0


Nothing's odd from all of these parameters

### Products table

First, look at the data info

In [14]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    1266 non-null   int64 
 1   product_type  1266 non-null   object
 2   product_name  1266 non-null   object
 3   size          1266 non-null   object
 4   colour        1266 non-null   object
 5   price         1266 non-null   int64 
 6   quantity      1266 non-null   int64 
 7   description   1266 non-null   object
dtypes: int64(3), object(5)
memory usage: 79.3+ KB


since it seems there is no problem in it, let's check the duplication

In [15]:
print("Total data duplication:", products_df.duplicated().sum())

Total data duplication: 6


Data describe

In [16]:
products_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
product_id,1266.0,627.92654,363.971586,0.0,313.25,626.5,942.75,1259.0
price,1266.0,105.812006,9.715611,90.0,95.25,109.0,114.0,119.0
quantity,1266.0,60.138231,11.682791,40.0,50.0,60.0,70.0,80.0


### Sales table

Look at the data information

In [18]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sales_id        5000 non-null   int64  
 1   order_id        5000 non-null   int64  
 2   product_id      5000 non-null   int64  
 3   price_per_unit  5000 non-null   int64  
 4   quantity        5000 non-null   int64  
 5   total_price     4981 non-null   float64
dtypes: float64(1), int64(5)
memory usage: 234.5 KB


It seems there's a missing values in total_price column

In [19]:
sales_df.isna().sum()

sales_id           0
order_id           0
product_id         0
price_per_unit     0
quantity           0
total_price       19
dtype: int64

In [22]:
print("Total data duplication:", sales_df.duplicated().sum())

Total data duplication: 0


In [23]:
sales_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sales_id,5000.0,2499.5,1443.520003,0.0,1249.75,2499.5,3749.25,4999.0
order_id,5000.0,503.0382,285.964418,1.0,258.0,504.5,749.0,999.0
product_id,5000.0,634.0532,363.255794,1.0,323.0,635.0,951.0,1259.0
price_per_unit,5000.0,103.5016,9.195004,90.0,95.0,102.0,112.0,119.0
quantity,5000.0,1.9924,0.80751,1.0,1.0,2.0,3.0,3.0
total_price,4981.0,206.307368,86.352449,90.0,112.0,204.0,285.0,357.0


No abnormalities within the data

### Data assesment conclusions

![image.png](attachment:e134a637-1290-4e49-9562-dc5cba56ad5e.png)

# Cleaning Data

In [152]:
def check_duplicate(data):
    print(f"Total Data Duplication:", data.duplicated().sum())

### Cleaning customer_df data 

#### Eliminating Duplicate Data

In [153]:
customer_df_noduplicate = customers_df.drop_duplicates()
check_duplicate(customer_df_noduplicate)

Total Data Duplication: 0


#### Handling Missing Values in gender column

In [154]:
customer_df_noduplicate[customer_df_noduplicate["gender"].isna()]

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
38,39,fulan 39,,80,7440 Cameron Estate DrSuite 628,4622,North Victoriachester,Northern Territory,Australia
167,168,fulan 168,,27,2781 Berge MallSuite 452,1975,North Leoburgh,Western Australia,Australia
322,322,fulan 322,,30,593 Becker CircleApt. 333,1640,Jacobiview,Western Australia,Australia
393,393,fulan 393,,34,5158 Levi HillSuite 531,1474,Johnsburgh,Queensland,Australia
442,442,fulan 442,,26,5157 Feil RoadApt. 633,7249,Port Chloe,New South Wales,Australia
722,720,fulan 720,,40,31 Jordan ParadeApt. 400,1380,West Henry,South Australia,Australia
745,743,fulan 743,,57,09 Christopher StreetSuite 967,6226,Lake Lukemouth,Western Australia,Australia
773,771,fulan 771,,74,7367 Wright JunctionApt. 773,8882,Kuhntown,Victoria,Australia
798,795,fulan 795,,49,487 Summer MewsApt. 874,1712,East Hayden,Australian Capital Territory,Australia
801,798,fulan 798,,56,27 Aiden KnollApt. 875,6531,Port Sam,Australian Capital Territory,Australia


#### Find the most dominant missing value in gender column using value_counts()

In [155]:
customer_df_noduplicate["gender"].value_counts()

gender
Prefer not to say    725
Male                 143
Female               115
Name: count, dtype: int64

We can fill its missing values as it is

In [156]:
no_missing_customers_df = customer_df_noduplicate.fillna(value="Prefer not to say")
no_missing_customers_df.isna().sum()

customer_id      0
customer_name    0
gender           0
age              0
home_address     0
zip_code         0
city             0
state            0
country          0
dtype: int64

#### Handling the inaccurate values of age column

In [157]:
def find_max_df(data, col):
    return data[data[col] == data[col].max()]

In [158]:
find_max(no_missing_customers_df, "age")

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
967,961,fulan 961,Prefer not to say,700,29 Farrell ParadeSuite 818,6528,New Joseph,South Australia,Australia


we can assume that the inaccurate value occured due to human error. We can replace it with 70

In [159]:
clean_cust_df = no_missing_customers_df.replace(no_missing_customers_df["age"].max(), 70)
find_max(clean_cust_df, "age")

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
215,216,fulan 216,Prefer not to say,500,038 Haley MewsApt. 810,3991,Bayertown,Northern Territory,Australia


In [160]:
clean_cust_df.describe()

Unnamed: 0,customer_id,age,zip_code
count,1001.0,1001.0,1001.0
mean,500.312687,50.323676,5000.693307
std,289.265537,22.665946,2886.084454
min,1.0,20.0,2.0
25%,250.0,34.0,2398.0
50%,500.0,50.0,5079.0
75%,751.0,65.0,7454.0
max,1000.0,500.0,9998.0


There is still an odd data in age column, let's replace it

In [161]:
clean_cust_df_2 = clean_cust_df.replace(clean_cust_df["age"].max(), 50)
find_max(clean_cust_df_2, "age")

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
34,35,fulan 35,Male,80,3168 Bartoletti CrescentSuite 878,593,Port Lucas,Queensland,Australia
38,39,fulan 39,Prefer not to say,80,7440 Cameron Estate DrSuite 628,4622,North Victoriachester,Northern Territory,Australia
142,143,fulan 143,Prefer not to say,80,87 Hahn Station StSuite 943,8254,New Ella,Victoria,Australia
154,155,fulan 155,Male,80,85 Charles MallSuite 424,7841,Port Sophia,New South Wales,Australia
170,171,fulan 171,Prefer not to say,80,16 Boyer WaySuite 018,6226,South Gabriel,Western Australia,Australia
174,175,fulan 175,Prefer not to say,80,57 David MallSuite 596,3129,Haneton,South Australia,Australia
181,182,fulan 182,Male,80,32 Thomas CrestSuite 753,2147,Archiefurt,Queensland,Australia
235,236,fulan 236,Prefer not to say,80,64 Phillips RunApt. 722,2752,East Sienna,New South Wales,Australia
424,424,fulan 424,Prefer not to say,80,03 Grant CrestSuite 228,5743,Lake Charliemouth,Western Australia,Australia
438,438,fulan 438,Prefer not to say,80,569 Alyssa IslandApt. 766,6893,Hunterland,Western Australia,Australia


In [162]:
clean_cust_df_2.describe()

Unnamed: 0,customer_id,age,zip_code
count,1001.0,1001.0,1001.0
mean,499.863137,49.874126,5000.693307
std,289.615487,17.644663,2886.084454
min,1.0,20.0,2.0
25%,249.0,34.0,2398.0
50%,499.0,50.0,5079.0
75%,751.0,65.0,7454.0
max,1000.0,80.0,9998.0


In [166]:
clean_cust_df_2[clean_cust_df_2.duplicated()]

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
949,943,fulan 943,Prefer not to say,64,3117 Heller PlaceSuite 149,822,North Elijah,South Australia,Australia


In [167]:
clean_cust_df_2[clean_cust_df_2["customer_id"] == 943]

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
948,943,fulan 943,Prefer not to say,64,3117 Heller PlaceSuite 149,822,North Elijah,South Australia,Australia
949,943,fulan 943,Prefer not to say,64,3117 Heller PlaceSuite 149,822,North Elijah,South Australia,Australia


There's a duplicate data again. Let's drop it

In [168]:
clean_cust_df_final = clean_cust_df_2.drop_duplicates() 
check_duplicate(clean_cust_df_final)

Total Data Duplication: 0


### Cleaning orders_df data

In [123]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   order_id       1000 non-null   int64 
 1   customer_id    1000 non-null   int64 
 2   payment        1000 non-null   int64 
 3   order_date     1000 non-null   object
 4   delivery_date  1000 non-null   object
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


Since the orders_df has a problem with order_date and delivery_date datatype, we will change them.

In [125]:
datetime_cols = ["order_date", "delivery_date"]

# to keep safe the original in case there is something went wrong
new_orders_df = orders_df

for col in datetime_cols:
    new_orders_df[col] = pd.to_datetime(new_orders_df[col])

new_orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_id       1000 non-null   int64         
 1   customer_id    1000 non-null   int64         
 2   payment        1000 non-null   int64         
 3   order_date     1000 non-null   datetime64[ns]
 4   delivery_date  1000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(3)
memory usage: 39.2 KB


### Cleaning product_df data

The problem with this table is the duplicate data. Let's take a look

In [127]:
check_duplicate(products_df)

Total Data Duplication: 6


In [134]:
clean_products_df = products_df.drop_duplicates()
check_duplicate(clean_products_df)

Total Data Duplication: 0


### Cleaning sales_df data

There're missing values within this table, let's take a look

In [135]:
sales_df.isna().sum()

sales_id           0
order_id           0
product_id         0
price_per_unit     0
quantity           0
total_price       19
dtype: int64

In [137]:
sales_df[sales_df["total_price"].isna()]

Unnamed: 0,sales_id,order_id,product_id,price_per_unit,quantity,total_price
9,9,2,1196,105,1,
121,121,27,1027,90,3,
278,278,63,360,94,2,
421,421,95,1091,115,1,
489,489,108,1193,105,3,
539,539,117,405,119,2,
636,636,134,653,93,3,
687,687,145,1138,102,1,
854,854,177,64,104,1,
1079,1079,222,908,94,3,


Let's take a look at some of filled data

In [140]:
sales_df[sales_df["total_price"].notna()].head()

Unnamed: 0,sales_id,order_id,product_id,price_per_unit,quantity,total_price
0,0,1,218,106,2,212.0
1,1,1,481,118,1,118.0
2,2,1,2,96,3,288.0
3,3,1,1002,106,2,212.0
4,4,1,691,113,3,339.0


In [142]:
106*2, 118*1, 96*3

(212, 118, 288)

As we can see that the total_price can be found with multiplying the price_per_unit and quantity. 

Since the missing value of sales_df dataset only in the total_price column, we can fill it with multiply the price_per_unit and quantity cols.

In [144]:
clean_sales_df = sales_df
clean_sales_df["total_price"] = clean_sales_df["price_per_unit"] * clean_sales_df["quantity"]
clean_sales_df.isna().sum()

sales_id          0
order_id          0
product_id        0
price_per_unit    0
quantity          0
total_price       0
dtype: int64

In [145]:
clean_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   sales_id        5000 non-null   int64
 1   order_id        5000 non-null   int64
 2   product_id      5000 non-null   int64
 3   price_per_unit  5000 non-null   int64
 4   quantity        5000 non-null   int64
 5   total_price     5000 non-null   int64
dtypes: int64(6)
memory usage: 234.5 KB


In [147]:
clean_sales_df_2 = sales_df.replace(sales_df["total_price"].isna(), sales_df["price_per_unit"] * sales_df["quantity"])
clean_sales_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   sales_id        5000 non-null   int64
 1   order_id        5000 non-null   int64
 2   product_id      5000 non-null   int64
 3   price_per_unit  5000 non-null   int64
 4   quantity        5000 non-null   int64
 5   total_price     5000 non-null   int64
dtypes: int64(6)
memory usage: 234.5 KB


In [148]:
clean_sales_df_2.isna().sum()

sales_id          0
order_id          0
product_id        0
price_per_unit    0
quantity          0
total_price       0
dtype: int64

# Final Result

In [179]:
def check_all(data):
    print("========= Info =========")
    data.info()
    print()

    print("========= Describe =========")
    print(data.describe())
    print()

    print("========= Missing Value =========")
    print(data.isna().sum())
    print()
    
    print("========= Duplicated Value =========")
    check_duplicate(data)

## Customers Table

In [180]:
check_all(clean_cust_df_final)

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 1006
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    1000 non-null   int64 
 1   customer_name  1000 non-null   object
 2   gender         1000 non-null   object
 3   age            1000 non-null   int64 
 4   home_address   1000 non-null   object
 5   zip_code       1000 non-null   int64 
 6   city           1000 non-null   object
 7   state          1000 non-null   object
 8   country        1000 non-null   object
dtypes: int64(3), object(6)
memory usage: 78.1+ KB

       customer_id          age     zip_code
count  1000.000000  1000.000000  1000.000000
mean    499.420000    49.860000  5004.872000
std     289.420676    17.647828  2884.497332
min       1.000000    20.000000     2.000000
25%     248.750000    34.000000  2401.750000
50%     498.500000    50.000000  5083.000000
75%     750.250000    65.000000  7460.250000
max    1000.00

## Orders Table

In [181]:
check_all(new_orders_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_id       1000 non-null   int64         
 1   customer_id    1000 non-null   int64         
 2   payment        1000 non-null   int64         
 3   order_date     1000 non-null   datetime64[ns]
 4   delivery_date  1000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(3)
memory usage: 39.2 KB

          order_id  customer_id       payment                  order_date  \
count  1000.000000  1000.000000   1000.000000                        1000   
mean    500.500000   506.640000  33972.936000  2021-05-27 18:38:52.800000   
min       1.000000     1.000000  10043.000000         2021-01-01 00:00:00   
25%     250.750000   275.250000  21329.250000         2021-03-13 18:00:00   
50%     500.500000   515.000000  33697.500000         2021-05-27 12:00:00   
75%     750.25000

## Products Table

In [182]:
check_all(clean_products_df)

<class 'pandas.core.frame.DataFrame'>
Index: 1260 entries, 0 to 1265
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    1260 non-null   int64 
 1   product_type  1260 non-null   object
 2   product_name  1260 non-null   object
 3   size          1260 non-null   object
 4   colour        1260 non-null   object
 5   price         1260 non-null   int64 
 6   quantity      1260 non-null   int64 
 7   description   1260 non-null   object
dtypes: int64(3), object(5)
memory usage: 88.6+ KB

        product_id        price     quantity
count  1260.000000  1260.000000  1260.000000
mean    629.500000   105.805556    60.150000
std     363.874979     9.704423    11.670573
min       0.000000    90.000000    40.000000
25%     314.750000    95.750000    50.000000
50%     629.500000   108.500000    60.000000
75%     944.250000   114.000000    70.000000
max    1259.000000   119.000000    80.000000

product_id      0
pro

## Sales Table

In [183]:
check_all(clean_sales_df_2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   sales_id        5000 non-null   int64
 1   order_id        5000 non-null   int64
 2   product_id      5000 non-null   int64
 3   price_per_unit  5000 non-null   int64
 4   quantity        5000 non-null   int64
 5   total_price     5000 non-null   int64
dtypes: int64(6)
memory usage: 234.5 KB

          sales_id     order_id   product_id  price_per_unit    quantity  \
count  5000.000000  5000.000000  5000.000000     5000.000000  5000.00000   
mean   2499.500000   503.038200   634.053200      103.501600     1.99240   
std    1443.520003   285.964418   363.255794        9.195004     0.80751   
min       0.000000     1.000000     1.000000       90.000000     1.00000   
25%    1249.750000   258.000000   323.000000       95.000000     1.00000   
50%    2499.500000   504.500000   635.000000      102

# To csv

In [186]:
clean_cust_df_final.to_csv("clean_customers_df.csv", index=None)
new_orders_df.to_csv("clean_orders_df.csv", index=None)
clean_products_df.to_csv("clean_products_df.csv", index=None)
clean_sales_df_2.to_csv("clean_sales_df.csv", index=None)