# Data Cleaning Using Python

In [68]:
# importing all the necessary libraries
import os
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [69]:
# NOTE(review): this silences every warning category for the whole session, including
# pandas FutureWarnings about chained in-place operations — consider narrowing the filter.
warnings.filterwarnings("ignore") # keep cell output free of warning noise while cleaning data

In [70]:
# Load the three raw datasets.
# The folder is defined once and can be overridden with the SALES_DATA_DIR environment
# variable, so the notebook is not tied to one machine; the default is the original location.
DATA_DIR = Path(os.environ.get("SALES_DATA_DIR", r"C:\Users\kumar\OneDrive\Desktop\All_in_one"))

sales_data = pd.read_csv(DATA_DIR / "sales_data_100k.csv")
cus_data = pd.read_csv(DATA_DIR / "customers_data_10k.csv")
pro_data = pd.read_csv(DATA_DIR / "products_data_1k.csv")

In [71]:
# Snapshot each raw frame before any cleaning step mutates it.
sales_data_copy, cus_data_copy, pro_data_copy = (
    raw_frame.copy() for raw_frame in (sales_data, cus_data, pro_data)
)

In [72]:
# Start with the sales data: display the raw frame to see which columns need cleaning.
sales_data

Unnamed: 0,Order_ID,Customer_ID,Product_ID,Amount,Order_Date,Quantity,Discount,Payment_Method,Shipping_Cost,Status
0,10001,6614,819,$3010,8-2023-9,14,10%,Debit Card,$7,Pending
1,10002,5300,524,$1319,2023-10-8,19,15%,Crypto,$30,Returned
2,10003,4376,573,,2023-2-10,5,5%,Crypto,$17,Pending
3,10004,970,1189,,1-2023-24,1,15%,Debit Card,$45,Completed
4,10005,604,1095,$4337,2023-5-9,4,5%,Bank Transfer,,Completed
...,...,...,...,...,...,...,...,...,...,...
99995,109996,6067,459,913$,2023-11-21,1,5%,PayPal,,Cancelled
99996,109997,4636,813,1935$,2023/3/26,9,5%,Credit Card,$22,Completed
99997,109998,4351,344,2323,2023-3-6,14,5%,Credit Card,$28,Pending
99998,109999,4393,327,161$,2023-2-24,14,15%,PayPal,27,Completed


In [73]:
# Count missing values per column.
sales_data.isna().sum()

Order_ID              0
Customer_ID           0
Product_ID            0
Amount            25051
Order_Date            0
Quantity              0
Discount          19901
Payment_Method        0
Shipping_Cost     33107
Status                0
dtype: int64

In [74]:
sales_data.columns = sales_data.columns.str.lower() # lower-case every column name so later cells can refer to them consistently
sales_data.columns

Index(['order_id', 'customer_id', 'product_id', 'amount', 'order_date',
       'quantity', 'discount', 'payment_method', 'shipping_cost', 'status'],
      dtype='object')

In [75]:
sales_data.order_id.sample(20) # spot-check 20 random order_id values; no cleaning appeared necessary

87598     97599
92659    102660
10666     20667
89361     99362
64230     74231
14501     24502
9736      19737
59724     69725
66924     76925
49416     59417
38223     48224
53172     63173
35554     45555
82276     92277
99478    109479
83848     93849
77565     87566
14475     24476
98834    108835
22556     32557
Name: order_id, dtype: int64

In [76]:
sales_data.order_id.nunique() # number of distinct order_id values; if it equals the row count, order_id can serve as the primary key

100000

In [77]:
sales_data.customer_id.sample(20)  # spot-check 20 random customer_id values

28145    3514
89293    7252
10646    1566
69964    1388
8631     2052
94894    7765
9890     5870
46530    9808
17345    2924
28432    4095
28844    6630
36635    8461
86937    5053
44053    8823
93316    1874
94441    2356
3954     3319
31302    9350
16949    8708
23322    2145
Name: customer_id, dtype: int64

In [78]:
# How many orders reference each product_id (most frequent first).
sales_data.product_id.value_counts()

product_id
848     134
913     131
884     130
1160    129
319     127
       ... 
774      75
226      75
1078     73
373      73
1006     70
Name: count, Length: 1000, dtype: int64

In [79]:
# Rows with a missing amount. Left untouched for now; to be revisited once the product data is cleaned.
# NOTE(review): none of the cells shown here cleans or fills `amount` before the SQL export — confirm this is intended.
sales_data[sales_data.amount.isna()]

Unnamed: 0,order_id,customer_id,product_id,amount,order_date,quantity,discount,payment_method,shipping_cost,status
2,10003,4376,573,,2023-2-10,5,5%,Crypto,$17,Pending
3,10004,970,1189,,1-2023-24,1,15%,Debit Card,$45,Completed
5,10006,1875,594,,5-2023-1,13,10%,Debit Card,46,Cancelled
10,10011,4378,449,,2023-4-4,18,,Bank Transfer,$40,Completed
12,10013,4600,903,,2023-10-12,13,10%,Bank Transfer,40,Pending
...,...,...,...,...,...,...,...,...,...,...
99979,109980,1495,1109,,2023-6-22,3,15%,Credit Card,$11,Pending
99980,109981,2654,696,,2023-7-26,2,,Credit Card,,Completed
99985,109986,9218,362,,2023/7/6,3,20%,PayPal,$35,Pending
99990,109991,170,496,,12-2023-7,10,20%,Crypto,,Returned


In [94]:
# Unify the separator: turn every "/" in order_date into "-", then preview 30 random values.
order_date_dashed = sales_data["order_date"].str.replace("/", "-")
sales_data["order_date"] = order_date_dashed
sales_data["order_date"].sample(30)

26044      2023-6-7
26065     2023-12-1
75057      4-2023-8
12967     8-2023-11
57720      2023-4-4
66298    12-2023-27
21837     2023-7-14
40676      2023-8-8
94483     2023-2-17
78588     2023-5-28
69967     2023-12-5
57723     9-2023-14
76762    2023-11-21
47629    12-2023-24
12504     2023-8-25
74929      2023-8-6
42330      2023-5-5
31619     9-2023-27
68538    2023-10-18
56529    10-2023-12
31987      2023-8-7
96712    11-2023-22
51705     3-2023-15
96723     4-2023-12
73014     11-2023-6
33948     5-2023-13
58106     2023-6-14
16047     5-2023-24
79383    2023-11-13
89052      7-2023-5
Name: order_date, dtype: object

In [81]:
# Sanity check before parsing: count missing values in the raw order_date column.
# (Earlier attempts with `.dt(...)` and a single pd.to_datetime(..., format="mixed", errors="coerce")
# call were left disabled here; the month-first "m-yyyy-d" layout is handled by convert_dates.)
sales_data["order_date"].isna().sum()

np.int64(0)

In [106]:
def convert_dates(date_str):
    """Normalise one raw order-date string to ``dd/mm/YYYY`` text.

    Two layouts are recognised, with ``-`` or ``/`` as the separator and
    without requiring zero-padded day or month:

    * year first  -- ``yyyy-mm-dd`` (e.g. ``2023-10-8``)
    * month first -- ``mm-yyyy-dd`` (e.g. ``8-2023-9``)

    Parameters
    ----------
    date_str : str
        Raw date text.

    Returns
    -------
    str or None
        The date formatted as ``dd/mm/YYYY``, or ``None`` when the input is
        not a string, matches neither layout, or is not a valid calendar date.
    """
    if not isinstance(date_str, str):
        return None  # NaN / None / numbers cannot be parsed as dates

    # Accept "/" as well as "-" so the function does not rely on another cell
    # having normalised the separator first.
    parts = date_str.strip().replace("/", "-").split("-")
    if len(parts) != 3:
        return None

    # The position of the 4-digit year decides which layout applies.
    if len(parts[0]) == 4:
        layout = r"%Y-%m-%d"
    elif len(parts[1]) == 4:
        layout = r"%m-%Y-%d"
    else:
        return None

    try:
        return pd.to_datetime("-".join(parts), format=layout).strftime(r"%d/%m/%Y")
    except (ValueError, TypeError, OverflowError):
        # Right shape but not a real date (e.g. month 13 or day 40).
        return None

# Run convert_dates over every order_date value and store the result back in the same column.
converted_order_dates = sales_data["order_date"].apply(convert_dates)
sales_data["order_date"] = converted_order_dates


In [115]:
# "dates" is a scratch column that no cell shown here creates, so on a fresh
# Restart & Run All it does not exist. errors="ignore" makes the drop a no-op
# in that case instead of raising KeyError.
sales_data = sales_data.drop(columns="dates", errors="ignore")

In [135]:
# Turn discount text such as "10%" into the integer 10; a missing discount becomes 0.
# The fill is part of one assigned chain rather than an in-place call on
# `sales_data.discount`: an in-place fill on an accessed column is not guaranteed to
# write back to the frame (pandas Copy-on-Write), which would leave NaN for astype(int).
# astype(str) keeps the cell re-runnable if the column is already numeric.
sales_data["discount"] = (
    sales_data["discount"]
    .fillna("0")
    .astype(str)
    .str.replace("%", "", regex=False)
    .astype(int)
)

In [None]:
# Put the unit into the column name.
discount_rename = {"discount": "discount_in_%"}
sales_data.rename(columns=discount_rename, inplace=True)


In [173]:
# Preview only: shows shipping_cost with missing values displayed as "0".
# The result is not assigned, so sales_data itself is unchanged by this cell.
sales_data.shipping_cost.fillna("0")

0         $7
1        $30
2        $17
3        $45
4          0
        ... 
99995      0
99996    $22
99997    $28
99998     27
99999      6
Name: shipping_cost, Length: 100000, dtype: object

In [178]:
# Strip the "$" sign and store shipping_cost as whole numbers; a missing cost becomes 0.
# The result is assigned back (the earlier expression was only evaluated, never stored).
# fillna runs before the integer cast because NaN cannot be cast to int, and astype(str)
# keeps the cell re-runnable when the column is already numeric.
sales_data["shipping_cost"] = (
    sales_data["shipping_cost"]
    .fillna("0")
    .astype(str)
    .str.replace("$", "", regex=False)
    .astype(float)
    .astype(int)
)
sales_data["shipping_cost"].head()

In [182]:
# Replace any remaining missing shipping_cost with 0. Assigned explicitly because an
# in-place fill on a selected column is not guaranteed to write back to the frame
# (pandas Copy-on-Write).
sales_data["shipping_cost"] = sales_data["shipping_cost"].fillna(0)

In [187]:
# Distribution of order statuses; also shows whether any unexpected labels are present.
sales_data.status.value_counts()

status
Returned     25093
Cancelled    25055
Completed    25001
Pending      24851
Name: count, dtype: int64

In [189]:
# Write the cleaned sales table to MySQL, replacing any existing "sales_data" table.
# The connection string (which contains the password) is read from the environment so
# credentials never live in the notebook or its version history.
# Expected form: mysql+pymysql://<user>:<password>@localhost:3306/sales_analysis_project
if "SALES_DB_URI" not in os.environ:
    raise RuntimeError(
        "Set the SALES_DB_URI environment variable to the SQLAlchemy connection string "
        "before running the export cell."
    )
DATABASE_URI = os.environ["SALES_DB_URI"]
engine = create_engine(DATABASE_URI)
sales_data.to_sql("sales_data", con=engine, index=False, if_exists="replace")


100000