<a href="https://www.kaggle.com/code/hassanabsar/amazon-deals-today-s-deals-data-cleaning?scriptVersionId=173604033" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Amazon | Deals (Today's Deals) - Data Cleaning

In [None]:
# importing libraries
import pandas as pd

In [None]:
#  importing and disabling warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Lift pandas' display limits so that every column and every row is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Print the full path of every file found under the Kaggle input directory.
import os
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the raw deals CSV from the Kaggle input directory into a DataFrame.
RAW_DEALS_CSV = '/kaggle/input/amazon-deals-todays-deals/raw_Data_Amazon _ Deals.csv'
df = pd.read_csv(filepath_or_buffer=RAW_DEALS_CSV)

In [None]:
# Dimensions of the freshly loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Count the missing values in each column of df.
df.isna().sum()

In [None]:
# Give the columns readable names: the raw headers are the element names
# that were captured while scraping.
column_renames = {
    'Image': 'image_url',
    'alinknormal_URL': 'product_url',
    'Label': 'price_discount_group',
    'Time': 'deal_time_limit',
    'Content': 'product_type',
    'Title': 'product_name',
    'Title_URL': 'brand_url',
    'asizebase': 'brand_name',
    'aiconalt': 'rating_outof5',
    'asizesmall': 'rated_by',
    'Price': 'discount',
    'aoffscreen': 'disc_price',
    'Keywords': 'list_price',
    'asizemini2': 'log_type',
}
df = df.rename(columns=column_renames)

In [None]:
# Re-check the column summary now that the columns have been renamed.
df.info()

In [None]:
# Derive two helper columns from the scraped text.
#
# 'listed_price' keeps the piece of 'list_price' that sits between the first
# and the second ':' (element 1 of the split).
# Plain column assignment is used instead of df.insert(18, ...): insert()
# raises ValueError when this cell is re-run (the column already exists) and
# IndexError when the frame has fewer than 18 columns. The position does not
# matter because the columns are later selected by name.
df["listed_price"] = df["list_price"].str.split(":").str[1]
# 'ratings' keeps the first space-separated token of 'rating_outof5'.
df['ratings'] = df["rating_outof5"].str.split(" ").str[0]

In [None]:
# Keep only the columns that the analysis needs, in this order.
columns_to_keep = [
    'image_url', 'product_url', 'price_discount_group', 'product_type',
    'product_name', 'brand_url', 'brand_name', 'ratings', 'rated_by',
    'discount', 'disc_price', 'listed_price', 'log_type',
]
df = df.loc[:, columns_to_keep]

In [None]:
# Remove leading and trailing whitespace from every text column.
# Accessing `.str` on a column that does not hold text raises AttributeError,
# so hasattr() is False for such a column and it is passed through unchanged
# instead of aborting the whole cell.
df = df.apply(lambda column: column.str.strip() if hasattr(column, "str") else column)

In [None]:
# Literal, case-insensitive text replacements, applied top to bottom.
# Each rule is (column, text to find, replacement); the order within a column
# is significant because later rules see the output of earlier ones.
cleanup_rules = [
    ('product_type', "Save on ", ""),
    ('rated_by', ",", ""),
    ('disc_price', ",", ""),
    ('listed_price', ",", ""),
    ('discount', "-", ""),
    ('disc_price', "AED ", ""),
    ('listed_price', "AED ", ""),
    ('rated_by', "(AED 0.13/gram)", "0"),
]
for column, needle, substitute in cleanup_rules:
    df[column] = df[column].str.replace(needle, substitute, case=False, regex=False)

In [None]:
# Convert the cleaned text in these columns to float64.
float_columns = ['ratings', 'disc_price', 'listed_price', 'rated_by']
df = df.astype(dict.fromkeys(float_columns, 'float64'))

In [None]:
# Handle missing values column by column.
#
# Every fill is assigned back to the column. Calling fillna(inplace=True) on a
# column selection (df["col"].fillna(..., inplace=True)) is a chained in-place
# operation: it is deprecated in recent pandas and leaves df untouched once
# Copy-on-Write is active, so the nulls would silently remain.

# Drop the rows where 'product_name' is null.
df = df.dropna(subset=['product_name'])
# Fill nulls in 'listed_price' with the value of 'disc_price' from the same row.
df["listed_price"] = df["listed_price"].fillna(df["disc_price"])
# Replace nulls in 'brand_name' with "Unknown".
df["brand_name"] = df["brand_name"].fillna("Unknown")
# Replace nulls in 'log_type' with "PAID Freight".
df['log_type'] = df['log_type'].fillna("PAID Freight")
# Fill nulls in 'ratings' with the mean rating of the row's 'product_type'.
# NOTE(review): rows whose 'product_type' is null, or whose group has no rating
# at all, get no group mean and therefore stay null - confirm this is intended.
df["ratings"] = df["ratings"].fillna(df.groupby("product_type")["ratings"].transform("mean"))
# Replace nulls in 'rated_by' with 0.
df = df.fillna({'rated_by': 0})

In [None]:
# Fill nulls in 'discount' with "0%".
# The result is assigned back: df["discount"].fillna(..., inplace=True) is a
# chained in-place call that is deprecated in recent pandas and has no effect
# on df once Copy-on-Write is active.
df["discount"] = df["discount"].fillna("0%")
# Rows where 'discount' is "0%" but 'listed_price' differs from 'disc_price'.
df_0_disc = df[(df["discount"] == "0%") & (df["listed_price"] != df["disc_price"])]
# Rows where 'discount' is not "0%" but 'listed_price' equals 'disc_price'.
df_not_0_disc = df[(df["discount"] != "0%") & (df["listed_price"] == df["disc_price"])]
# Print how many rows fall into each inconsistent group.
print(len(df_not_0_disc), len(df_0_disc))

In [None]:
# Per-column non-null counts over the rows that repeat an earlier row.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask].count()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# Count the missing values left in each column after cleaning.
df.isna().sum()

In [None]:
# Dimensions of the frame at this point as a (rows, columns) tuple.
df.shape

In [None]:
# Total number of cells in the frame (rows x columns).
df.size

In [None]:
# Preview the first five rows of the frame.
df.head()

In [None]:
# Save the clean data to a CSV file.
# index=False leaves the row index out of the file: rows were dropped earlier
# without resetting the index, so its labels are just leftover positions and
# would otherwise be written as an extra unnamed first column.
df.to_csv('amazon_today_deals.csv', index=False)