### Importing Data

In [1]:
import kagglehub
import shutil
import os

# Specify the save path (destination folder for the dataset files).
# NOTE(review): this is an absolute, machine-specific path, and later cells read
# from "./source_files" rather than "Source Files" - confirm both refer to the
# same directory, or point them at one shared, relative location.
save_path = "C:/Users/Jerico/Documents/gitRepositories/Dirty-E-Commerce-Data-Project/Source Files"

# Download latest version; the returned value is the local folder with the files
path = kagglehub.dataset_download("oleksiimartusiuk/e-commerce-data-shein")

print("Path to dataset files:", path)

try:
    # Confirm the download exists BEFORE touching the destination, so a missing
    # download can never wipe a previously saved copy of the data.
    if os.path.exists(path):
        if os.path.exists(save_path):
            # Remove the existing directory so the move lands on a clean target
            shutil.rmtree(save_path)
            print(f"Existing files at {save_path} removed.")

        # Move the downloaded files to the specified path.
        # NOTE(review): moving the folder out of the kagglehub cache presumably
        # forces a fresh download on the next run - consider copying instead.
        shutil.move(path, save_path)
        print(f"Dataset moved to {save_path}")
    else:
        print("Download path not found!")
except FileNotFoundError as e:
    print(f"Error: The specified file or directory was not found: {e}")


  from .autonotebook import tqdm as notebook_tqdm


Resuming download from 0 bytes (3611849 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/oleksiimartusiuk/e-commerce-data-shein?dataset_version_number=4 (0/3611849) bytes left.


100%|██████████| 3.44M/3.44M [00:01<00:00, 3.07MB/s]

Extracting files...





Path to dataset files: C:\Users\Jerico\.cache\kagglehub\datasets\oleksiimartusiuk\e-commerce-data-shein\versions\4
Existing files at C:/Users/Jerico/Documents/gitRepositories/Dirty-E-Commerce-Data-Project/Source Files removed.
Dataset moved to C:/Users/Jerico/Documents/gitRepositories/Dirty-E-Commerce-Data-Project/Source Files


### Exploratory Data Analysis

In [2]:
import pandas as pd

# Path to the appliances CSV inside the local source folder
csv_file1 = "./source_files/us-shein-appliances-3987.csv"

# Read the file, then preview only its first row to check the column layout
shen_appliances = pd.read_csv(filepath_or_buffer=csv_file1)
print(shen_appliances.iloc[:1])


                              goods-title-link--jump  \
0  1pc Rechargeable Deep Tissue Muscle Handheld M...   

                         goods-title-link--jump href       rank-title  \
0  https://us.shein.com/1pc-Rechargeable-Deep-Tis...  #1 Best Sellers   

        rank-sub  price discount selling_proposition goods-title-link  
0  in Give Gifts  $2.03     -22%                 NaN              NaN  


In [3]:
# Path to the automotive CSV inside the local source folder
csv_file2 = "./source_files/us-shein-automotive-4110.csv"

# Read the file, then preview only its first row to check the column layout
shein_automotive = pd.read_csv(filepath_or_buffer=csv_file2)
print(shein_automotive.iloc[:1])


                              goods-title-link--jump  \
0  1pc/2PCS Car Cup Coaster, Bling Cup Holder Ins...   

                         goods-title-link--jump href  price discount  \
0  https://us.shein.com/1pc-2PCS-Car-Cup-Coaster-...  $0.68     -60%   

  selling_proposition goods-title-link  
0                 NaN              NaN  


In [4]:
# Load all the csv files with their filename as the dataframe name
def load_all_csv_files(folder_path):
    """Read every CSV file directly inside ``folder_path`` into a DataFrame.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        dict mapping each file name, minus its ``.csv`` extension, to the
        DataFrame read from that file. Entries are inserted in sorted
        file-name order.
    """
    dataframes = {}
    # Sort so the load order does not depend on the order os.listdir returns
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Split off the real extension: this strips only the trailing ".csv"
        # (not every ".csv" substring) and rejects names that merely end in
        # the letters "csv" without a dot.
        df_name, extension = os.path.splitext(filename)

        # Get only the csv files; skip anything that is not a regular file
        if extension.lower() != ".csv" or not os.path.isfile(file_path):
            continue

        # Read the csv
        dataframes[df_name] = pd.read_csv(file_path)

        # Print the first row in the dataframe to confirm
        print(f"Data from {df_name}: \n", dataframes[df_name].head(1))

    return dataframes


# Folder that holds the CSV files to load
folder_path = "./source_files"

# Build the name -> DataFrame dictionary from every CSV in that folder
dataframes = load_all_csv_files(folder_path=folder_path)


Data from us-shein-appliances-3987: 
                               goods-title-link--jump  \
0  1pc Rechargeable Deep Tissue Muscle Handheld M...   

                         goods-title-link--jump href       rank-title  \
0  https://us.shein.com/1pc-Rechargeable-Deep-Tis...  #1 Best Sellers   

        rank-sub  price discount selling_proposition goods-title-link  
0  in Give Gifts  $2.03     -22%                 NaN              NaN  
Data from us-shein-automotive-4110: 
                               goods-title-link--jump  \
0  1pc/2PCS Car Cup Coaster, Bling Cup Holder Ins...   

                         goods-title-link--jump href  price discount  \
0  https://us.shein.com/1pc-2PCS-Car-Cup-Coaster-...  $0.68     -60%   

  selling_proposition goods-title-link  
0                 NaN              NaN  
Data from us-shein-baby_and_maternity-4433: 
                               goods-title-link--jump  \
0  Baby Boys' Dinosaur Printed Vest And Letter Pr...   

                     

### Data Cleaning

In [5]:
# Unpack the dictionary keys (one per loaded CSV) into a list, in insertion order
dataframe_names = [*dataframes]

print(dataframe_names)


['us-shein-appliances-3987', 'us-shein-automotive-4110', 'us-shein-baby_and_maternity-4433', 'us-shein-bags_and_luggage-4299', 'us-shein-beauty_and_health-4267', 'us-shein-curve-2849', 'us-shein-electronics-4395', 'us-shein-home_and_kitchen-3719', 'us-shein-home_textile-3883', 'us-shein-jewelry_and_accessories-3548', 'us-shein-kids-4314', 'us-shein-mens_clothes-1891', 'us-shein-office_and_school_supplies-4233', 'us-shein-pet_supplies-4083', 'us-shein-shoes-4381', 'us-shein-sports_and_outdoors-3853', 'us-shein-swimwear-3761', 'us-shein-tools_and_home_improvement-3903', 'us-shein-toys_and_games-3577', 'us-shein-underwear_and_sleepwear-4019', 'us-shein-womens_clothing-4620']


In [6]:
# Column names to drop from every loaded DataFrame during cleaning.
# Not every file contains all of these columns; only the ones present in a
# given frame are removed.
excluded_columns = [
    "goods-title-link--jump href",  # product page URL
    "goods-title-link--jump",  # product title text
    "rank-title",  # e.g. "#1 Best Sellers"
    "rank-sub",  # e.g. "in Give Gifts"
    "color-count",  # NOTE(review): contents not shown in this notebook
    "blackfridaybelts-bg src",  # NOTE(review): presumably a promo image URL - confirm
    "blackfridaybelts-content",  # NOTE(review): presumably promo text - confirm
    "product-locatelabels-img src",  # NOTE(review): presumably a label image URL - confirm
]


In [7]:
# Function to remove the excluded columns for each dataframe
def remove_columns(df: pd.DataFrame, excluded_columns: list) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns named in ``excluded_columns``.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes a frame that was displayed earlier.

    Args:
        df: Frame to clean.
        excluded_columns: Column names to remove. Names that are not present
            in ``df`` are ignored.

    Returns:
        A new DataFrame holding the remaining columns in their original order.
    """
    # Keep the frame's own column order so the log lines match the data layout
    columns_to_drop = [column for column in df.columns if column in excluded_columns]

    for column in columns_to_drop:
        print(f"Column '{column}' dropped.")

    # Single non-mutating drop instead of one in-place drop per column
    return df.drop(columns=columns_to_drop)


In [8]:
# Clean every loaded frame, writing the result back under the same key.
# Only existing keys are reassigned, so the dictionary size never changes
# while it is being iterated.
for df_name, df in dataframes.items():
    print(f"Cleaning {df_name}")

    # Strip the excluded columns and store the cleaned frame
    df_cleaned = remove_columns(df, excluded_columns)
    dataframes[df_name] = df_cleaned

# Preview the first rows of each cleaned frame
for df_name in dataframes:
    print(f"\n DataFrame {df_name}", dataframes[df_name].head(), sep="\n")


Cleaning us-shein-appliances-3987
Column 'goods-title-link--jump' dropped.
Column 'goods-title-link--jump href' dropped.
Column 'rank-title' dropped.
Column 'rank-sub' dropped.
Cleaning us-shein-automotive-4110
Column 'goods-title-link--jump' dropped.
Column 'goods-title-link--jump href' dropped.
Cleaning us-shein-baby_and_maternity-4433
Column 'goods-title-link--jump' dropped.
Column 'goods-title-link--jump href' dropped.
Column 'color-count' dropped.
Cleaning us-shein-bags_and_luggage-4299
Column 'color-count' dropped.
Column 'goods-title-link--jump' dropped.
Column 'goods-title-link--jump href' dropped.
Column 'rank-title' dropped.
Column 'rank-sub' dropped.
Cleaning us-shein-beauty_and_health-4267
Column 'goods-title-link--jump' dropped.
Column 'goods-title-link--jump href' dropped.
Column 'color-count' dropped.
Column 'rank-title' dropped.
Column 'rank-sub' dropped.
Cleaning us-shein-curve-2849
Column 'color-count' dropped.
Column 'goods-title-link--jump' dropped.
Column 'goods-ti

In [10]:
# Inspect one frame after cleaning; printing the whole DataFrame shows pandas'
# truncated view (first and last rows with "..." in between).
print(dataframes["us-shein-underwear_and_sleepwear-4019"])

     selling_proposition   price discount  \
0     200+ sold recently   $3.39     -70%   
1                    NaN  $20.99      -7%   
2                    NaN  $10.29      NaN   
3      80+ sold recently   $3.74     -65%   
4                    NaN  $13.79      NaN   
...                  ...     ...      ...   
4013                 NaN   $1.91     -40%   
4014  900+ sold recently   $3.99      -7%   
4015  300+ sold recently   $4.89      -6%   
4016  600+ sold recently   $5.99     -20%   
4017  200+ sold recently   $6.59     -15%   

                                       goods-title-link  
0                                                   NaN  
1                                                   NaN  
2                                                   NaN  
3                                                   NaN  
4                                                   NaN  
...                                                 ...  
4013  Men's Bodysuit Lingerie Muscle Men Temptation .