In [1]:
import pandas as pd

product_df = pd.read_csv("data/product_info.csv")

# Tally how many rows carry each product name, then turn the tally into a
# two-column frame: product_name / count.
name_tally = product_df["product_name"].value_counts()
product_counts = name_tally.rename_axis("product_name").reset_index(name="count")

# Names that appear on more than one row.
is_repeated = product_counts["count"] > 1
duplicates = product_counts[is_repeated]

print(f"Total rows: {len(product_df):,}")
print(f"Unique product names: {product_counts.shape[0]:,}")
print(f"Repeated product names: {duplicates.shape[0]}")

duplicates


Total rows: 8,494
Unique product names: 8,415
Repeated product names: 74


Unnamed: 0,product_name,count
0,Discovery Set,3
1,Hand Cream,3
2,Hand Wash,3
3,Mini Perfume Oil Set,3
4,Fragrance Discovery Set,3
...,...,...
69,Perfume Sampler Set,2
70,Cream Bronzer,2
71,Lip Liner,2
72,Lip Gloss,2


In [2]:
# Per-row missing-value flags for the two columns being compared.
rating_missing = product_df["rating"].isna()
reviews_missing = product_df["reviews"].isna()

# Series.equals returns a single bool (not a per-row mask, despite the name):
# True only when both flag series match on every row.
same_missing_mask = rating_missing.equals(reviews_missing)
print(
    "All rows share the same missing status for rating and reviews?",
    same_missing_mask,
)

# XOR is True where exactly one of the two values is missing.
only_one_missing = rating_missing ^ reviews_missing
report_columns = ["product_id", "product_name", "rating", "reviews"]
mismatched_rows = product_df.loc[only_one_missing, report_columns]
print(f"Rows where only one of rating/reviews is missing: {len(mismatched_rows)}")

mismatched_rows.head()


All rows share the same missing status for rating and reviews? True
Rows where only one of rating/reviews is missing: 0


Unnamed: 0,product_id,product_name,rating,reviews


Data cleaning process

In [3]:
import pandas as pd

def clean_data(df):
    """Return a filtered, trimmed and sorted version of the product table.

    Steps, in order:
      1. Keep rows whose 'rating' and 'ingredients' are both present.
      2. Remove the size / variation / extra price / tertiary-category
         columns listed in ``unused_columns``.
      3. Sort by 'secondary_category' (ascending).
      4. Discard rows with no 'secondary_category'.

    The input frame is not modified; every step produces a new frame.
    The original index labels are kept (no reset).

    Raises:
        KeyError: if any referenced column is absent from ``df``.
    """
    unused_columns = [
        'size',
        'variation_type',
        'variation_value',
        'variation_desc',
        'value_price_usd',
        'sale_price_usd',
        'tertiary_category',
        'child_max_price',
        'child_min_price',
    ]

    # A row must have both a rating and an ingredients entry to be kept.
    has_required = df['rating'].notna() & df['ingredients'].notna()
    trimmed = df[has_required].drop(columns=unused_columns)

    # Sort before removing uncategorised rows, so the resulting row order
    # is exactly what sort-then-filter produces.
    ordered = trimmed.sort_values(['secondary_category'])
    return ordered[ordered['secondary_category'].notna()]

# Load the raw product table from the project-relative data directory (the
# same relative path the first cell reads), so the notebook does not depend
# on one machine's absolute path.
df = pd.read_csv("data/product_info.csv")

# Clean a copy so `df` stays available as the untouched raw table.
df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,ingredients,price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,child_count
6575,P445724,Triple Action Cleansing Water - Cleanse + Purify,3902,SEPHORA COLLECTION,17037,4.131,145.0,"['Aqua (Water), Butylene Glycol, Glycerin, Pro...",12.0,0,0,0,0,1,"['Vegan', 'Good for: Pores', 'Clean + Planet P...",Makeup,Accessories,2
5538,P476892,NUDESKIN Citrus Clean Balm & Make-Up Melt,7055,NUDESTIX,602,5.0,1.0,"['Isodecyl Neopentanoate, Isononyl Isononanoat...",35.0,0,0,0,0,0,"['Good for: Dullness/Uneven Texture', 'Vitamin...",Makeup,Accessories,0
7708,P504898,Mini Pink Reusable MakeUp Eraser,8000,The Original MakeUp Eraser,3710,5.0,2.0,['100% Polyester.'],10.0,0,1,1,1,0,"['Fragrance Free', 'Hypoallergenic', 'Alcohol ...",Makeup,Accessories,0
7707,P500018,Smiley MakeUp Eraser 7-Day Set,8000,The Original MakeUp Eraser,5553,4.875,8.0,['100% Polyester.'],25.0,0,0,1,0,0,,Makeup,Accessories,0
7705,P504702,Juicy Optimism 7 Piece MakeUp Eraser Set,8000,The Original MakeUp Eraser,7268,4.5,10.0,['100% Polyester.'],25.0,1,1,0,0,1,,Makeup,Accessories,0
