In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the title/description records and the main-category -> sub-category table.
df = pd.read_csv("description_data.csv")
category_matching = pd.read_csv("category_matching_modified.csv")

# For every main category, collect the array of distinct sub-categories
# listed under it in the matching table.
sub_category_dict = {
    main_category: group.unique()
    for main_category, group in category_matching.groupby("main_category")["sub_category"]
}


In [15]:
# Inspect the main-category -> sub-category arrays built from the matching table.
sub_category_dict

{'Other': array(['Xbox 360 Games, Consoles & Accessories',
        'Vacuum Cleaners & Floor Care',
        'PlayStation Vita Games, Consoles & Accessories',
        'Wii U Games, Consoles & Accessories',
        'PlayStation 4 Games, Consoles & Accessories',
        'Shaving & Hair Removal Products', 'Fabric Decorating',
        'Video Games', 'Automotive Tires & Wheels',
        'Wellness & Relaxation Products', 'Computer Monitors',
        'Printmaking Supplies', 'Tablet Replacement Parts',
        'Computer External Components',
        'Nintendo Switch Consoles, Games & Accessories',
        'Party Decorations', 'Mac Games & Accessories', 'Bedding',
        'PC Games & Accessories', 'Wall Art', 'Vehicle Electronics',
        'Games & Accessories', 'Gift Wrapping Supplies', 'Fasteners',
        'Vision Products', 'Arts, Crafts & Sewing Storage',
        'Science Education Supplies', 'Material Handling Products',
        'eBook Readers & Accessories', 'Lab & Scientific Products',
   

In [19]:
# Fitted TF-IDF models keyed by the exact tuple of candidate sub-category names.
# Keying on content (not on the main-category label) means a rebuilt
# `sub_category_dict` can never be served a stale model.
_SUBCATEGORY_TFIDF_CACHE = {}


def _get_subcategory_tfidf(sub_categories):
    """Return ``(vectorizer, tfidf_matrix)`` fitted on ``sub_categories``, fitting at most once per distinct list."""
    cache_key = tuple(sub_categories)
    if cache_key not in _SUBCATEGORY_TFIDF_CACHE:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(sub_categories)
        _SUBCATEGORY_TFIDF_CACHE[cache_key] = (vectorizer, tfidf_matrix)
    return _SUBCATEGORY_TFIDF_CACHE[cache_key]


def find_best_subcategory(title, main_category, threshold=0.6):
    """Pick the sub-category of ``main_category`` whose name is most similar to ``title``.

    Similarity is the cosine between TF-IDF vectors, where the vocabulary is
    fitted on the candidate sub-category names only. Title words that do not
    occur in any candidate name are therefore ignored by ``transform``, so a
    title sharing a single word with a short sub-category name can still
    score highly.

    Parameters
    ----------
    title : str
        Text to classify. Anything that is not a non-blank string (for
        example a missing value) yields ``"Other"``.
    main_category : hashable
        Key looked up in the module-level ``sub_category_dict``.
    threshold : float, default 0.6
        The best score must be strictly greater than this value for the
        match to be accepted.

    Returns
    -------
    str
        The best-matching sub-category name, or ``"Other"`` when the main
        category has no candidates, the title is unusable, or the best
        score does not exceed ``threshold``.
    """
    sub_categories = sub_category_dict.get(main_category, [])
    if len(sub_categories) == 0:
        return "Other"
    # A non-string title would make the vectorizer raise; a blank one can
    # never match. Either way there is nothing to compare.
    if not isinstance(title, str) or not title.strip():
        return "Other"

    # Reuse the model fitted for this candidate list instead of refitting
    # it on every call.
    vectorizer, tfidf_matrix = _get_subcategory_tfidf(sub_categories)
    title_vector = vectorizer.transform([title])

    # `similarities` has a single row, so the flat argmax is directly the
    # index of the winning candidate.
    similarities = cosine_similarity(title_vector, tfidf_matrix)
    best_match_idx = similarities.argmax()
    best_match_score = similarities.max()
    return sub_categories[best_match_idx] if best_match_score > threshold else "Other"

In [20]:
# Label every row with its best-matching sub-category. Walking the two needed
# columns in lockstep avoids constructing a Series per row, as
# `df.apply(..., axis=1)` does, and also behaves sensibly on an empty frame.
# The list is assigned positionally: one label per row, in row order.
df["sub_category"] = [
    find_best_subcategory(title, main_category)
    for title, main_category in zip(df["TITLE"], df["main_category"])
]

In [21]:
# Show the frame, which now carries the `sub_category` column assigned above.
df

Unnamed: 0,TITLE,DESCRIPTION,main_category,sub_category
0,Drill America DWCTL Series High-Speed Steel En...,"DRILL AMERICA 7/8"" HS 2FSE LONG ENDMILL (DWCTL...",home & kitchen,Other
1,Whats Your Kick® Chess Lover Inspired Printed ...,This mug is made up of fine quality ceramic ma...,home & kitchen,Other
2,TravisLappy Keyboard for Dell Inspiron 3541 35...,Travislappy keyboard for dell inspiron 3541 35...,music,Other
3,Korky 528PRO Pro Grade Max Performance Fill Va...,FILL VALVES,appliances,Other
4,Kristin Ess Weightless Shine Leave-In Conditioner,Kristin Ess Weightless Shine Leave-In Conditioner,beauty & health,Other
...,...,...,...,...
199995,Fariox Plastic Wall Mounted Cosmetic Organizer...,"Could be set up in kitchen, bathroom, living r...",home & kitchen,Data Storage
199996,MADSABRE 12.5 in Handmade Forged Optimal Full ...,"""Features:<br> 100% Quality Assurance&nbsp;<br...",Other,Outdoor Recreation
199997,MNTC Lord Vishnu Beautiful Paper Poster (Paper...,Mntc beautiful paper print poster (size 12 inc...,home & kitchen,Other
199998,Just Love Womens Solid Jacket 4501-NEW-CRL-S,<b> STAY COMFY AND WARM WITH SCRUB JACKETS DES...,women's clothing,Other


In [25]:
# Keep only the rows that were given a concrete sub-category (anything but "Other").
df.loc[df["sub_category"].ne("Other")]

Unnamed: 0,TITLE,DESCRIPTION,main_category,sub_category
11,Adhi Shree Fashion Women's Rayon Printed Anark...,<p><strong>Adhi Shree Fashion</strong></p><p>A...,women's clothing,Women's Clothing
13,WZHKIDS Kids Boys Girls Adjustable Straps Snea...,"Breathable casual shoes, comfortable, new fash...",kids' fashion,Baby Girls' Clothing & Shoes
22,GRANDVILLA Combo of Baby All in One Washable R...,<p><strong>About Product &amp; Brand:-</strong...,toys & baby products,Baby
31,Nike Kids Boy's Dry Short Sleeve Training T-Sh...,Nike Kids Boy's Dry Short Sleeve Training T-Sh...,Other,Toilet Training Products
36,Milumia Women's Floral Drawstring Front Bathin...,0,women's clothing,Women's Clothing
...,...,...,...,...
199985,CIPRAMO SPORTS Liteweight Casual Fashionable L...,CIPRAMO shoes are designed to keeping in mind ...,men's shoes,Men's Shoes
199987,VOWEL Twin Bell Table Metal Alarm Clock for He...,<p><strong>VOWEL TWIN BELL</strong></p> <p><st...,home & kitchen,Light Bulbs
199988,STORE99® Women Solid Bandanas Microfiber Tubul...,Women Solid Bandanas Microfiber Tubular Multif...,Other,Outdoor Recreation
199995,Fariox Plastic Wall Mounted Cosmetic Organizer...,"Could be set up in kitchen, bathroom, living r...",home & kitchen,Data Storage


In [29]:
# Read the untruncated title of the row labelled 199995 with a single
# label-based lookup rather than chained indexing.
df.loc[199995, "TITLE"]

'Fariox Plastic Wall Mounted Cosmetic Organizer Storage Shelf , Multicolour'

In [30]:
# Persist the labelled frame; the row index is not written to the file.
df.to_csv(path_or_buf="description_data_with_subcategory.csv", index=False)