In [39]:
import re
import nltk
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;-]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
Number_RE = re.compile('[*^0-9]')
Bad_underline = re.compile('[*_*]')
RemoveTag = re.compile('&lt;|br&gt;|b&gt;|ul&gt;|li&gt;|lt|gt|brgt|ligt|ul|b|p|div')

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = RemoveTag.sub('',text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = Number_RE.sub(' ', text) # replace Number symbols by space in text
    text = Bad_underline.sub(' ', text) # replace Underline symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

In [40]:
def classNumberThreshold(arr):
    dropCategory = []

    for key,value in arr.items():
        if(value<=30):
            dropCategory.append(key)
    return dropCategory

In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('../example/walmartNewData.csv')
df.head()

Unnamed: 0,productName,shortDescription,longDescription,itemId,category,subcategory,sub2category,categoryName,subcategoryName,sub2categoryName
0,Rose Cottage Girls' Hunter Green Jacket Dress,"Paired with its' trendy jacket or worn alone, ...",Sean John Boys Tiger Frame Graphic T-Shirt,107,5438.0,7712430.0,7809949.0,Clothing,Kids Clothing,Boys Clothing
1,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,637,5438.0,133197.0,6127105.0,Clothing,Men,Mens Jeans
2,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,638,5438.0,133197.0,6127105.0,Clothing,Men,Mens Jeans
3,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,642,5438.0,133197.0,6127105.0,Clothing,Men,Mens Jeans
4,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,643,5438.0,133197.0,6127105.0,Clothing,Men,Mens Jeans


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138401 entries, 0 to 138400
Data columns (total 10 columns):
productName         138401 non-null object
shortDescription    138401 non-null object
longDescription     138401 non-null object
itemId              138401 non-null int64
category            135674 non-null float64
subcategory         135674 non-null float64
sub2category        134701 non-null float64
categoryName        138401 non-null object
subcategoryName     135674 non-null object
sub2categoryName    134703 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 10.6+ MB


In [43]:
df = df[pd.notnull(df['longDescription'])]
df = df[pd.notnull(df['category'])]
df = df[pd.notnull(df['subcategory'])]
df = df[pd.notnull(df['sub2category'])]
df = df[pd.notnull(df['itemId'])]

In [44]:
df['category'] = df['category'].apply(int)
df['subcategory'] = df['subcategory'].apply(int)
df['sub2category'] = df['sub2category'].apply(int)

In [45]:
df.head()

Unnamed: 0,productName,shortDescription,longDescription,itemId,category,subcategory,sub2category,categoryName,subcategoryName,sub2categoryName
0,Rose Cottage Girls' Hunter Green Jacket Dress,"Paired with its' trendy jacket or worn alone, ...",Sean John Boys Tiger Frame Graphic T-Shirt,107,5438,7712430,7809949,Clothing,Kids Clothing,Boys Clothing
1,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,637,5438,133197,6127105,Clothing,Men,Mens Jeans
2,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,638,5438,133197,6127105,Clothing,Men,Mens Jeans
3,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,642,5438,133197,6127105,Clothing,Men,Mens Jeans
4,Wrangler Men's Relaxed Fit Jeans,These Wrangler Men's Relaxed Fit Jeans are gre...,&lt;br&gt;&lt;b&gt;Wrangler Men's Relaxed Fit ...,643,5438,133197,6127105,Clothing,Men,Mens Jeans


In [46]:
le = LabelEncoder()
le.fit(df.category)
target = le.classes_
labels = le.transform(df.category)

le.fit(df.subcategory)
subtarget = le.classes_
sublabels = le.transform(df.subcategory)

le.fit(df.sub2category)
sub2target = le.classes_
sub2labels = le.transform(df.sub2category)

In [47]:
categorydict = dict(df.groupby(['category','categoryName']).groups.keys()) 
subcategorydict = dict(df.groupby(['subcategory','subcategoryName']).groups.keys()) 
targetName = [categorydict[ele] for ele in target]
subTargetName = [subcategorydict[ele] for ele in subtarget]

In [48]:
y = pd.DataFrame({'id':df.itemId,'category':labels, 'subcategory':sublabels,'sub2category':sub2labels,'description':df.longDescription})

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.longDescription, y, test_size=0.2, random_state = 27)

In [50]:
pd.DataFrame({'X_train':X_train}).to_csv("Walmart/X_train.csv",index=False,compression=None)

In [51]:
pd.DataFrame({'X_test':X_test}).to_csv("Walmart/X_test.csv",index=False,compression=None)

In [52]:
y_train.to_csv("Walmart/y_train.csv",index=False,compression=None)

In [53]:
y_test.to_csv("Walmart/y_test.csv",index=False,compression=None)

In [54]:
pd.DataFrame({'target':target,'targetName':targetName,'newLabel':np.unique(labels)}).to_csv("Walmart/label.csv",index=False,compression=None)
pd.DataFrame({'subtarget':subtarget,'subTargetName':subTargetName,'newSubLabel':np.unique(sublabels)}).to_csv("Walmart/sublabel.csv",index=False,compression=None)
pd.DataFrame({'sub2target':sub2target,'newSub2Label':np.unique(sub2labels)}).to_csv("Walmart/sub2label.csv",index=False,compression=None)


In [55]:
import pandas as pd
import numpy as np

df = pd.read_csv('../example/fliptkart.csv')
df.head()

Unnamed: 0,uniq_id,product_url,product_name,pid,description,category_main,category_sub1,category_sub2
0,c2d766ca982eca8304150849735ffef9,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,SRTEH2FF9KEDEFGF,Key Features of Alisha Solid Women's Cycling S...,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear"
1,7f7036a6d550aaa89d34c77bd39a5e48,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,Furniture,Living Room Furniture,Sofa Beds & Futons
2,f449ec65dcbc041b6ae5e6a32717d01b,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,SHOEH4GRSUBJGZXE,Key Features of AW Bellies Sandals Wedges Heel...,Footwear,Women's Footwear,Ballerinas
3,0973b37acd0c664e3de26e97e5571454,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,SRTEH2F6HUZMQ6SJ,Key Features of Alisha Solid Women's Cycling S...,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear"
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,PSOEH3ZYDMSYARJ5,Specifications of Sicons All Purpose Arnica Do...,Pet Supplies,Grooming,Skin & Coat Care


In [56]:
df = df[pd.notnull(df['description'])]
CleanText = df['description'].apply(clean_text)

In [57]:
df = df[pd.notnull(df['category_main'])]
df = df[pd.notnull(df['pid'])]

In [58]:
dropCategoryCode  = classNumberThreshold(df.category_main.value_counts())
dropSubCategoryCode  = classNumberThreshold(df.category_sub1.value_counts())
dropSub2CategoryCode  = classNumberThreshold(df.category_sub2.value_counts())

for i in dropCategoryCode:
    df = df[df.category_main!=i]
    
for i in dropSubCategoryCode:
    df = df[df.category_sub1!=i]

for i in dropSub2CategoryCode:
    df = df[df.category_sub2!=i]

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15487 entries, 0 to 19997
Data columns (total 8 columns):
uniq_id          15487 non-null object
product_url      15487 non-null object
product_name     15487 non-null object
pid              15487 non-null object
description      15487 non-null object
category_main    15487 non-null object
category_sub1    15487 non-null object
category_sub2    15487 non-null object
dtypes: object(8)
memory usage: 1.1+ MB


In [60]:
le = LabelEncoder()
le.fit(df.category_main)
target = le.classes_
labels = le.transform(df.category_main)

le.fit(df.category_sub1)
subtarget = le.classes_
sublabels = le.transform(df.category_sub1)

le.fit(df.category_sub2)
sub2target = le.classes_
sub2labels = le.transform(df.category_sub2)

In [61]:
# categorydict = dict(df.groupby(['category','categoryName']).groups.keys()) 
# subcategorydict = dict(df.groupby(['subcategory','subcategoryName']).groups.keys()) 
# targetName = [categorydict[ele] for ele in target]
# subTargetName = [subcategorydict[ele] for ele in subtarget]

In [62]:
y = pd.DataFrame({'id':df.pid,'category':labels, 'subcategory':sublabels,'sub2category':sub2labels,'description':df.description})

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.description, y, test_size=0.2, random_state = 27)

In [64]:
pd.DataFrame({'X_train':X_train}).to_csv("FlipKart/X_train.csv",index=False,compression=None)

In [65]:
pd.DataFrame({'X_test':X_test}).to_csv("FlipKart/X_test.csv",index=False,compression=None)

In [66]:
y_train.to_csv("FlipKart/y_train.csv",index=False,compression=None)

In [67]:
y_test.to_csv("FlipKart/y_test.csv",index=False,compression=None)

In [68]:
pd.DataFrame({'target':target,'newLabel':np.unique(labels)}).to_csv("FlipKart/label.csv",index=False,compression=None)
pd.DataFrame({'subtarget':subtarget,'newSubLabel':np.unique(sublabels)}).to_csv("FlipKart/sublabel.csv",index=False,compression=None)
pd.DataFrame({'sub2target':sub2target,'newSub2Label':np.unique(sub2labels)}).to_csv("FlipKart/sub2label.csv",index=False,compression=None)


In [69]:
y_train.shape

(12389, 5)

In [70]:
df = pd.read_csv('../example/amazon_co-ecommerce_sample.csv')
df.head()

Unnamed: 0,uniq_id,product_name,manufacturer,price,number_available_in_stock,number_of_reviews,number_of_answered_questions,average_review_rating,amazon_category_and_sub_category,customers_who_bought_this_item_also_bought,description,product_information,product_description,items_customers_buy_after_viewing_this_item,customer_questions_and_answers,customer_reviews,sellers
0,eac7efa5dbd3d667f26eb3d3ab504464,Hornby 2014 Catalogue,Hornby,£3.42,5 new,15,1.0,4.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R8150-Catalogue...,Product Description Hornby 2014 Catalogue Box ...,Technical Details Item Weight640 g Product Dim...,Product Description Hornby 2014 Catalogue Box ...,http://www.amazon.co.uk/Hornby-R8150-Catalogue...,Does this catalogue detail all the previous Ho...,Worth Buying For The Pictures Alone (As Ever) ...,"{""seller""=>[{""Seller_name_1""=>""Amazon.co.uk"", ..."
1,b17540ef7e86e461d37f3ae58b7b72ac,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,£16.99,,2,1.0,4.5 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Christmas-Holiday-Expr...,Size Name:Large FunkyBuys® Large Christmas Hol...,Technical Details Manufacturer recommended age...,Size Name:Large FunkyBuys® Large Christmas Hol...,http://www.amazon.co.uk/Christmas-Holiday-Expr...,can you turn off sounds // hi no you cant turn...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,"{""seller""=>{""Seller_name_1""=>""UHD WHOLESALE"", ..."
2,348f344247b0c1a935b1223072ef9d8a,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,£9.99,2 new,17,2.0,3.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Classic-Train-Lights-B...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,Technical Details Manufacturer recommended age...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,http://www.amazon.co.uk/Train-With-Tracks-Batt...,What is the gauge of the track // Hi Paul.Trut...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,"{""seller""=>[{""Seller_name_1""=>""DEAL-BOX"", ""Sel..."
3,e12b92dbb8eaee78b22965d2a9bbbd9f,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,£39.99,,1,2.0,5.0 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...,Technical Details Item Weight259 g Product Dim...,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...,,,I love it // 5.0 // 22 July 2013 // By\n \n...,
4,e33a9adeed5f36840ccc227db4682a36,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,£32.19,,3,2.0,4.7 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R6367-RailRoad-...,Product Description Hornby RailRoad 0-4-0 Gild...,Technical Details Item Weight159 g Product Dim...,Product Description Hornby RailRoad 0-4-0 Gild...,http://www.amazon.co.uk/Hornby-R2672-RailRoad-...,,Birthday present // 5.0 // 14 April 2014 // By...,


In [71]:
df = df[pd.notnull(df['amazon_category_and_sub_category'])]
df = df[pd.notnull(df['description'])]
df = df[pd.notnull(df['uniq_id'])]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8692 entries, 0 to 9998
Data columns (total 17 columns):
uniq_id                                        8692 non-null object
product_name                                   8692 non-null object
manufacturer                                   8691 non-null object
price                                          7484 non-null object
number_available_in_stock                      6601 non-null object
number_of_reviews                              8683 non-null object
number_of_answered_questions                   8045 non-null float64
average_review_rating                          8683 non-null object
amazon_category_and_sub_category               8692 non-null object
customers_who_bought_this_item_also_bought     7807 non-null object
description                                    8692 non-null object
product_information                            8639 non-null object
product_description                            8692 non-null object
items_cu

In [72]:
category = []
subcategory = []
sub2category = []
for ele in df['amazon_category_and_sub_category'].apply(lambda x: str(x).split('>')):
    category.append(ele[0])
    if(len(ele)>2):
        subcategory.append(ele[1])
        sub2category.append(ele[2])
    elif(len(ele)>1):
        subcategory.append(ele[1])
        sub2category.append(np.NaN)
    else:
        subcategory.append(np.NaN)
        sub2category.append(np.NaN)

data= {'uniq_id':df['uniq_id'], 'product_name':df['product_name'],'category_main':category,'category_sub1':subcategory,'category_sub2':sub2category,'description':df['description']}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,uniq_id,product_name,category_main,category_sub1,category_sub2,description
0,eac7efa5dbd3d667f26eb3d3ab504464,Hornby 2014 Catalogue,Hobbies,Model Trains & Railway Sets,Rail Vehicles,Product Description Hornby 2014 Catalogue Box ...
1,b17540ef7e86e461d37f3ae58b7b72ac,FunkyBuys® Large Christmas Holiday Express Fes...,Hobbies,Model Trains & Railway Sets,Rail Vehicles,Size Name:Large FunkyBuys® Large Christmas Hol...
2,348f344247b0c1a935b1223072ef9d8a,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,Hobbies,Model Trains & Railway Sets,Rail Vehicles,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...
3,e12b92dbb8eaee78b22965d2a9bbbd9f,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hobbies,Model Trains & Railway Sets,Rail Vehicles,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...
4,e33a9adeed5f36840ccc227db4682a36,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hobbies,Model Trains & Railway Sets,Rail Vehicles,Product Description Hornby RailRoad 0-4-0 Gild...


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8692 entries, 0 to 9998
Data columns (total 6 columns):
uniq_id          8692 non-null object
product_name     8692 non-null object
category_main    8692 non-null object
category_sub1    8692 non-null object
category_sub2    4969 non-null object
description      8692 non-null object
dtypes: object(6)
memory usage: 475.3+ KB


In [74]:
df.description = df['description'].apply(clean_text)

In [75]:
df = df[pd.notnull(df['category_main'])]

In [76]:
dropCategoryCode  = classNumberThreshold(df.category_main.value_counts())
dropSubCategoryCode  = classNumberThreshold(df.category_sub1.value_counts())
dropSub2CategoryCode  = classNumberThreshold(df.category_sub2.value_counts())

for i in dropCategoryCode:
    df = df[df.category_main!=i]
    
for i in dropSubCategoryCode:
    df = df[df.category_sub1!=i]


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8166 entries, 0 to 9998
Data columns (total 6 columns):
uniq_id          8166 non-null object
product_name     8166 non-null object
category_main    8166 non-null object
category_sub1    8166 non-null object
category_sub2    4827 non-null object
description      8166 non-null object
dtypes: object(6)
memory usage: 446.6+ KB


In [78]:
le = LabelEncoder()
le.fit(df.category_main)
target = le.classes_
labels = le.transform(df.category_main)

le.fit(df.category_sub1.apply(str))
subtarget = le.classes_
sublabels = le.transform(df.category_sub1)



In [79]:
y = pd.DataFrame({'id':df.uniq_id,'category':labels, 'subcategory':sublabels,'description':df.description})

In [80]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.description, y, test_size=0.2, random_state = 27)

In [81]:
pd.DataFrame({'X_train':X_train}).to_csv("Amazon/X_train.csv",index=False,compression=None)

In [82]:
pd.DataFrame({'X_test':X_test}).to_csv("Amazon/X_test.csv",index=False,compression=None)

In [83]:
y_train.to_csv("Amazon/y_train.csv",index=False,compression=None)

In [84]:
y_test.to_csv("Amazon/y_test.csv",index=False,compression=None)

In [85]:
pd.DataFrame({'target':target,'newLabel':np.unique(labels)}).to_csv("Amazon/label.csv",index=False,compression=None)
pd.DataFrame({'subtarget':subtarget,'newSubLabel':np.unique(sublabels)}).to_csv("Amazon/sublabel.csv",index=False,compression=None)
# pd.DataFrame({'sub2target':sub2target,'newSub2Label':np.unique(sub2labels)}).to_csv("FlipKart/sub2label.csv",index=False,compression=None)


In [86]:
df = pd.read_csv('../example/combine.csv')
df.head()

Unnamed: 0,uniq_id,product_name,description,category_main,category_sub1,category_sub2,category,subcategory
0,c2e71e7c97489d4efa5eab796ae354ac,Zyxel VMG1312-B10A VDSL2 Wireless N VDSL2 4-po...,Buy Zyxel VMG1312-B10A VDSL2 Wireless N VDSL2 ...,computers,network components,routers,400000,400200
1,8d5196bb4b2ad74b557768e9f666c21c,Zyxel PLA4231 500 Mbps Powerline Wireless N Ex...,Buy Zyxel PLA4231 500 Mbps Powerline Wireless ...,computers,network components,routers,400000,400200
2,833a87b0af3b84fe246d2ebf271af177,Zyxel PLA-4205,Buy Zyxel PLA-4205 only for Rs. 8100 from Flip...,computers,network components,routers,400000,400200
3,9c41fc08bd9e99ba7550816f00c9b9bd,Zyxel p-661h-d1,Buy Zyxel p-661h-d1 only for Rs. 4042 from Fli...,computers,network components,routers,400000,400200
4,f3f433714d8c535d28d0663c021e3769,Zyxel NWA1100N 802.11 b/g/n PoE Access Point,Buy Zyxel NWA1100N 802.11 b/g/n PoE Access Poi...,computers,network components,routers,400000,400200


In [87]:
def checkNanin(text):
    text = str(text)
    if(text=="nan"):
        return ""
    else:
        return text
df.description = df.description.apply(checkNanin)    

In [88]:
def checkNanin(text):
    text = str(text)
    if(text=="nan"):
        print(text)
df.description.apply(checkNanin)    

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
28660    None
28661    None
28662    None
28663    None
28664    None
28665    None
28666    None
28667    None
28668    None
28669    None
28670    None
28671    None
28672    None
28673    None
28674    None
28675    None
28676    None
28677    None
28678    None
28679    None
28680    None
28681    None
28682    None
28683    None
28684    None
28685    None
28686    None
28687    None
28688    None
28689    None
Name: description, Length: 28690, dtype: object

In [89]:
df.description = df['description'].apply(clean_text)

In [90]:
df = df[pd.notnull(df['category'])]

df = df[pd.notnull(df['uniq_id'])]
df = df[pd.notnull(df['description'])]
df = df.dropna(subset=['description'])

In [91]:
dropCategoryCode  = classNumberThreshold(df.category.value_counts())
dropSubCategoryCode  = classNumberThreshold(df.subcategory.value_counts())

for i in dropCategoryCode:
    df = df[df.category!=i]
    
for i in dropSubCategoryCode:
    df = df[df.subcategory!=i]


In [92]:
le = LabelEncoder()
le.fit(df.category)
target = le.classes_
labels = le.transform(df.category)

le.fit(df.subcategory)
subtarget = le.classes_
sublabels = le.transform(df.subcategory)



In [93]:
y = pd.DataFrame({'id':df.uniq_id,'category':labels, 'subcategory':sublabels, 'description':df.description})

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.description, y, test_size=0.2, random_state = 27)

In [95]:
X_train.shape, X_test.shape

((22832,), (5708,))

In [96]:
count = 0 
for i in pd.isnull(X_train).values:
    if(i):
        print(count)
    count = count +1

In [97]:
Xtrain = pd.DataFrame({'X_train':X_train})
Xtest = pd.DataFrame({'X_test':X_test})

In [98]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22832 entries, 2469 to 5159
Data columns (total 1 columns):
X_train    22832 non-null object
dtypes: object(1)
memory usage: 356.8+ KB


In [99]:
Xtrain.to_csv("Combine/X_train.csv",index=False,compression=None)

In [100]:
Xtest.to_csv("Combine/X_test.csv",index=False,compression=None)

In [101]:
y_train.to_csv("Combine/y_train.csv",index=False,compression=None)

In [102]:
y_test.to_csv("Combine/y_test.csv",index=False,compression=None)

In [103]:
pd.DataFrame({'target':target,'newLabel':np.unique(labels)}).to_csv("Combine/label.csv",index=False,compression=None)
pd.DataFrame({'subtarget':subtarget,'newSubLabel':np.unique(sublabels)}).to_csv("Combine/sublabel.csv",index=False,compression=None)
# pd.DataFrame({'sub2target':sub2target,'newSub2Label':np.unique(sub2labels)}).to_csv("FlipKart/sub2label.csv",index=False,compression=None)
