In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sparse_dot_topn import awesome_cossim_topn

In [2]:
df = pd.read_csv('superstore_final.csv',encoding= 'unicode_escape')

In [3]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Category,Sub-Category,ProductName,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Rating
0,42433,AG-2011-2040,1/1/2014,6/1/2014,Standard Class,8,Toby Braunhardt,Consumer,Constantine,Constantine,...,Office Supplies,Storage,"Tenex Lockers, Blue",408.3,2,0.0,106.14,35.46,Medium,4.0
1,22253,IN-2011-47883,1/1/2014,8/1/2014,Standard Class,9,Joseph Holt,Consumer,Wagga Wagga,New South Wales,...,Office Supplies,Storage,"Tenex Lockers, Blue",120.366,3,0.1,36.036,9.72,Medium,4.5
2,48883,HU-2011-1220,1/1/2014,5/1/2014,Second Class,12,Annie Thurman,Consumer,Budapest,Budapest,...,Office Supplies,Storage,"Tenex Lockers, Blue",66.12,4,0.0,29.64,8.17,High,4.0
3,11731,IT-2011-3647632,1/1/2014,5/1/2014,Second Class,20,Eugene Moren,Home Office,Stockholm,Stockholm,...,Office Supplies,Storage,"Tenex Lockers, Blue",44.865,3,0.5,-26.055,4.82,High,4.0
4,22255,IN-2011-47883,1/1/2014,8/1/2014,Standard Class,24,Patrick O'Donnell,Consumer,Dhaka,Dhaka,...,Office Supplies,Storage,"Tenex Lockers, Blue",113.67,5,0.1,37.77,4.7,Medium,4.0


In [4]:
product_list=df['ProductID'].unique()
products = pd.DataFrame()
products['ProductID'] = product_list
products

Unnamed: 0,ProductID
0,1
1,2
2,3
3,4
4,5
...,...
411,436
412,437
413,438
414,439


In [5]:
vals=df['ProductName'].unique().astype('U')
vals

array(['Tenex Lockers, Blue', 'Acme Trimmer, High Speed',
       'Tenex Box, Single Width', 'Enermax Note Cards, Premium',
       'Eldon Light Bulb, Duo Pack',
       'Eaton Computer Printout Paper, 8.5 x 11',
       'Brother Personal Copier, Laser',
       'Sauder Facets Collection Library, Sky Alder Finish',
       'Fellowes Lockers, Wire Frame', 'Tenex Trays, Single Width',
       'KitchenAid Coffee Grinder, Red',
       'Hamilton Beach Refrigerator, Silver',
       'Advantus Photo Frame, Erganomic',
       'Binney & Smith Pencil Sharpener, Easy-Erase',
       'Kleencut Ruler, High Speed', 'Memorex Memory Card, USB',
       'Sanford Pencil Sharpener, Water Color',
       'Wilson Jones Hole Reinforcements, Economy',
       'Smead File Folder Labels, Adjustable',
       'GlobeWeis Peel and Seal, Set of 50',
       'Acco Hole Reinforcements, Recycled',
       'Avery Hole Reinforcements, Durable',
       'Wilson Jones Hole Reinforcements, Clear',
       'Fellowes File Cart, Industrial',

In [6]:
def ngrams_analyzer(string):
        string = re.sub(r'[,-./]', r'', string)
        ngrams = zip(*[string[i:] for i in range(5)]) #N-gram length
        return [''.join(ngram) for ngram in ngrams]

In [7]:
#constructing vectorizer
vectorizer=TfidfVectorizer(analyzer=ngrams_analyzer)

In [8]:
#Build the matrix
tfidf_matrix=vectorizer.fit_transform(vals)

In [11]:
tfidf_matrix[:10,:]

<10x4062 sparse matrix of type '<class 'numpy.float64'>'
	with 239 stored elements in Compressed Sparse Row format>

In [41]:
cosine_matrix = awesome_cossim_topn(tfidf_matrix, tfidf_matrix.transpose(), vals.size,0.25)

In [42]:
# Instaniate our lookup hash table
group_lookup = {}

In [34]:
def find_group(row, col):
    # If either the row or the col string have already been given
    # a group, return that group. Otherwise return none
    if row in group_lookup:
        return group_lookup[row]
    elif col in group_lookup:
        return group_lookup[col]
    else:
        return None

In [35]:
def add_vals_to_lookup(group, row, col):
    # Once we know the group name, set it as the value
    # for both strings in the group_lookup
    group_lookup[row] = group
    group_lookup[col] = group


In [36]:
def add_pair_to_lookup(row, col):
    # in this function we'll add both the row and the col to the lookup
    group = find_group(row, col)  # first, see if one has already been added
    if group is not None:
        # if we already know the group, make sure both row and col are in lookup
        add_vals_to_lookup(group, row, col)
    else:
        # if we get here, we need to add a new group.
        # The name is arbitrary, so just make it the row
        add_vals_to_lookup(row, row, col)

In [43]:
# Build a coordinate matrix
coo_matrix = cosine_matrix.tocoo()

In [44]:
coo_matrix

<413x413 sparse matrix of type '<class 'numpy.float64'>'
	with 2563 stored elements in COOrdinate format>

In [45]:
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        add_pair_to_lookup(vals[row], vals[col])

df['Group'] = df['ProductName'].map(group_lookup).fillna(df['ProductName'])

In [46]:
group_lookup

{'Tenex Lockers, Blue': 'Tenex Box, Single Width',
 'Tenex Lockers, Single Width': 'Tenex Box, Single Width',
 'Rogers Lockers, Wire Frame': 'Tenex Box, Single Width',
 'Fellowes Lockers, Wire Frame': 'Tenex Box, Single Width',
 'Fellowes Lockers, Single Width': 'Tenex Box, Single Width',
 'Fellowes Lockers, Industrial': 'Tenex Box, Single Width',
 'Acme Trimmer, High Speed': 'Acme Scissors, Easy Grip',
 'Acme Trimmer, Steel': 'Acme Trimmer, High Speed',
 'Acme Box Cutter, High Speed': 'Acme Scissors, Easy Grip',
 'Kleencut Ruler, High Speed': 'Acme Scissors, Easy Grip',
 'Acme Scissors, High Speed': 'Acme Scissors, Easy Grip',
 'Stiletto Shears, High Speed': 'Acme Scissors, Easy Grip',
 'Fiskars Trimmer, Serrated': 'Acme Trimmer, High Speed',
 'Tenex Box, Single Width': 'Tenex Box, Single Width',
 'Rogers Box, Single Width': 'Tenex Box, Single Width',
 'Eldon Box, Single Width': 'Tenex Box, Single Width',
 'Fellowes Box, Single Width': 'Tenex Box, Single Width',
 'Tenex Trays, Single 

In [54]:
df.groupby(['ProductName','Group']).count()['ProductID']

ProductName                                      Group                                                           
3M Hangers With Command Adhesive                 3M Hangers With Command Adhesive                                      1
9-3/4 Diameter Round Wall Clock                  9-3/4 Diameter Round Wall Clock                                      16
Acco 3-Hole Punch, Recycled                      Wilson Jones Hole Reinforcements, Economy                             2
Acco 6 Outlet Guardian Premium Surge Suppressor  Acco 6 Outlet Guardian Premium Surge Suppressor                       1
Acco Binder Covers, Durable                      Wilson Jones Hole Reinforcements, Economy                           135
                                                                                                                    ... 
Xerox Computer Printout Paper, 8.5 x 11          Eaton Computer Printout Paper, 8.5 x 11                               5
Xerox Memo Slips, Multicolor           

In [66]:
df[df["ProductName"]=='Acco Binder Covers, Durable'].groupby(['Group']).count()

Unnamed: 0_level_0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Category,Sub-Category,ProductName,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Rating
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Wilson Jones Hole Reinforcements, Economy",135,135,135,135,135,135,135,135,135,135,...,135,135,135,135,135,135,135,135,135,135


In [67]:
df[df["Group"]=='Wilson Jones Hole Reinforcements, Economy'].groupby(['ProductName']).count()

Unnamed: 0_level_0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Category,Sub-Category,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Rating,Group
ProductName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Acco 3-Hole Punch, Recycled",2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Acco Binder Covers, Durable",135,135,135,135,135,135,135,135,135,135,...,135,135,135,135,135,135,135,135,135,135
"Acco Binding Machine, Economy",21,21,21,21,21,21,21,21,21,21,...,21,21,21,21,21,21,21,21,21,21
"Acco Binding Machine, Recycled",123,123,123,123,123,123,123,123,123,123,...,123,123,123,123,123,123,123,123,123,123
"Acco Hole Reinforcements, Recycled",362,362,362,362,362,362,362,362,362,362,...,362,362,362,362,362,362,362,362,362,362
"Acco Index Tab, Clear",33,33,33,33,33,33,33,33,33,33,...,33,33,33,33,33,33,33,33,33,33
"Ames Mailers, Recycled",12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
"Avery 3-Hole Punch, Recycled",55,55,55,55,55,55,55,55,55,55,...,55,55,55,55,55,55,55,55,55,55
"Avery Binder Covers, Economy",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Avery Binder, Recycled",52,52,52,52,52,52,52,52,52,52,...,52,52,52,52,52,52,52,52,52,52


In [75]:
output_df=df[['ProductName','Group']]

In [79]:
output_df=output_df.drop_duplicates().reset_index()

In [85]:
output_df=output_df[['ProductName','Group']].sort_values(by='Group')
output_df

Unnamed: 0,ProductName,Group
128,3M Hangers With Command Adhesive,3M Hangers With Command Adhesive
42,"Tenex Clock, Black",9-3/4 Diameter Round Wall Clock
169,9-3/4 Diameter Round Wall Clock,9-3/4 Diameter Round Wall Clock
252,"Howard Miller 13"" Diameter Pewter Finish Round...",9-3/4 Diameter Round Wall Clock
304,"Linden 10"" Round Wall Clock, Black",9-3/4 Diameter Round Wall Clock
...,...,...
195,Xerox 216,Xerox 205
367,"SanDisk Parchment Paper, Recycled","Xerox Parchment Paper, Premium"
58,"Xerox Parchment Paper, Premium","Xerox Parchment Paper, Premium"
274,Southworth Parchment Paper & Envelopes,"Xerox Parchment Paper, Premium"


In [None]:
output_df.to_csv('output_df.csv')