In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pylab as plt

In [45]:
nike_df = pd.read_csv("dataset/nike_2020_04_13.csv")
adidas_df = pd.read_csv("dataset/Adidas final.csv", delimiter=";")

In [46]:
useful_cols = ["Product Name", "Sale Price", "Brand", "Description","Product ID"]
my_nike  = nike_df[useful_cols]
my_adidas = adidas_df[useful_cols]

In [47]:
def checkNull(df, features):
    for col in features:
        print(col + " " + str(df[col].isnull().sum()))
        
def fillNull(df, features):
    for feature in useful_cols:
        my_nike[feature] = my_nike[feature].fillna('')
        
def combined_features(row):
    combine_cols = ["Product Name", "Sale Price", "Brand", "Description"]
    res = ""
    for feature in combine_cols:
        res += str(row[feature]) + " "
    return res

In [48]:
checkNull(my_nike, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 3
Product ID 0


In [49]:
checkNull(my_adidas, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 0
Product ID 0


In [50]:
fillNull(my_nike, useful_cols)
fillNull(my_adidas, useful_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike[feature] = my_nike[feature].fillna('')


In [51]:
my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)


In [52]:
my_products = pd.concat([my_nike, my_adidas], ignore_index=True)

In [11]:
my_products[my_products['Product ID'] == '554724-050']

Unnamed: 0,Product Name,Sale Price,Brand,Description,Product ID,combined_features
404,Air Jordan 1 Mid,9995,Nike,The Air Jordan 1 Mid Shoe is inspired by the f...,554724-050,Air Jordan 1 Mid 9995 Nike The Air Jordan 1 Mi...


In [25]:
cv = CountVectorizer(stop_words='english')
count_matrix = cv.fit_transform(my_products["combined_features"])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(my_products["combined_features"])
len(vectorizer.get_feature_names())

4750

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_vectorizer.fit(my_products["combined_features"])
vector_spaces = tf_vectorizer.transform(my_products["combined_features"])

In [28]:
vector_spaces.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
len(vector_spaces.toarray())

3268

In [30]:
tf_cosine_sim = cosine_similarity(vector_spaces)
tf_products = list(enumerate(tf_cosine_sim[404]))
tf_sorted_similar_products = sorted(tf_products, key=lambda x:x[1], reverse=True)

array([[1.        , 0.54875467, 0.20991891, ..., 0.01053225, 0.00228553,
        0.01513736],
       [0.54875467, 1.        , 0.23965497, ..., 0.00684998, 0.        ,
        0.        ],
       [0.20991891, 0.23965497, 1.        , ..., 0.01695567, 0.00190687,
        0.00311748],
       ...,
       [0.01053225, 0.00684998, 0.01695567, ..., 1.        , 0.04749854,
        0.06568628],
       [0.00228553, 0.        , 0.00190687, ..., 0.04749854, 1.        ,
        0.03572508],
       [0.01513736, 0.        , 0.00311748, ..., 0.06568628, 0.03572508,
        1.        ]])

In [31]:
tf_sorted_similar_products[1:11]

[(404, 1.0000000000000002),
 (39, 0.49897003709323373),
 (208, 0.4318161541951472),
 (499, 0.4318161541951472),
 (257, 0.4281775005907048),
 (125, 0.4203462924630129),
 (189, 0.3888941047211936),
 (492, 0.3888941047211936),
 (322, 0.38702470367500424),
 (561, 0.38702470367500424)]

In [32]:
cosine_sim = cosine_similarity(count_matrix)

In [33]:
len(cosine_sim)

3268

array([[1.        , 0.63305416, 0.40657856, ..., 0.04767313, 0.02635231,
        0.02076137],
       [0.63305416, 1.        , 0.42276002, ..., 0.02624319, 0.        ,
        0.        ],
       [0.40657856, 0.42276002, 1.        , ..., 0.06460957, 0.02380952,
        0.01875806],
       ...,
       [0.04767313, 0.02624319, 0.06460957, ..., 1.        , 0.25125945,
        0.29692784],
       [0.02635231, 0.        , 0.02380952, ..., 0.25125945, 1.        ,
        0.21884405],
       [0.02076137, 0.        , 0.01875806, ..., 0.29692784, 0.21884405,
        1.        ]])

In [41]:
similar_shoes = list(enumerate(cosine_sim[404]))

In [42]:
sorted_similar_shoes = sorted(similar_shoes, key=lambda x:x[1], reverse=True)

In [43]:
sorted_similar_shoes[0:9]

[(43, 0.9999999999999998),
 (404, 0.9999999999999998),
 (39, 0.5443310539518172),
 (208, 0.5352643613280605),
 (499, 0.5352643613280605),
 (257, 0.513239353660981),
 (463, 0.5055250296034367),
 (125, 0.5034965460952285),
 (551, 0.47140452079103157)]

In [22]:
for a in sorted_similar_shoes[0:9]:
    print(a[0])
    print(my_products.iloc[a[0]]["Product ID"])

2
CI3482-200
516
AR5339-002
35
CJ1642-002
172
CQ0492-001
416
CD4366-002
139
CT1020-001
438
AV4417-002
496
CQ6566-700
389
BQ3611-100


In [24]:
cosine_sim

NameError: name 'cosine_sim' is not defined

In [23]:
def produce_rank():
    res = []
    for csl in cosine_sim:
        single = []
        ssl = list(enumerate(csl))
        sss = sorted(ssl, key=lambda x:x[1], reverse=True)
        for a in sss[0:11]:
            single.append(my_products.iloc[a[0]]["Product ID"])
        res.append(single)
    return res

In [24]:
lst = produce_rank()
lst
# ans = pd.DataFrame(lst, columns =["productId", "s_1", "s_2", "s_3", "s_4", "s_5", "s_6", "s_7", "s_8"])

[['CJ1646-600',
  'CT4328-101',
  'CQ0492-001',
  'AO2132-401',
  '315115-112',
  '315123-111',
  '366731-100',
  'AA0287-002',
  'CJ1379-001',
  'CD0888-002',
  'CI3446-001'],
 ['CT4328-101',
  'CQ0492-001',
  'CJ1379-001',
  '315123-111',
  '366731-100',
  'AA0287-002',
  '315115-112',
  'CD0887-201',
  'CD0888-002',
  'CJ1646-600',
  'CI3446-001'],
 ['CI3482-200',
  'AR5339-002',
  'CJ1642-002',
  'CQ0492-001',
  'CD4366-002',
  'CT1020-001',
  'AV4417-002',
  'CQ6566-700',
  'BQ3611-100',
  'CT2584-100',
  'CK4126-001'],
 ['CD0479-200',
  'CI3898-200',
  'CJ0636-100',
  'BQ9665-301',
  'AQ4312-107',
  'CI1214-004',
  'AR7410-603',
  'CD6615-100',
  'CQ2503-900',
  'AH6789-023',
  'CI3709-001'],
 ['CZ6156-101',
  'CI9842-500',
  'CK7200-800',
  'AQ0927-100',
  'CI3868-001',
  'AO2924-401',
  'CI3870-100',
  'AR9293-603',
  'AH6789-023',
  'CI3709-001',
  'CD0132-001'],
 ['CJ6314-146',
  'CJ0861-017',
  'CJ0861-010',
  'CD8526-446',
  'AQ8306-407',
  'AQ7489-008',
  'AO2982-076',
  '

In [25]:
ans = pd.DataFrame(lst, columns =["productId", "s_1", "s_2", "s_3", "s_4", "s_5", "s_6", "s_7", "s_8", "s_9", "s_10"])

In [26]:
ans

Unnamed: 0,productId,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10
0,CJ1646-600,CT4328-101,CQ0492-001,AO2132-401,315115-112,315123-111,366731-100,AA0287-002,CJ1379-001,CD0888-002,CI3446-001
1,CT4328-101,CQ0492-001,CJ1379-001,315123-111,366731-100,AA0287-002,315115-112,CD0887-201,CD0888-002,CJ1646-600,CI3446-001
2,CI3482-200,AR5339-002,CJ1642-002,CQ0492-001,CD4366-002,CT1020-001,AV4417-002,CQ6566-700,BQ3611-100,CT2584-100,CK4126-001
3,CD0479-200,CI3898-200,CJ0636-100,BQ9665-301,AQ4312-107,CI1214-004,AR7410-603,CD6615-100,CQ2503-900,AH6789-023,CI3709-001
4,CZ6156-101,CI9842-500,CK7200-800,AQ0927-100,CI3868-001,AO2924-401,CI3870-100,AR9293-603,AH6789-023,CI3709-001,CD0132-001
...,...,...,...,...,...,...,...,...,...,...,...
3263,EF3503,EF3504,EF3505,CL7439,CK9427,CL7437,CL7438,EG0963,CL7511,CK9688,CK9686
3264,EF3505,EF3509,EF3504,CL7439,CK9427,CL7415,CL7413,CL7411,CL7437,CL7438,EG3493
3265,CM6007,CM6008,EF3509,CM0009,CM0010,CM0008,CJ0183,CK9785,CM0011,CK9787,CK1083
3266,BC0980,CM7531,EH1333,CK1088,CJ0180,CJ0177,CK1086,F99913,BC0973,EG6208,CL9975


In [27]:
ans.to_csv("10_most_recommend.csv", index=False)

In [37]:
my_products.iloc[2]

Product Name                              Nike Air Force 1 Sage Low LX
Sale Price                                                        9995
Brand                                                             Nike
Description          Taking both height and craft to new levels, th...
Product ID                                                  CI3482-200
combined_features    Nike Air Force 1 Sage Low LX 9995 Nike Taking ...
Name: 2, dtype: object

In [38]:
my_products.iloc[516]

Product Name                                 Nike Air Force 1 Sage Low
Sale Price                                                        8995
Brand                                                             Nike
Description          Taking both height and craft to new levels, th...
Product ID                                                  AR5339-002
combined_features    Nike Air Force 1 Sage Low 8995 Nike Taking bot...
Name: 516, dtype: object

In [31]:
my_products.iloc[36]

Product Name                                  Nike Air Force 1 '07 LV8
Sale Price                                                        7595
Brand                                                             Nike
Description          The Nike Air Force 1 '07 LV8 incorporates the ...
Product ID                                                  CD0888-002
combined_features    Nike Air Force 1 '07 LV8 7595 Nike The Nike Ai...
Name: 36, dtype: object

In [32]:
my_products.iloc[0]

Product Name                            Nike Air Force 1 '07 Essential
Sale Price                                                        7495
Brand                                                             Nike
Description          Let your shoe game shimmer in the Nike Air For...
Product ID                                                  CJ1646-600
combined_features    Nike Air Force 1 '07 Essential 7495 Nike Let y...
Name: 0, dtype: object

In [33]:
my_products.iloc[257]

Product Name                                            Jordan Max 200
Sale Price                                                       10495
Brand                                                             Nike
Description          With design elements inspired by the Air Jorda...
Product ID                                                  CD6105-007
combined_features    Jordan Max 200 10495 Nike With design elements...
Name: 257, dtype: object