In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from scipy import stats
from matplotlib.pyplot import figure
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

warnings.filterwarnings("ignore")
%matplotlib inline  

## Content-Based Recommendations

We try to relate articles to one another based on their category, garment group and description, and build a similarity matrix that maps out how alike they are. This is a non-personalized recommendation, but understanding which products (articles, in this case) are similar is a necessary step towards building a model.

In [2]:
path = "../files/"
# Columns that must be parsed as integers when reading the CSV.
integer_columns = [
    "article_id",
    "product_code",
    "product_type_no",
    "graphical_appearance_no",
    "index_group_no",
    "section_no",
    "garment_group_no",
]
# Read the articles and discard every row that has a missing value in any column.
articles_df = pd.read_csv(
    path + "articles.csv",
    dtype=dict.fromkeys(integer_columns, int),
).dropna()
articles_df.shape

(105126, 25)

In [4]:
# Number of articles in each index group, largest group first.
articles_df['index_group_name'].value_counts()

Ladieswear       39523
Baby/Children    34619
Divided          15086
Menswear         12539
Sport             3359
Name: index_group_name, dtype: int64

## Similarities within Groups
Because the dataset is large and computing pairwise similarities is a very costly operation, let's compute the similarities
within each group of the dataset separately, using cosine similarity.

In [5]:
# Index groups ordered by article count (largest first). Later cells pair
# similarity_sparse_matrix[i] with Groups[i], so this order matters.
Groups = articles_df['index_group_name'].value_counts().index
# Text columns joined (space separated) into one "document" per article.
content_columns = [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'department_name',
    'section_name',
    'garment_group_name',
]
# NOTE: despite the name, this holds one dense (n_articles x n_articles)
# cosine-similarity array per group, in the same order as Groups.
similarity_sparse_matrix = []
# Iterate over the H&M index groups
for group in Groups:
    # Fetch the articles belonging to the group; .copy() so that adding a
    # column below does not write into a slice of articles_df
    df = articles_df[articles_df['index_group_name'] == group].copy()
    # Concatenate the content for each article, using only this group's rows
    df['content_for_analyis'] = df[content_columns[0]].str.cat(df[content_columns[1:]], sep=' ')
    # Build the DataFrame for the content-based similarity index
    item_keywords_df = df[['article_id', 'content_for_analyis']]
    # TF-IDF vectorization; "english" selects scikit-learn's built-in
    # English stop-word list
    vectorizer = TfidfVectorizer(min_df=3, max_df=0.7, stop_words="english")
    # Fit the vocabulary and compute the TF-IDF weights
    X = vectorizer.fit_transform(item_keywords_df['content_for_analyis'])
    # Terms kept in the vocabulary (one per TF-IDF column)
    vectorized_data = vectorizer.get_feature_names_out()
    # Print
    print(group)
    print("-----")
    print("FeatureSet")
    print(vectorized_data)
    # Build the TF-IDF DataFrame: one row per article, one column per term.
    # get_feature_names() no longer exists in current scikit-learn, so the
    # names fetched above are reused here.
    tfidf_df = pd.DataFrame(X.toarray(), columns=vectorized_data)
    tfidf_df.index = df['article_id']
    # Compute the pairwise cosine similarities between the group's articles
    cosine_similarity_array = cosine_similarity(tfidf_df)
    # Append it to the top-level list of per-group similarity arrays
    similarity_sparse_matrix.append(cosine_similarity_array)
    

Ladieswear
-----
FeatureSet
['3d' 'accessories' 'alice' 'application' 'asia' 'assortment' 'bag' 'bags'
 'ballerinas' 'band' 'basic' 'basics' 'beachwear' 'beanie' 'beige' 'belt'
 'belts' 'big' 'bikini' 'black' 'blazer' 'blazers' 'blocking' 'blouse'
 'blouses' 'blue' 'body' 'bodysuit' 'bootie' 'boots' 'bottoms' 'bra'
 'bracelet' 'brim' 'bronze' 'brown' 'campaigns' 'cap' 'cardigan' 'care'
 'case' 'casual' 'chambray' 'check' 'clean' 'clip' 'coat' 'collaborations'
 'collection' 'collections' 'colour' 'conscious' 'contrast' 'copper'
 'corset' 'covers' 'dark' 'denim' 'divided' 'dog' 'dot' 'dress' 'dressed'
 'dresses' 'dungarees' 'earring' 'earrings' 'edition' 'embroidery' 'eq'
 'equatorial' 'everyday' 'exclusive' 'expressive' 'extended' 'external'
 'fancy' 'flat' 'flats' 'flip' 'flop' 'functional' 'furniture' 'garment'
 'glittering' 'gloves' 'gold' 'gown' 'green' 'greenish' 'grey' 'greyish'
 'hair' 'hat' 'hats' 'head' 'heeled' 'heels' 'hoodie' 'inactive' 'items'
 'jacket' 'jackets' 'jacquard'

In [6]:
# (rows, columns) of the first group's similarity array.
similarity_sparse_matrix[0].shape

(39523, 39523)

In [7]:
# Similarity between products - labelled table for the "Ladieswear" group
group = 'Ladieswear'
# Look the matrix up by group name rather than assuming it sits at position 0;
# similarity_sparse_matrix is ordered like Groups.
group_position = Groups.get_loc(group)
# Article ids of the group, in the row order used to build the matrix.
ladieswear_article_ids = articles_df.loc[articles_df['index_group_name'] == group, 'article_id'].values
Ladieswear_df = pd.DataFrame(data=similarity_sparse_matrix[group_position],
                             index=ladieswear_article_ids,
                             columns=ladieswear_article_ids)
Ladieswear_df

Unnamed: 0,108775015,108775044,108775051,110065001,110065002,110065011,111565001,111565003,111586001,111593001,...,949143001,949198001,949323002,949594001,952267001,952937003,952938001,953763001,956217002,959461001
108775015,1.000000,0.953124,0.897246,0.056032,0.016275,0.015723,0.137086,0.122848,0.188705,0.137086,...,0.112254,0.853226,0.122585,0.294175,0.128689,0.277282,0.322724,0.605449,0.375911,0.314123
108775044,0.953124,1.000000,0.942233,0.016313,0.079269,0.015545,0.117260,0.121460,0.163706,0.117260,...,0.072736,0.807360,0.079430,0.290852,0.109054,0.274150,0.319079,0.546502,0.318456,0.395606
108775051,0.897246,0.942233,1.000000,0.000000,0.060415,0.000000,0.104837,0.108593,0.147420,0.104837,...,0.054290,0.757597,0.059286,0.258739,0.104285,0.262162,0.305126,0.501808,0.283294,0.357597
110065001,0.056032,0.016313,0.000000,1.000000,0.943576,0.911566,0.102496,0.043474,0.077920,0.102496,...,0.060237,0.057052,0.065780,0.022536,0.093525,0.000000,0.000000,0.082062,0.083795,0.024064
110065002,0.016275,0.079269,0.060415,0.943576,1.000000,0.899182,0.081071,0.042883,0.051798,0.081071,...,0.017496,0.016571,0.019107,0.022229,0.072328,0.000000,0.000000,0.023836,0.024339,0.116933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952937003,0.277282,0.274150,0.262162,0.000000,0.000000,0.102482,0.000000,0.052091,0.023792,0.000000,...,0.039796,0.282333,0.188088,0.478565,0.039079,1.000000,0.921080,0.513146,0.819842,0.799552
952938001,0.322724,0.319079,0.305126,0.000000,0.000000,0.105267,0.000000,0.053507,0.024439,0.000000,...,0.081627,0.290008,0.237700,0.491573,0.040142,0.921080,1.000000,0.582607,0.727472,0.709468
953763001,0.605449,0.546502,0.501808,0.082062,0.023836,0.023027,0.038374,0.011705,0.073190,0.038374,...,0.164402,0.374841,0.179532,0.536488,0.026932,0.513146,0.582607,1.000000,0.666221,0.572866
956217002,0.375911,0.318456,0.283294,0.083795,0.024339,0.023513,0.039185,0.011952,0.074736,0.039185,...,0.125006,0.382758,0.292798,0.547819,0.027501,0.819842,0.727472,0.666221,1.000000,0.896760


In [8]:
# Wrap every group's similarity array in a DataFrame labelled by article id
# on both axes, keeping the same order as Groups.
Groups = articles_df['index_group_name'].value_counts().index
grouped_dataframes = []
for position, group in enumerate(Groups):
    # Article ids belonging to this group, used for both rows and columns
    member_ids = articles_df[articles_df['index_group_name'] == group]['article_id'].values
    labelled_similarity = pd.DataFrame(
        data=similarity_sparse_matrix[position],
        index=member_ids,
        columns=member_ids,
    )
    grouped_dataframes.append(labelled_similarity)

In [9]:
# Similarity table of the third group in Groups order ("Divided" in the recorded run).
grouped_dataframes[2]

Unnamed: 0,162074062,162074069,162074071,181160009,181448022,181448102,181448103,181448104,181448105,181448106,...,943674001,944233001,945995002,946095001,946387001,946475001,947253001,949551001,949551002,957375001
162074062,1.000000,0.834122,0.849635,0.000000,0.858904,0.849185,0.861063,0.786352,0.807063,0.816563,...,0.000000,0.000000,0.063042,0.000000,0.0,0.000000,0.000000,0.000000,0.067170,0.000000
162074069,0.834122,1.000000,0.963029,0.000000,0.940795,0.886660,0.899062,0.861327,0.884012,0.894418,...,0.000000,0.000000,0.048991,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
162074071,0.849635,0.963029,1.000000,0.000000,0.977561,0.903150,0.915783,0.877346,0.918558,0.911053,...,0.000000,0.039107,0.049902,0.000000,0.0,0.017758,0.017758,0.031879,0.000000,0.017623
181160009,0.000000,0.000000,0.000000,1.000000,0.009709,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.017395,0.019706,0.000000,0.000000,0.0,0.133770,0.133770,0.016064,0.015481,0.132750
181448022,0.858904,0.940795,0.977561,0.009709,1.000000,0.913002,0.925773,0.886916,0.928578,0.920991,...,0.021097,0.063432,0.000000,0.000000,0.0,0.028804,0.028804,0.051708,0.018775,0.028584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946475001,0.000000,0.000000,0.017758,0.133770,0.028804,0.000000,0.000000,0.000000,0.016868,0.000000,...,0.019443,0.058459,0.000000,0.000000,0.0,1.000000,1.000000,0.047654,0.017303,0.760600
947253001,0.000000,0.000000,0.017758,0.133770,0.028804,0.000000,0.000000,0.000000,0.016868,0.000000,...,0.019443,0.058459,0.000000,0.000000,0.0,1.000000,1.000000,0.047654,0.017303,0.760600
949551001,0.000000,0.000000,0.031879,0.016064,0.051708,0.000000,0.000000,0.000000,0.030281,0.000000,...,0.072611,0.147662,0.827110,0.033216,0.0,0.047654,0.047654,1.000000,0.912323,0.047291
949551002,0.067170,0.000000,0.000000,0.015481,0.018775,0.071401,0.072399,0.000000,0.000000,0.000000,...,0.069976,0.079271,0.910457,0.032011,0.0,0.017303,0.017303,0.912323,1.000000,0.017171


In [10]:
# With labelled tables, the score between two articles is a single lookup.
# Example: similarity of article 162074071 (row) to article 181448022 (column).
grouped_dataframes[2].at[162074071, 181448022]

0.9775607980545375

In [20]:
# Scratch check, unrelated to the recommendation pipeline: drop one entry
# from a list by value (first occurrence only).
arr1 = ['python3.6', 'python2', 'python3']
del arr1[arr1.index('python2')]
arr1


['python3.6', 'python3']

In [23]:
# Number of similar articles to keep for every article.
similar_count = 10
# Rows of [article_id, comma-separated ids of its most similar articles].
tbl = []
for group, similarity_df in zip(Groups, grouped_dataframes):
    for item in similarity_df.index:
        # Ask for one extra candidate: the article itself normally ranks first
        # (similarity 1.0 with itself) and is removed below.
        candidates = similarity_df[item].nlargest(n=similar_count + 1).index
        # Drop the article itself, then cap at similar_count. The cap matters
        # when exact duplicates push the article out of the candidate list,
        # which would otherwise leave one neighbour too many.
        neighbours = candidates[candidates != item][:similar_count]
        tbl.append([item, ",".join(str(v) for v in neighbours)])
    # Temporary Status Update
    print('Processing Complete --- ', group)

article_similarity_df = pd.DataFrame(data=tbl, columns=['article_id', 'similar_articles'])
article_similarity_df


Processing Complete ---  Ladieswear
Processing Complete ---  Baby/Children
Processing Complete ---  Divided
Processing Complete ---  Menswear
Processing Complete ---  Sport


Unnamed: 0,article_id,similar_articles
0,108775015,"218354001,372008001,494030013,538699001,565379..."
1,108775044,"218354021,355307042,372008010,494030010,494030..."
2,108775051,"565379021,218354045,108775044,218354021,355307..."
3,110065001,"153115019,253448001,383152002,497513001,562914..."
4,110065002,"153115021,253448002,508318008,679285008,699570..."
...,...,...
105121,942053001,"769667001,931349001,769683001,769684001,872686..."
105122,942062001,"769667001,942053001,931349001,769683001,769684..."
105123,942128001,"506410009,506411010,506410006,506411006,706204..."
105124,942987001,"805928002,879588001,880159001,498738034,645023..."


In [22]:
# Save the article -> similar-articles table as CSV in the data directory.
output_file = path + 'article-content-similarity.csv'
article_similarity_df.to_csv(output_file)