In [1]:
%%capture
!python3 -m pip install -r requirements.txt

In [2]:
import concurrent.futures
import re
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk import stem
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data Gathering and Indexing

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4.1 Safari/605.1.15"
}

In [4]:
def find_pdp_sitemaps(entry_sitemap):
    response = requests.get(entry_sitemap, headers=headers)

    if response.status_code != 200:
        print("Start sitemap response not 200!")
        return []

    root = ET.fromstring(response.text)
    tag = "{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    pdp_identifier = "sitemap-pdp"
    return [child.text for child in root.iter(tag) if pdp_identifier in child.text]

In [5]:
entry_sitemap = "https://www.jcrew.com/sitemap-wex/sitemap-index.xml"
pdp_sitemaps = find_pdp_sitemaps(entry_sitemap)
print(f"{len(pdp_sitemaps)=}")
pdp_sitemaps

len(pdp_sitemaps)=15


['https://www.jcrew.com/sitemap-wex/sitemap-pdp1.xml',
 'https://www.jcrew.com/sitemap-wex/sitemap-pdp2.xml',
 'https://www.jcrew.com/sitemap-wex/sitemap-pdp3.xml',
 'https://www.jcrew.com/sitemap-wex/sitemap-pdp4.xml',
 'https://www.jcrew.com/sitemap-wex/sitemap-pdp5.xml',
 'https://www.jcrew.com/sitemap-wex/au/sitemap-pdp1.xml',
 'https://www.jcrew.com/sitemap-wex/au/sitemap-pdp2.xml',
 'https://www.jcrew.com/sitemap-wex/au/sitemap-pdp3.xml',
 'https://www.jcrew.com/sitemap-wex/au/sitemap-pdp4.xml',
 'https://www.jcrew.com/sitemap-wex/au/sitemap-pdp5.xml',
 'https://www.jcrew.com/sitemap-wex/ca/sitemap-pdp1.xml',
 'https://www.jcrew.com/sitemap-wex/ca/sitemap-pdp2.xml',
 'https://www.jcrew.com/sitemap-wex/ca/sitemap-pdp3.xml',
 'https://www.jcrew.com/sitemap-wex/ca/sitemap-pdp4.xml',
 'https://www.jcrew.com/sitemap-wex/ca/sitemap-pdp5.xml']

In [6]:
def get_response(pdp_sitemap, session=None):
    try:
        if session:
            response = session.get(pdp_sitemap, headers=headers)
        else:
            response = requests.get(pdp_sitemap, headers=headers)
    except requests.RequestException:
        return None

    if response.status_code == 200:
        return response
    else:
        return None
    

def get_product_urls(pdp_sitemaps):
    product_urls = []
    
    num_workers = min(len(pdp_sitemaps), 8)
    with (
        requests.Session() as session,
        concurrent.futures.ThreadPoolExecutor(num_workers) as executor
    ):
        futures = [executor.submit(get_response, pdp_sitemap, session) for pdp_sitemap in pdp_sitemaps]
        for future in concurrent.futures.as_completed(futures):
            response = future.result()

            if not response: continue
                
            try:
                root = ET.fromstring(response.text)
                tag = "{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
                product_urls.extend(child.text for child in root.iter(tag))
            except ET.ParseError:
                continue
        
    return product_urls

In [7]:
product_urls_file = Path() / "data" / "product_urls.csv"

if product_urls_file.exists():
    urls = pd.read_csv(product_urls_file)
else:
    product_urls = get_product_urls(pdp_sitemaps)
    urls = pd.DataFrame(product_urls, columns=["url"])
    urls.to_csv(product_urls_file, index=False)
    
print(f"{urls.shape=}")
urls.sample(10)

urls.shape=(63236, 1)


Unnamed: 0,url
22726,https://www.jcrew.com/p/womens/categories/clot...
41609,https://www.jcrew.com/p/womens/categories/acce...
9276,https://www.jcrew.com/au/p/mens/categories/clo...
46121,https://www.jcrew.com/ca/p/womens/categories/c...
44367,https://www.jcrew.com/ca/p/girls/categories/cl...
28513,https://www.jcrew.com/p/womens/categories/clot...
46327,https://www.jcrew.com/ca/p/womens/categories/c...
2055,https://www.jcrew.com/au/p/womens/categories/a...
11208,https://www.jcrew.com/au/p/womens/categories/c...
24733,https://www.jcrew.com/p/boys/categories/clothi...


In [8]:
identifiers = urls["url"].str.extract(r".*(?P<path>/p/.+)$")
identifiers

Unnamed: 0,path
0,/p/AW585
1,/p/BF402
2,/p/BK252
3,/p/BM989
4,/p/BN112
...,...
63231,/p/womens/categories/shoes/sneakers/saturday-s...
63232,/p/womens/categories/shoes/sneakers/saturday-s...
63233,/p/womens/categories/shoes/sneakers/tretorn-ra...
63234,/p/womens/categories/shoes/winter-boots/perfec...


In [9]:
urls = urls.groupby(identifiers["path"]).agg(tuple).reset_index(drop=True)
urls

Unnamed: 0,url
0,"(https://www.jcrew.com/au/p/AQ203, https://www..."
1,"(https://www.jcrew.com/au/p/AQ389, https://www..."
2,"(https://www.jcrew.com/au/p/AS770, https://www..."
3,"(https://www.jcrew.com/au/p/AW585, https://www..."
4,"(https://www.jcrew.com/p/BB116,)"
...,...
24341,(https://www.jcrew.com/p/womens/features/brand...
24342,(https://www.jcrew.com/au/p/womens/features/br...
24343,(https://www.jcrew.com/au/p/womens/features/br...
24344,(https://www.jcrew.com/p/womens/features/brand...


In [10]:
def get_first_response(urls, session=None):
    for url in urls:
        try:
            if session:
                response = session.get(url, headers=headers)
            else:
                response = requests.get(url, headers=headers)
        except request.RequestException:
            continue
            
        if response.status_code == 200:
            return response
        else:
            print(url, response.status_code)
            
    return None


def parse(text):
    soup = BeautifulSoup(text, "html.parser")
    try:
        return soup.find("div", "ProductDetailPage__right___PRY1d").get_text(" ")
    except AttributeError:
        return ""


def get_product_details(urls):
    index_start = urls.index[0]
    urls = urls["url"].to_list()
    
    product_details = []
    
    num_workers = min(len(urls), 8)
    with (
        requests.Session() as session,
        concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor
    ):
        futures = {executor.submit(get_first_response, urls, session) : url_index
                   for url_index, urls in enumerate(urls, start=index_start)}
        for future in concurrent.futures.as_completed(futures):
            url_index = futures[future]
            response = future.result()
            
            if not response:
                product_details.append(["", str(url_index)])
                continue
            
            product_details.append([parse(response.text), str(url_index)])
            
    return product_details

In [11]:
def get_product_details_file(i):
    return Path() / "data" / f"product_details_{i}.csv"

num_partitions = 10
product_details_files = [get_product_details_file(i) for i in range(num_partitions)]
partitions = np.array_split(urls, num_partitions)

for file, partition in zip(product_details_files, partitions):
    if not file.exists():
        product_details = get_product_details(partition)
        details = pd.DataFrame(product_details, columns=["details", "url index"])
        details.to_csv(file, index=False)
        del details

## Data Cleaning and Preprocessing

In [12]:
def load_data(product_details_files):
    df = pd.concat([pd.read_csv(file) for file in product_details_files], ignore_index=True)
    return (df.dropna()
            .sort_values(by="url index")
            .set_index("url index"))

In [13]:
corpus = load_data(product_details_files)
corpus

Unnamed: 0_level_0,details
url index,Unnamed: 1_level_1
0,Essential upstate graphic T-shirt Item AQ203 A...
1,1994 long-sleeve T-shirt in stripe Item AQ389 ...
2,Corgi™ X J.Crew critter socks Item AS770 All s...
3,Peanuts® X J.Crew critter socks Item AW585 All...
4,Grundens® X J.Crew slub cotton graphic T-shirt...
...,...
24341,ISA TAPIA™ Te Amo slides Item L9266 Quantity :...
24342,New Balance ® for J.Crew high waisted performa...
24343,New Balance® for J.Crew Trinamic leggings in s...
24344,Summersalt® classic turn down bikini bottom in...


In [14]:
def preprocess(df):
    return (df.iloc[:, 0]
            .str
            .replace(r"\w*\d\w*", "", regex=True)   # Remove words containing numbers.
            .replace(r"[^a-zA-Z ]", "", regex=True) # Remove special characters.
            .to_frame())

In [15]:
corpus = preprocess(corpus)
corpus

Unnamed: 0_level_0,details
url index,Unnamed: 1_level_1
0,Essential upstate graphic Tshirt Item All siz...
1,longsleeve Tshirt in stripe Item All sizes a...
2,Corgi X JCrew critter socks Item All sizes ar...
3,Peanuts X JCrew critter socks Item All sizes ...
4,Grundens X JCrew slub cotton graphic Tshirt It...
...,...
24341,ISA TAPIA Te Amo slides Item Quantity ...
24342,New Balance for JCrew high waisted performanc...
24343,New Balance for JCrew Trinamic leggings in str...
24344,Summersalt classic turn down bikini bottom in ...


In [16]:
def tokenize_and_stem(df):
    ps = stem.PorterStemmer()
    fn = lambda txt: " ".join(ps.stem(token) for token in tokenize.word_tokenize(txt))
    return df.iloc[:, 0].apply(fn).to_frame()

In [17]:
corpus = tokenize_and_stem(corpus) # Takes a while...
corpus

Unnamed: 0_level_0,details
url index,Unnamed: 1_level_1
0,essenti upstat graphic tshirt item all size ar...
1,longsleev tshirt in stripe item all size are u...
2,corgi x jcrew critter sock item all size are u...
3,peanut x jcrew critter sock item all size are ...
4,grunden x jcrew slub cotton graphic tshirt ite...
...,...
24341,isa tapia te amo slide item quantiti ship to h...
24342,new balanc for jcrew high waist perform leg in...
24343,new balanc for jcrew trinam leg in stripe colo...
24344,summersalt classic turn down bikini bottom in ...


In [18]:
# Expand urls into multiple columns.
urls = pd.DataFrame(urls["url"].values.tolist(), urls.index).fillna(value=np.nan)
urls

Unnamed: 0,0,1,2
0,https://www.jcrew.com/au/p/AQ203,https://www.jcrew.com/p/AQ203,https://www.jcrew.com/ca/p/AQ203
1,https://www.jcrew.com/au/p/AQ389,https://www.jcrew.com/p/AQ389,https://www.jcrew.com/ca/p/AQ389
2,https://www.jcrew.com/au/p/AS770,https://www.jcrew.com/p/AS770,https://www.jcrew.com/ca/p/AS770
3,https://www.jcrew.com/au/p/AW585,https://www.jcrew.com/p/AW585,https://www.jcrew.com/ca/p/AW585
4,https://www.jcrew.com/p/BB116,,
...,...,...,...
24341,https://www.jcrew.com/p/womens/features/brands...,,
24342,https://www.jcrew.com/au/p/womens/features/bra...,https://www.jcrew.com/p/womens/features/brands...,https://www.jcrew.com/ca/p/womens/features/bra...
24343,https://www.jcrew.com/au/p/womens/features/bra...,https://www.jcrew.com/p/womens/features/brands...,https://www.jcrew.com/ca/p/womens/features/bra...
24344,https://www.jcrew.com/p/womens/features/brands...,,


## Document Retrieval

In [19]:
def build_model(corpus):
    vectorizer = TfidfVectorizer(stop_words="english")
    doc_term_matrix = vectorizer.fit_transform(corpus["details"].to_list())
    return vectorizer, doc_term_matrix


def build_query_handler(urls, corpus, vectorizer, doc_term_matrix):
    
    def preprocess_query(query):
        query = re.sub(r"\w*\d\w*", "", query)
        query = re.sub(r"[^a-zA-Z ]", "", query)
        ps = stem.PorterStemmer()
        return " ".join(ps.stem(token) for token in tokenize.word_tokenize(query))
    
    def output_query_result(query, doc_indices):
        top_documents = corpus.iloc[doc_indices]
        
        # Find the url indices corresponding to the top documents.
        top_urls = urls.iloc[top_documents.index]
        
        output = [f"Showing top results for {query=}"]
        for i, (_, row) in enumerate(top_urls.iterrows(), start=1):
            output.append(f"{i}.")
            for entry in row:
                if isinstance(entry, str):
                    output.append(entry)
        return "\n".join(output)
    
    def handle_query(query, k=5):
        query_vec = vectorizer.transform([preprocess_query(query)])
        
        similarity_vec = cosine_similarity(query_vec, doc_term_matrix).flatten()
        
        max_k_indices_unordered = np.argpartition(similarity_vec, -k)[-k:]
        max_k_indices = max_k_indices_unordered[np.argsort(similarity_vec[max_k_indices_unordered])]
        
        return output_query_result(query, max_k_indices)
    
    return handle_query

In [20]:
vectorizer, doc_term_matrix = build_model(corpus)
query_handler = build_query_handler(urls, corpus, vectorizer, doc_term_matrix)

## Queries

In [21]:
print(query_handler("black hoodie"))

Showing top results for query='black hoodie'
1.
https://www.jcrew.com/au/p/mens/categories/clothing/sweatshirts-and-sweatpants/french-terry/garment-dyed-french-terry-hoodie/H4576
https://www.jcrew.com/p/mens/categories/clothing/sweatshirts-and-sweatpants/french-terry/garment-dyed-french-terry-hoodie/H4576
https://www.jcrew.com/ca/p/mens/categories/clothing/sweatshirts-and-sweatpants/french-terry/garment-dyed-french-terry-hoodie/H4576
2.
https://www.jcrew.com/au/p/boys/categories/clothing/sweats/sweatshirts/boys-t-shirt-hoodie-in-stripe/AX803
https://www.jcrew.com/p/boys/categories/clothing/sweats/sweatshirts/boys-t-shirt-hoodie-in-stripe/AX803
https://www.jcrew.com/ca/p/boys/categories/clothing/sweats/sweatshirts/boys-t-shirt-hoodie-in-stripe/AX803
3.
https://www.jcrew.com/au/p/womens/categories/clothing/sweatshirts-and-sweatpants/pullovers/hoodie-sweatshirt-in-camo/K1728
https://www.jcrew.com/p/womens/categories/clothing/sweatshirts-and-sweatpants/pullovers/hoodie-sweatshirt-in-camo/K

In [22]:
print(query_handler("comfy socks"))

Showing top results for query='comfy socks'
1.
https://www.jcrew.com/au/p/boys/categories/accessories/socks/boysapos-ankle-socks-three-pack/BO644
https://www.jcrew.com/p/boys/categories/accessories/socks/boysapos-ankle-socks-three-pack/BO644
https://www.jcrew.com/ca/p/boys/categories/accessories/socks/boysapos-ankle-socks-three-pack/BO644
2.
https://www.jcrew.com/au/p/boys/categories/accessories/accessories-mask/boys-three-pack-of-athletic-socks/BE690
https://www.jcrew.com/p/boys/categories/accessories/accessories-mask/boys-three-pack-of-athletic-socks/BE690
https://www.jcrew.com/ca/p/boys/categories/accessories/accessories-mask/boys-three-pack-of-athletic-socks/BE690
3.
https://www.jcrew.com/au/p/boys/categories/accessories/socks/kids-colorblock-trouser-socks-four-pack/BC772
https://www.jcrew.com/p/boys/categories/accessories/socks/kids-colorblock-trouser-socks-four-pack/BC772
https://www.jcrew.com/ca/p/boys/categories/accessories/socks/kids-colorblock-trouser-socks-four-pack/BC772
4.

In [23]:
print(query_handler("girls sweaters"))

Showing top results for query='girls sweaters'
1.
https://www.jcrew.com/au/p/girls/categories/clothing/sweaters/popovers/girls-ruffle-detail-sweater-in-colorful-stripe/BE571
https://www.jcrew.com/p/girls/categories/clothing/sweaters/popovers/girls-ruffle-detail-sweater-in-colorful-stripe/BE571
https://www.jcrew.com/ca/p/girls/categories/clothing/sweaters/popovers/girls-ruffle-detail-sweater-in-colorful-stripe/BE571
2.
https://www.jcrew.com/au/p/girls/categories/clothing/sweaters/popovers/girls-cotton-sweater-with-embroidered-hearts/BE512
https://www.jcrew.com/p/girls/categories/clothing/sweaters/popovers/girls-cotton-sweater-with-embroidered-hearts/BE512
https://www.jcrew.com/ca/p/girls/categories/clothing/sweaters/popovers/girls-cotton-sweater-with-embroidered-hearts/BE512
3.
https://www.jcrew.com/au/p/girls/categories/clothing/sweaters/popovers/girls-ribbed-turtleneck-sweater-in-stripes/AC184
https://www.jcrew.com/p/girls/categories/clothing/sweaters/popovers/girls-ribbed-turtleneck-

## Export Model

In [24]:
import pickle

model_path = Path() / "model"

urls_file = model_path / "urls.csv"
corpus_file = model_path / "corpus.csv"
model_file = model_path / "model.pkl"

urls.to_csv(urls_file, index=False)

corpus.to_csv(corpus_file) # index should be  "url index" column

model = (vectorizer, doc_term_matrix)
with open(model_file, "wb") as f:
    pickle.dump(model, f)