In [1]:
import pandas as pd
import numpy as np
import snowflake.connector
from collections import defaultdict
import json
import re
import requests

In [2]:
# Log in to the Snowflake database and open the cursor used by the query cells.
# The 'externalbrowser' authenticator completes the login in a browser window.
snowflake_params = {
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_CRAWLER',
    'role': 'GENERAL_RO',
}
con = snowflake.connector.connect(**snowflake_params)

cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [None]:
# Load the crawled product variants whose brand matches '%journelle%' into pvdf.
# FINAL_PRODUCT_MATCHES is flattened, so a variant yields one row per element of
# that field; MATCHED_CI_ID is the element's productIds with brackets/quotes trimmed.
# Note that DISTINCT applies to the whole selected row, not to PVID alone.
sql = '''
select 
distinct (PVID) as PVID,
brand,
_ID,
unique_id,
scraped_attributes :sku sku,
scraped_attributes :item_group_id item_group_id,
scraped_attributes :link link,
scraped_attributes :title title,
scraped_attributes :display_color display_color,
scraped_attributes :size size,
scraped_attributes :options options,
trim (g.value: productIds:: string ,'[]""') AS MATCHED_CI_ID,
--trim (FINAL_PRODUCT_MATCHES: productIds:: string ,'[]""') AS MATCHED_CI_ID,
LAST_CRAWLED_DATE,
status,
FINAL_PRODUCT_MATCHES
from
  PUBLIC.PRODUCTVARIANTS,
  lateral flatten(input => FINAL_PRODUCT_MATCHES) g
where
   brand like '%journelle%'
'''

# execute() hands back the cursor itself, so the fetch is chained onto it.
pvdf = cs.execute(sql).fetch_pandas_all()


In [None]:
# Load the customer inputs (CIs) that still need a match into cidf.
# The `state` CTE keeps customer-sent rows of catalog 245276287025026 whose state in
# variant_stats_latest_matching is one of the three 'unmatched_*' values; those rows
# are joined to PUBLIC.CUSTOMERINPUTS on catalog id and _id for customer 'flamingo'.
sql = '''
with state as (
  select
    _id,
    CATALOG_ID,
    state
  from
    scale_prod.view.variant_stats_latest_matching
  where
    source_input = 'sent_by_customer'
    and state in (
      'unmatched_no_data_found',
      'unmatched_matching_issue',
      'unmatched_other_reasons'
    )
    and catalog_id in ('245276287025026')
)    
select
  ci.raw_site_url,
  state.catalog_id,
  ci.customer,
  state.state,
  ci._id,
  product_id,
  attribute_data:gtin gtin,
  attribute_data:mfr_part_no mfr_part_no,
  attribute_data:retailer_item_id retailer_item_id, 
  raw_url link,
  attribute_data:name name,
  attribute_data:color :: string color,
  attribute_data:size :: string size,
  attribute_data:gender gender,
  attribute_data:subvertical_attributes sab,
  attribute_data:custom_data :: string cd,
  ci.updated_at

  
from
  state
  join PUBLIC.CUSTOMERINPUTS ci on state.catalog_id = ci.catalog_id
  --join PUBLIC.SITECRAWLS sc on ci.raw_site_url = sc.SITE_URL
  and ci._id = state._id
  and ci.customer = 'flamingo'
--and ci.tags like '%priority-highimp-2022%'
order by catalog_id,name,color,size
'''

# execute() hands back the cursor itself, so the fetch is chained onto it.
cidf = cs.execute(sql).fetch_pandas_all()

In [None]:
# Size of the matching problem: number of customer inputs vs. crawled variants.
cicount = cidf.shape[0]
pvcount = pvdf.shape[0]
print(cicount, 'customer inputs need to be matched with', pvcount, 'crawled variants')

In [None]:
# Clean both datasets before matching:
#   * strip the leading/trailing double quotes from the text columns listed below;
#   * derive VARIANTNUM, the variant id carried by each LINK, as the join key.

CI_QUOTED_COLUMNS = ['RETAILER_ITEM_ID', 'LINK', 'NAME', 'GENDER', 'PRODUCT_ID']
PV_QUOTED_COLUMNS = ['SKU', 'ITEM_GROUP_ID', 'LINK', 'TITLE', 'DISPLAY_COLOR', 'SIZE']

# Links are expected to look like '...?variant=32078010351705'.
VARIANT_ID_PATTERN = r'[?&]variant=(\d+)'
# Fallback width used when a link carries no 'variant=' query parameter.
VARIANT_SUFFIX_LENGTH = 14


def strip_quotes(frame, columns):
    """Strip leading/trailing double quotes from `columns` of `frame`, in place.

    Null values are skipped (na_action='ignore') and stay null.
    """
    for column in columns:
        frame[column] = frame[column].map(lambda value: value.strip('"'), na_action='ignore')


def variant_number(links):
    """Return the variant id of each link in the Series `links`.

    The id is read from the 'variant=<digits>' query parameter, so ids of any
    length and links with trailing parameters are handled. Links without that
    parameter fall back to their last VARIANT_SUFFIX_LENGTH characters, which
    is what the previous fixed-width slice produced. Null links stay null.
    """
    from_query = links.str.extract(VARIANT_ID_PATTERN, expand=False)
    return from_query.fillna(links.str.slice(-VARIANT_SUFFIX_LENGTH))


strip_quotes(cidf, CI_QUOTED_COLUMNS)
cidf['VARIANTNUM'] = variant_number(cidf['LINK'])

strip_quotes(pvdf, PV_QUOTED_COLUMNS)
pvdf['VARIANTNUM'] = variant_number(pvdf['LINK'])

#list(cidf.columns)

In [None]:
# Pull the repeated customer inputs into their own frames, keyed three ways.
# The LINK and VARIANTNUM frames are taken before the 'Attributes' helper column
# is added to cidf, so they do not carry that column.
duplicatecilink = cidf.loc[cidf.duplicated(subset='LINK')]
duplicatecivariantnum = cidf.loc[cidf.duplicated(subset='VARIANTNUM')]

# 'Attributes' is NAME, COLOR and SIZE concatenated with no separator.
name_and_color = cidf['NAME'] + cidf['COLOR']
cidf['Attributes'] = name_and_color + cidf['SIZE']
duplicateciattributes = cidf.loc[cidf.duplicated(subset='Attributes')]

duplinks, dupvars, dupatt = (
    len(frame) for frame in (duplicatecilink, duplicatecivariantnum, duplicateciattributes)
)

# Keep only the first customer input per VARIANTNUM and report how many remain.
cidf = cidf.drop_duplicates(subset='VARIANTNUM', keep='first')
dedupcicount = len(cidf)
print(dedupcicount)
#pvdf
#cidf

In [None]:
# Matching strategy 'urlvariantnum': left-join the customer inputs (cidf) to the
# crawled variants (pvdf) on the variant id taken from their LINKs (VARIANTNUM).

# Column order of the merged frame; *_x columns come from cidf, *_y from pvdf.
MERGED_COLUMNS = ['NAME','TITLE','VARIANTNUM', 'COLOR', 'DISPLAY_COLOR', 'SIZE_x', 'SIZE_y', 'LINK_x', 'LINK_y', 'MFR_PART_NO', 'SKU', 'PRODUCT_ID', 'GTIN', 'ITEM_GROUP_ID', 'MATCHED_CI_ID', 'PVID','BRAND', 'CATALOG_ID', 'CD', 'CUSTOMER', 'FINAL_PRODUCT_MATCHES', 'GENDER', 'LAST_CRAWLED_DATE', 'OPTIONS', 'RAW_SITE_URL', 'RETAILER_ITEM_ID', 'SAB', 'STATE', 'STATUS', 'UNIQUE_ID', 'UPDATED_AT', '_ID_x', '_ID_y']

# pandas joins null keys to each other, so variants without a VARIANTNUM are left
# out of the right side; otherwise a CI without a link would "match" all of them.
pv_with_key = pvdf.dropna(subset=['VARIANTNUM'])
merged_all = cidf.merge(pv_with_key, on='VARIANTNUM', how='left', indicator=True)

# A CI is unmatched when the join found no variant for it ('left_only'). This is
# used instead of "TITLE is null", which also flags matched variants with no title.
is_unmatched = merged_all['_merge'].eq('left_only')
mergedf = merged_all[MERGED_COLUMNS]

unmatchcount = int(is_unmatched.sum())
print(dupvars, cicount, dedupcicount, unmatchcount)
# Share of de-duplicated CIs that found a variant; NaN when there are no CIs at all.
matchrate_urlvariantnum = (1 - unmatchcount / dedupcicount) if dedupcicount else float('nan')
print("Number of unmatched CIs =",unmatchcount,"| Strategy Match =",matchrate_urlvariantnum)

# Unmatched rows keep their index labels from mergedf (the index is not reset).
unmatcheddf = mergedf[is_unmatched]
unmatcheddfc = len(unmatcheddf.index)
#unmatcheddf
#unmatcheddfc
#list(mergedf.columns)

In [None]:
# Check the unmatched CIs for 'No Data Found' pages / 404s by requesting each link
# and printing the HTTP response, then save the unmatched CIs to ndf.csv.

# Seconds to wait on a link before giving up, so one dead host cannot hang the loop.
REQUEST_TIMEOUT_SECONDS = 5

# unmatcheddf is a filtered frame, so its index labels are not 0..n-1; iterate over
# (label, link) pairs instead of looking rows up by a positional counter.
for row_label, link in unmatcheddf['LINK_x'].items():
    if pd.isna(link):
        print("No link to check for row", row_label)
        continue
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as error:
        # Report the failing link and carry on with the remaining ones.
        print("Error checking row", row_label, link, error)
        continue
    print(response)

unmatcheddf.to_csv('ndf.csv')