# Creating a spider for the grain-free product listing

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import pandas as pd
import requests
import json

In [None]:
class ChewySpider(scrapy.Spider):
    """Spider that scrapes Chewy's grain-free listing, one item per product card.

    Every ``div.kib-product-card__content`` element on a listing page becomes
    one item; the ``a.kib-pagination-new-item--next`` anchor is followed until
    a page no longer offers one.
    """

    name = "Chewy_grain"
    start_urls = ['https://www.chewy.com/b/grain-free-11100']

    def parse(self, response):
        """Yield one dict per product card, then a request for the next page.

        Args:
            response: the listing page response handed over by Scrapy.

        Yields:
            dict: ``brand_name``, ``food_type``, ``price``, ``rating``,
                ``no_review`` and ``link`` for each product card. A field is
                ``None`` when its selector (or regex) matches nothing.
            scrapy.Request: a request for the next listing page, as long as
                the page carries a "next" link.
        """
        for card in response.css('div.kib-product-card__content'):
            yield {
                'brand_name': card.css('a.kib-product-title ::text').get(),
                # NOTE(review): inside the character class, `)-|` is parsed as
                # a range from ')' to '|', so the class accepts far more than
                # the characters listed -- confirm this is intended.
                'food_type': card.css('a.kib-product-title *::text').re_first(r"^\s{1}[(\w\s)-|&|']+"),
                'price': card.css('div.kib-product-price--md::text').get(),
                'rating': card.css('div.kib-product-rating__label::text').get(),
                'no_review': card.css('span.kib-product-rating__count::text').get(),
                'link': card.css('a.kib-product-title::attr(href)').get(),
            }

        # `.attrib` is an empty mapping when the selector matches nothing, so
        # indexing it with ['href'] would raise KeyError on the last page;
        # `.get` returns None there and lets the crawl end normally.
        next_page = response.css('a.kib-pagination-new-item--next').attrib.get('href')
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

# Run the spider in-process and export every yielded item to grain_free.csv.
process = CrawlerProcess(settings={
    "FEEDS": {
        # Local-file feeds append by default; overwrite so that re-running the
        # crawl does not stack the new rows on top of a previous run's output.
        "grain_free.csv": {"format": "csv", "overwrite": True},
    },
})

process.crawl(ChewySpider)
# Starts the crawl; the call returns once crawling has finished.
process.start()
    
    

In [None]:
# Load the scraped listing into a DataFrame and drop products with any missing field
pd.options.display.max_rows = 800  # raise the notebook display limit to 800 rows
gf = pd.read_csv('grain_free.csv').dropna().reset_index(drop=True)
gf

# Cleaning dataset

In [None]:
# Strip the formatting characters so the columns can be cast to numbers.
# regex=False forces a literal replacement: '$' is also the regex
# end-of-string anchor, and the default value of `regex` differs between
# pandas versions.
gf['price'] = gf['price'].str.replace('$', '', regex=False)
gf['no_review'] = gf['no_review'].str.replace(',', '', regex=False)

In [None]:
# Cast the cleaned columns: price -> float, review count -> int, rating -> str
gf = gf.astype({'price': 'float', 'no_review': 'int', 'rating': 'str'})

In [None]:
# Keep only the numeric part of the rating text (a digit, a dot, optional
# further digits) and store it as a float
pattern = r'(\d\.\d*)'

gf['rating'] = gf['rating'].str.extract(pattern).astype('float')

In [None]:
# Keep only the products that have more than 100 reviews
gf_100 = gf.loc[gf['no_review'].gt(100)].reset_index(drop=True)
gf_100

# Crawl through all pages

In [None]:
# Pull out the run of digits that follows "dp/" in each product link
gf_100_link = gf_100['link'].str.extract(r'dp\/(\d+)', expand=True)
gf_link = gf_100_link[0].tolist()
len(gf_link)

In [None]:
# Chewy does not paginate reviews; further ones are loaded through GraphQL.
# A product id plus an "after" cursor select which batch gets fetched.
URL = 'https://www.chewy.com/api/pdp-graphql/graphql'
headers = {
    # The value is the browser string only; the header name must not be
    # repeated inside it.
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36'
}

def get_params(product_id, after):
    """Build the query-string parameters for one page of the "Reviews" query.

    Args:
        product_id: product id placed in the query's ``id`` variable; it is
            sent as a string.
        after: cursor placed in the query's ``after`` variable.

    Returns:
        dict with the keys ``operationName``, ``variables`` and
        ``extensions``; the latter two are JSON-encoded strings.
    """
    variables = {
        "id": str(product_id),
        "first": 100,
        "sort": "MOST_RELEVANT",
        "after": after,
    }
    return {
        'operationName': 'Reviews',
        # json.dumps escapes quotes and backslashes that plain string
        # concatenation would leave raw; the compact separators reproduce the
        # original hand-built layout.
        'variables': json.dumps(variables, separators=(',', ':')),
        'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"f1ad95c550af020ebbc5c6da08fd478b1cf25b5e911cba4710d773b84e063730"}}',
    }

In [None]:
 def get_reviews(product_id):
     """Fetch all review pages for one product and save them as a JSON file.

     Pages are requested from ``URL`` with the parameters built by
     ``get_params`` until the API reports ``hasNextPage`` as false. The
     collected review edges are written to
     ``data/reviews/reviews_<product_id>.json``.

     Args:
         product_id: product id used in the query and in the output file name.

     Returns:
         list: the review edges collected across all pages.

     Raises:
         requests.HTTPError: if a page request returns an error status.
         requests.Timeout: if a page request exceeds the timeout.
     """
     import os  # local import: the notebook's import cell does not load os

     reviews = []
     hasNextPage = True
     # Base64 for "arrayconnection:9".
     # NOTE(review): presumably this starts after the first ten reviews --
     # confirm that skipping them is intended.
     after = "YXJyYXljb25uZWN0aW9uOjk="

     while hasNextPage:
         response = requests.get(
             URL,
             params=get_params(product_id, after),
             headers=headers,
             timeout=30,  # do not hang forever on a stalled connection
         )
         # Surface HTTP errors here rather than as a confusing JSON/KeyError.
         response.raise_for_status()

         # Decode the body once per page instead of once per field.
         page = response.json()['data']['product']['reviews']
         after = page['pageInfo']['endCursor']
         hasNextPage = page['pageInfo']['hasNextPage']
         reviews.extend(page['edges'])

     filepath = f"data/reviews/reviews_{product_id}.json"
     # Create the target folder on first use so open() cannot fail on it.
     os.makedirs(os.path.dirname(filepath), exist_ok=True)
     with open(filepath, "w") as fi:
         json.dump(reviews, fi)

     return reviews

In [None]:
# Download and store the reviews of every selected product, one id at a time.
for product_id in gf_link:
    get_reviews(product_id)

In [None]:
# NOTE(review): looks like a leftover debugging cell. `ids` is not defined in
# the visible part of the notebook, and get_reviews() does not return a
# requests response object, so `.json()` would raise AttributeError here.
# Confirm whether this cell is still needed.
response = get_reviews(ids)
response.json()
ids[0]