#  Final Project: Etsy Marketplace Exploration

## Data Acquisition

In [1]:
# import HTML tools
from lxml import html
import xml.etree.ElementTree as ET

In [2]:
# read in the list of etsy page categories
import pandas as pd
etsy_pages = pd.read_csv("etsy_category_pages.csv")


In [3]:
page_cat = etsy_pages['page'].tolist()

In [4]:
# build the full page url for every category name in page_cat

start = "https://www.etsy.com/c/"
urls = [start + category for category in page_cat]

In [5]:
# request every category url and keep the responses, so we can see whether
# each category actually has a page behind it
import requests

page = ""
page_response2 = []
for category_url in urls:
    page = requests.get(category_url)
    page_response2.append(page)

# pair each category name with the response its url produced
zip(page_cat, page_response2)

[('accessories', <Response [200]>),
 ('bags-and-purses', <Response [200]>),
 ('clothing', <Response [200]>),
 ('shoes', <Response [200]>),
 ('jewelry', <Response [200]>),
 ('craft-supplies-and-tools', <Response [200]>),
 ('weddings', <Response [200]>),
 ('books-movies-and-music', <Response [200]>),
 ('electronics-and-accessories', <Response [200]>),
 ('toys-and-games', <Response [200]>),
 ('art-and-collectibles', <Response [200]>),
 ('bath-and-beauty', <Response [200]>),
 ('home-and-living', <Response [200]>),
 ('paper-and-party-supplies', <Response [200]>),
 ('pet-supplies', <Response [200]>)]

In [6]:
# All of these categories get a successful response 
# Now that we have a list of categories with associated web pages, we can scrape subcategories
# After we collect subcategories we can get the shop names from each subcategory page

In [7]:
# For every category page, collect the html elements which contain the links
# to the subcategory pages (one list of elements per category url)
import lxml
sub_elements = []

for url in urls:
    # download the category page
    response = requests.get(url)
    # parse the utf-8 encoded page source into an html document
    page_bytes = response.text.encode("utf-8")
    doc = lxml.html.document_fromstring(page_bytes)
    # keep the <li> elements that hold the subcategories of this page
    sub_elements.append(doc.xpath("//li[@class='pb-xs-1 pl-xs-0']"))

In [8]:
# Flatten the per-category element lists into one list of subcategory urls:
# the href is read from the first child of each collected element
sub_links = [
    item.getchildren()[0].attrib['href']
    for page_items in sub_elements
    for item in page_items
]

In [None]:
# Write the subcategory links to a csv file with a single 'link' column
import csv

with open('subcategory_links.csv', 'w') as csvfile:
    fieldnames = ['link']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # one row per subcategory link
    writer.writerows({'link': item} for item in sub_links)

In [None]:
# Re-read the saved links, splitting every line on "/" so that the pieces of
# each link land in separate columns; only the last two pieces are kept
newdata = pd.read_csv(
    "subcategory_links.csv",
    sep="/",
    names=('0', '1', '2', '3', 'top_category', 'link_address'),
)
for unused in ('0', '1', '2', '3'):
    del newdata[unused]
# discard rows where either remaining column is missing
catdf = newdata.dropna()


In [None]:
# extract subcategory name from link and save as new csv file
def clean_links(text):
    """Return the part of text that comes before the first "?"."""
    before_query, _, _ = text.partition("?")
    return before_query

# strip the query string from every link address to get the subcategory name,
# then save the resulting table
catdf['sub_category'] = catdf['link_address'].map(clean_links)
catdf.to_csv("category_list")

In [None]:
sub_links = pd.read_csv("subcategory_links.csv")

In [None]:
sub_links['source'] = ""

In [None]:
# Download the page source of every subcategory link and store it, utf-8
# encoded, in the 'source' column of the sub_links dataframe.
for i in sub_links.index:
    source_code = requests.get(sub_links.at[i, 'link'])
    # .at writes straight into the dataframe; the chained form
    # sub_links['source'][i] = ... can end up assigning to a temporary copy
    sub_links.at[i, 'source'] = source_code.text.encode("utf-8")

In [None]:
# join dataframes: renumber catdf's rows from zero (discarding the old index)
# and attach the sub_links columns side by side

catdf = catdf.reset_index(drop=True).join(sub_links)
del catdf['link_address']
#catdf = catdf.set_index(['sub_category'], drop = False)

catdf.head(5)

In [None]:
# parses an html source string and collects the text of every shop-name
# element found in it

def get_shop_names(source_string):
    """Return a list with the text of each shop-name <div> in source_string."""
    doc = lxml.html.document_fromstring(source_string)
    shop_tags = doc.xpath("//div[@class='card-meta-row-item text-truncate overflow-hidden card-shop-name']")
    return [shop_tag.text for shop_tag in shop_tags]

In [None]:
# create new column to store list of shop names
catdf['shops'] = ""

In [None]:
# populate column with the list of shop names scraped from each row's page
# source. The whole column is assigned at once: the element-wise form
# catdf['shops'][i] = ... is chained indexing and can write to a temporary copy.
catdf['shops'] = catdf['source'].apply(get_shop_names)

In [None]:
# takes raw list containing shop names
# returns clean list with shop name only

import re

def clean_name(messy_name_list):
    """Return the cleaned shop name for each raw entry in messy_name_list.

    For every entry, the text between the first pair of newlines is taken and
    all whitespace is removed from it. Entries that are None, or that contain
    no newline-delimited text, are skipped instead of raising an error, so the
    result can be shorter than the input.
    """
    # compile once, outside the loop; raw strings keep the regex escapes intact
    first_line = re.compile(r"\n(.*?)\n", re.IGNORECASE)
    whitespace = re.compile(r"\s+")

    new_list = []
    for name in messy_name_list:
        if name is None:
            # an element without text has nothing to clean
            continue
        match = first_line.search(name)
        if match is None:
            continue
        new_list.append(whitespace.sub("", match.group(1)))
    return new_list

In [None]:
# apply clean_name to every list in column 'shops'. The column is replaced in
# one assignment: catdf['shops'][i] = ... is chained indexing and can write to
# a temporary copy instead of the dataframe.
catdf['shops'] = catdf['shops'].apply(clean_name)

In [None]:
catdf['review_links'] = ""

In [None]:
catdf.head(5)

In [None]:
# generate the review-page url for each shop name in every row's list.
# The column is assigned as a whole (no chained catdf['review_links'][i] = ...
# indexing), and the module-level `urls` list of category urls is no longer
# overwritten by a temporary list.
catdf['review_links'] = catdf['shops'].apply(
    lambda shop_names: [
        "https://www.etsy.com/shop/" + shop_name + "/reviews"
        for shop_name in shop_names
    ]
)

In [None]:
len(catdf['review_links'][0])

In [None]:
# Module-level lists kept so that any other cell referring to these names
# still finds them; review_dates_per_shop no longer accumulates into them.
p_elements = []
a_elements = []
review_dates = []


def review_dates_per_shop(i, j):
    """Scrape the review page of shop j in row i of catdf.

    Requests catdf['review_links'][i][j], finds every
    <div class='mt-xs-2 mb-xs-2'> in the page and collects the tail text of
    each grandchild of those divs (presumably the review date -- confirm
    against the page markup).

    Returns a one-entry dict mapping catdf['shops'][i][j] to the list of
    collected tail texts. Each call builds a fresh list, so results of
    different shops are independent of one another.
    """
    shop = catdf['review_links'][i][j]
    content = requests.get(shop)
    content_string = content.text.encode("utf-8")
    doc = lxml.html.document_fromstring(content_string)
    tags = doc.xpath("//div[@class='mt-xs-2 mb-xs-2']")

    # local accumulator: the previous module-level lists grew across calls, so
    # every shop's result also contained the dates of all earlier shops
    dates = []
    for tag in tags:
        for child in tag.getchildren():
            for grandchild in child.getchildren():
                dates.append(grandchild.tail)
    return {catdf['shops'][i][j]: dates}

In [None]:
new = review_dates_per_shop(0,0)

In [None]:
new

In [None]:
new2 = review_dates_per_shop(0,1)
new2

In [None]:
# show the review dates scraped for the first three shops of row 0
for i in range(3):
    #date_list = []
    x = review_dates_per_shop(0,i)
    #date_list.append(x)
    # parenthesised form prints the same on Python 2 and also runs on Python 3
    print(x)

In [None]:
catdf['shop_review_dates'] = ""

In [None]:
def pull_dates(m, n, limit=43):
    """Collect the review-date dicts of row m for shop positions n..limit-1.

    Calls review_dates_per_shop(m, k) for every k from n up to (but not
    including) limit, appends each result to the module-level list `poop`,
    and returns that list.

    limit defaults to 43, the bound that used to be hard-coded here.
    """
    while n < limit:
        poop.append(review_dates_per_shop(m, n))
        # was `n =+ 1`, which assigns +1 to n and never ends the loop
        n += 1
    return poop

In [None]:
poop = []