#  Final Project: Etsy Marketplace Exploration

## Data Aquisition

In [2]:
# import HTML tools
from lxml import html
import xml.etree.ElementTree as ET

In [1]:
# read in the list of etsy page categories
import pandas as pd
etsy_pages = pd.read_csv("etsy_category_pages.csv")


In [3]:
page_cat = etsy_pages['page'].tolist()

In [4]:
# generate list of urls for each category

urls = []
start = "https://www.etsy.com/c/"

for x in page_cat:
    urls.append(start + x)

In [6]:
# do the urls have a page associated with them?
import requests
page_response2 = []
page = ""
for x in urls:
    page = requests.get(x)
    page_response2.append(page)
    
zip(page_cat,page_response2)

[('accessories', <Response [200]>),
 ('bags-and-purses', <Response [200]>),
 ('clothing', <Response [200]>),
 ('shoes', <Response [200]>),
 ('jewelry', <Response [200]>),
 ('craft-supplies-and-tools', <Response [200]>),
 ('weddings', <Response [200]>),
 ('books-movies-and-music', <Response [200]>),
 ('electronics-and-accessories', <Response [200]>),
 ('toys-and-games', <Response [200]>),
 ('art-and-collectibles', <Response [200]>),
 ('bath-and-beauty', <Response [200]>),
 ('home-and-living', <Response [200]>),
 ('paper-and-party-supplies', <Response [200]>),
 ('pet-supplies', <Response [200]>)]

In [7]:
# All of these categories get a successful response 
# Now that we have a list of catgories with associated web pages, we can scrape subcategories
# After we collect subcategores we can get the shop names from each subcategory page

In [8]:
# Find all html elements which contain the links to the subcategory pages
import lxml
sub_elements = []

for url in urls:
    #send the request
    content = requests.get(url)
    #save the page source code to a string called content_string
    content_string = content.text.encode("utf-8")
    #pass the page source to our html parse
    doc = lxml.html.document_fromstring(content_string)
    # Find html element containing each subcategory
    element = doc.xpath("//li[@class='pb-xs-1 pl-xs-0']")
    sub_elements.append(element)

In [9]:
# Get urls for each of the subcategories and store them in a list
sub_links = []

for element in sub_elements:
    for x in element:
        child = x.getchildren()
        new = child[0].attrib['href']
        sub_links.append(new)

In [12]:
# Save subcategory links in a csv file
import csv

with open('subcategory_links.csv', 'w') as csvfile:
    fieldnames=['link']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    # iterate through category and subcategory
    for item in sub_links:
        writer.writerow({'link': item})

In [13]:
# open csv subcategory link file
newdata = pd.read_csv("subcategory_links.csv",sep="/",names=('0','1','2','3','top_category','link_address'))
del(newdata['0'],newdata['1'],newdata['2'],newdata['3'])
catdf = newdata.dropna()


In [14]:
# extract subcatgory name from link and save as new csv file
def clean_links(text):
    return text.split("?")[0]

catdf['sub_category'] = catdf['link_address'].apply(clean_links)
catdf.to_csv(path_or_buf = "category_list")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
sub_links = pd.read_csv("subcategory_links.csv")

In [25]:
# get source code for every subcategory link and append to the sub_links dataframe    
for i in range(len(sub_links)):
    content = sub_links['link'][i]
    source_code = requests.get(content)
    source_string = source_code.text.encode("utf-8")
    sub_links['source'][i] = source_string

In [27]:
# join dataframes

catdf = catdf.reset_index()
catdf = catdf.join(sub_links)
del catdf['index']
del catdf['link_address']
#catdf = catdf.set_index(['sub_category'], drop = False)

catdf

Unnamed: 0,top_category,sub_category,link,source
0,accessories,hair-accessories,https://www.etsy.com/c/accessories/hair-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
1,accessories,hats-and-caps,https://www.etsy.com/c/accessories/hats-and-ca...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
2,accessories,baby-accessories,https://www.etsy.com/c/accessories/baby-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
3,accessories,scarves-and-wraps,https://www.etsy.com/c/accessories/scarves-and...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
4,accessories,keychains-and-lanyards,https://www.etsy.com/c/accessories/keychains-a...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
5,accessories,suit-and-tie-accessories,https://www.etsy.com/c/accessories/suit-and-ti...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
6,accessories,patches-and-pins,https://www.etsy.com/c/accessories/patches-and...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
7,accessories,belts-and-suspenders,https://www.etsy.com/c/accessories/belts-and-s...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
8,accessories,costume-accessories,https://www.etsy.com/c/accessories/costume-acc...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
9,accessories,sunglasses-and-eyewear,https://www.etsy.com/c/accessories/sunglasses-...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."


In [28]:
# finds all shop names from html source string and appends to list
# returns list

def get_shop_names(source_string):
    j = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//div[@class='card-meta-row-item text-truncate overflow-hidden card-shop-name']")
    for tag in tags:
        j.append(tag.text)
    return j

In [30]:
# create new column to store list of shop names
catdf['shops'] = ""

In [31]:
# populate column with list of shop names
for i in range(len(catdf)):
    l = get_shop_names(catdf['source'][i])
    catdf['shops'][i] = l

In [48]:
# takes raw list containing shop names
#returns clean list with shop name only

import re

def clean_name(messy_name_list):
    new_list = []
    for name in messy_name_list:
        finder = re.compile("\n" + "(.*?)" + "\n", re.IGNORECASE)
        matches = finder.findall(name)
        no_replace = re.compile("\s+")
        p = no_replace.sub("", matches[0])
        new_list.append(p)
    return new_list

In [50]:
# apply function to every list in column 'shops'
for i in range(len(catdf)):
    catdf['shops'][i] = clean_name(catdf['shops'][i])

In [54]:
catdf['review_links'] = ""

In [55]:
# generate urls for each shop name in the list

for i in range(len(catdf)):
    urls = []
    for x in catdf['shops'][i]:
        urls.append("https://www.etsy.com/shop/"+ x +"/reviews")
    catdf['review_links'][i] = urls

In [57]:
catdf['review_links']

0      [https://www.etsy.com/shop/ForUrPrincess22/rev...
1      [https://www.etsy.com/shop/TheTurnipSeed/revie...
2      [https://www.etsy.com/shop/LovelyFlowersBloom/...
3      [https://www.etsy.com/shop/Aslidesign/reviews,...
4      [https://www.etsy.com/shop/BohemianEarthDesign...
5      [https://www.etsy.com/shop/PaperAnniversaryLov...
6      [https://www.etsy.com/shop/ExtremeLargeness/re...
7      [https://www.etsy.com/shop/TalismanaDesigns/re...
8      [https://www.etsy.com/shop/Lightthedynamite/re...
9      [https://www.etsy.com/shop/AmericanDeadstock/r...
10     [https://www.etsy.com/shop/handicraftland/revi...
11     [https://www.etsy.com/shop/RedWalrusShoppe/rev...
12     [https://www.etsy.com/shop/WearItMiniPlants/re...
13     [https://www.etsy.com/shop/iPhoneCaseLove/revi...
14     [https://www.etsy.com/shop/LBHCreations/review...
15     [https://www.etsy.com/shop/DashForward/reviews...
16     [https://www.etsy.com/shop/Divanitas/reviews, ...
17     [https://www.etsy.com/sh

In [48]:
 # get source code for shop review pages
store_here = []
    
for url in shop_review_urls:
    #send the request
    content = requests.get(url)
    #save the page source code to a string called content_string
    content_string = content.text.encode("utf-8")
    #pass the page source to our html parse
    doc = lxml.html.document_fromstring(content_string)
    # Find html element containing each subcategory
    #element = doc.xpath("//li[@class='pb-xs-1 pl-xs-0']")
    store_here.append(doc)

In [49]:
print store_here

[<Element html at 0x114cf18e8>]


## Trials and Attempts

In [2]:
# API call to get list of categories
categories = requests.get("https://openapi.etsy.com/v2/taxonomy/categories?api_key=b1j6zj00oxibenikkokathh5")

In [None]:
string = re.compile(sub_string[0])
string.search()