#  Final Project: Etsy Marketplace Exploration

## Data Aquisition

In [1]:
# import HTML tools
from lxml import html
import xml.etree.ElementTree as ET

In [2]:
# read in the list of etsy page categories
import pandas as pd
etsy_pages = pd.read_csv("etsy_category_pages.csv")


In [3]:
page_cat = etsy_pages['page'].tolist()

In [4]:
# generate list of urls for each category

urls = []
start = "https://www.etsy.com/c/"

for x in page_cat:
    urls.append(start + x)

In [5]:
# All of these categories get a successful response 
# Now that we have a list of catgories with associated web pages, we can scrape subcategories
# After we collect subcategores we can get the shop names from each subcategory page

In [7]:
# Find all html elements which contain the links to the subcategory pages
import requests
import lxml
sub_elements = []

for url in urls:
    #send the request
    content = requests.get(url)
    #save the page source code to a string called content_string
    content_string = content.text.encode("utf-8")
    #pass the page source to our html parse
    doc = lxml.html.document_fromstring(content_string)
    # Find html element containing each subcategory
    element = doc.xpath("//li[@class='pb-xs-1 pl-xs-0']")
    sub_elements.append(element)

In [8]:
# Get urls for each of the subcategories and store them in a list
sub_links = []

for element in sub_elements:
    for x in element:
        child = x.getchildren()
        new = child[0].attrib['href']
        sub_links.append(new)

In [9]:
# Save subcategory links in a csv file
import csv

with open('subcategory_links.csv', 'w') as csvfile:
    fieldnames=['link']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    # iterate through category and subcategory
    for item in sub_links:
        writer.writerow({'link': item})

In [10]:
# open csv subcategory link file
newdata = pd.read_csv("subcategory_links.csv",sep="/",names=('0','1','2','3','top_category','link_address'))
del(newdata['0'],newdata['1'],newdata['2'],newdata['3'])
catdf = newdata.dropna()


In [11]:
# extract subcatgory name from link and save as new csv file
def clean_links(text):
    return text.split("?")[0]

catdf['sub_category'] = catdf['link_address'].apply(clean_links)
#catdf.to_csv(path_or_buf = "category_list")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
sub_links = pd.read_csv("subcategory_links.csv")

In [13]:
sub_links['source'] = ""

In [14]:
# get source code for every subcategory link and append to the sub_links dataframe    
for i in range(len(sub_links)):
    content = sub_links['link'][i]
    source_code = requests.get(content)
    source_string = source_code.text.encode("utf-8")
    sub_links['source'][i] = source_string

In [15]:
# join dataframes

catdf = catdf.reset_index()
catdf = catdf.join(sub_links)
del catdf['index']
del catdf['link_address']
#catdf = catdf.set_index(['sub_category'], drop = False)

catdf.head(5)

Unnamed: 0,top_category,sub_category,link,source
0,accessories,hair-accessories,https://www.etsy.com/c/accessories/hair-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
1,accessories,hats-and-caps,https://www.etsy.com/c/accessories/hats-and-ca...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
2,accessories,baby-accessories,https://www.etsy.com/c/accessories/baby-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
3,accessories,scarves-and-wraps,https://www.etsy.com/c/accessories/scarves-and...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
4,accessories,keychains-and-lanyards,https://www.etsy.com/c/accessories/keychains-a...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."


In [16]:
# finds all shop names from html source string and appends to list
# returns list

def get_shop_names(source_string):
    j = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//div[@class='card-meta-row-item text-truncate overflow-hidden card-shop-name']")
    for tag in tags:
        j.append(tag.text)
    return j

In [17]:
# create new column to store list of shop names
catdf['shops'] = ""

In [18]:
# populate column with list of shop names
for i in range(len(catdf)):
    l = get_shop_names(catdf['source'][i])
    catdf['shops'][i] = l

In [19]:
# takes raw list containing shop names
#returns clean list with shop name only

import re

def clean_name(messy_name_list):
    new_list = []
    for name in messy_name_list:
        finder = re.compile("\n" + "(.*?)" + "\n", re.IGNORECASE)
        matches = finder.findall(name)
        no_replace = re.compile("\s+")
        p = no_replace.sub("", matches[0])
        new_list.append(p)
    return new_list

In [20]:
# apply function to every list in column 'shops'
for i in range(len(catdf)):
    catdf['shops'][i] = clean_name(catdf['shops'][i])

In [21]:
catdf.head(5)

Unnamed: 0,top_category,sub_category,link,source,shops
0,accessories,hair-accessories,https://www.etsy.com/c/accessories/hair-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[PoshPelicanCo, MustHaveBows, LoveMiaCo, jahan..."
1,accessories,hats-and-caps,https://www.etsy.com/c/accessories/hats-and-ca...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[BeanieVille, BlancaVeils, GorgeousComplements..."
2,accessories,baby-accessories,https://www.etsy.com/c/accessories/baby-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[BabySeblime, SweetAndStitched, SkylarnMe, Pos..."
3,accessories,scarves-and-wraps,https://www.etsy.com/c/accessories/scarves-and...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[MiracleShine, Avaneska, CutieChicBoutique, Zo..."
4,accessories,keychains-and-lanyards,https://www.etsy.com/c/accessories/keychains-a...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ChangeandCharms, LLDPetBoutique, catonealdesi..."


In [22]:
all_shops = []

for i in range(len(catdf['shops'])):
    for x in catdf['shops'][i]:
        all_shops.append(x)

In [23]:
shop_df = pd.DataFrame(data = all_shops)

In [24]:
shop_df['shop_name'] = shop_df[0]

In [25]:
shop_df['review_links'] = ""

In [26]:
# generate urls for each shop name in the list

for i in range(len(shop_df)):
    x = shop_df['shop_name'][i]
    url = ("https://www.etsy.com/shop/"+ x +"/reviews")
    shop_df['review_links'][i] = url

In [27]:
shop_df.head(5)

Unnamed: 0,0,shop_name,review_links
0,PoshPelicanCo,PoshPelicanCo,https://www.etsy.com/shop/PoshPelicanCo/reviews
1,MustHaveBows,MustHaveBows,https://www.etsy.com/shop/MustHaveBows/reviews
2,LoveMiaCo,LoveMiaCo,https://www.etsy.com/shop/LoveMiaCo/reviews
3,jahannamartinez,jahannamartinez,https://www.etsy.com/shop/jahannamartinez/reviews
4,OnceUponATwincess,OnceUponATwincess,https://www.etsy.com/shop/OnceUponATwincess/re...


In [40]:
# get source code for every review link and append to the shop_df dataframe    

#shop_df['review_source'] = ""

for i in range(0,100):
    content = shop_df['review_links'][i]
    source_code = requests.get(content)
    source_string = source_code.text.encode("utf-8")
    shop_df['review_source'][i] = source_string
    print i

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [45]:
shop_df['input'] = shop_df.index

In [46]:
shop_df.head(2)

Unnamed: 0,0,shop_name,review_links,review_source,input
0,PoshPelicanCo,PoshPelicanCo,https://www.etsy.com/shop/PoshPelicanCo/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",0
1,MustHaveBows,MustHaveBows,https://www.etsy.com/shop/MustHaveBows/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",1


In [57]:
p_elements = []
a_elements = []
review_dates = []
def get_shop_dates(source_string):
    j = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//div[@class='mt-xs-2 mb-xs-2']")
    
    for tag in tags:
        p_elements.append(tag.getchildren())
    for p in p_elements:
        for x in p:
            a_elements.append(x.getchildren())
    for element in a_elements:
        for x in element:
            review_dates.append(x.tail)
    return review_dates

In [58]:
shop_df['review_dates'] = ""

In [None]:
# populate column with list of review dates
for i in range(len(shop_df)):
    l = get_shop_dates(shop_df['review_source'][i])
    shop_df['review_dates'][i] = l

In [None]:
shop_df.head(3)

In [None]:
shop_df.to_csv(path_or_buf = "shop_data")

In [55]:
p_elements = []
a_elements = []
review_dates = []    
def review_dates_per_shop(i):
    content = shop_df['review_source'][i]
    content_string = content.text.encode("utf-8")
    doc = lxml.html.document_fromstring(content_string)
    tags = doc.xpath("//div[@class='mt-xs-2 mb-xs-2']")
    
    for tag in tags:
        p_elements.append(tag.getchildren())
    for p in p_elements:
        for x in p:
            a_elements.append(x.getchildren())
    for element in a_elements:
        for x in element:
            review_dates.append(x.tail)
    return review_dates

In [47]:
shop_df['review_dates'] = ""

In [53]:
for i in range(1):
    shop_df['review_dates'][i] = ['a','b']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [54]:
shop_df.head(2)

Unnamed: 0,0,shop_name,review_links,review_source,input,review_dates
0,PoshPelicanCo,PoshPelicanCo,https://www.etsy.com/shop/PoshPelicanCo/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",0,"[a, b]"
1,MustHaveBows,MustHaveBows,https://www.etsy.com/shop/MustHaveBows/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",1,


In [None]:
for i in range(3):
    #date_list = []
    x = review_dates_per_shop(0,i)
    #date_list.append(x)
    print x

In [None]:
def pull_dates(m,n):
    while n < 43:
        my_dict = review_dates_per_shop(m,n)
        poop.append(my_dict)
        n =+ 1
    return poop

In [None]:
list_of_dates = []
for i in range(len(shop_df)):
    x = review_dates_per_shop(i)
    list_of_dates.append(x)