#  Final Project: Etsy Marketplace Exploration

## Data Aquisition

In [1]:
# import HTML tools
from lxml import html
import xml.etree.ElementTree as ET

In [2]:
# read in the list of etsy page categories
import pandas as pd
etsy_pages = pd.read_csv("etsy_category_pages.csv")


In [3]:
page_cat = etsy_pages['page'].tolist()

In [4]:
# generate list of urls for each category

urls = []
start = "https://www.etsy.com/c/"

for x in page_cat:
    urls.append(start + x)

In [5]:
# All of these categories get a successful response 
# Now that we have a list of catgories with associated web pages, we can scrape subcategories
# After we collect subcategores we can get the shop names from each subcategory page

In [6]:
# Find all html elements which contain the links to the subcategory pages
import requests
import lxml
sub_elements = []

for url in urls:
    #send the request
    content = requests.get(url)
    #save the page source code to a string called content_string
    content_string = content.text.encode("utf-8")
    #pass the page source to our html parse
    doc = lxml.html.document_fromstring(content_string)
    # Find html element containing each subcategory
    element = doc.xpath("//li[@class='pb-xs-1 pl-xs-0']")
    sub_elements.append(element)

In [7]:
# Get urls for each of the subcategories and store them in a list
sub_links = []

for element in sub_elements:
    for x in element:
        child = x.getchildren()
        new = child[0].attrib['href']
        sub_links.append(new)

In [8]:
# Save subcategory links in a csv file
import csv

with open('subcategory_links.csv', 'w') as csvfile:
    fieldnames=['link']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    # iterate through category and subcategory
    for item in sub_links:
        writer.writerow({'link': item})

In [9]:
# open csv subcategory link file
newdata = pd.read_csv("subcategory_links.csv",sep="/",names=('0','1','2','3','top_category','link_address'))
del(newdata['0'],newdata['1'],newdata['2'],newdata['3'])
catdf = newdata.dropna()


In [10]:
# extract subcatgory name from link and save as new csv file
def clean_links(text):
    return text.split("?")[0]

catdf['sub_category'] = catdf['link_address'].apply(clean_links)
#catdf.to_csv(path_or_buf = "category_list")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
sub_links = pd.read_csv("subcategory_links.csv")

In [12]:
sub_links['source'] = ""

In [13]:
# get source code for every subcategory link and append to the sub_links dataframe    
for i in range(len(sub_links)):
    content = sub_links['link'][i]
    source_code = requests.get(content)
    source_string = source_code.text.encode("utf-8")
    sub_links['source'][i] = source_string

In [14]:
# join dataframes

catdf = catdf.reset_index()
catdf = catdf.join(sub_links)
del catdf['index']
del catdf['link_address']
#catdf = catdf.set_index(['sub_category'], drop = False)

catdf.head(5)

Unnamed: 0,top_category,sub_category,link,source
0,accessories,hair-accessories,https://www.etsy.com/c/accessories/hair-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
1,accessories,hats-and-caps,https://www.etsy.com/c/accessories/hats-and-ca...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
2,accessories,baby-accessories,https://www.etsy.com/c/accessories/baby-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
3,accessories,scarves-and-wraps,https://www.etsy.com/c/accessories/scarves-and...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
4,accessories,keychains-and-lanyards,https://www.etsy.com/c/accessories/keychains-a...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."


In [15]:
# finds all shop names from html source string and appends to list
# returns list

def get_shop_names(source_string):
    j = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//div[@class='card-meta-row-item text-truncate overflow-hidden card-shop-name']")
    for tag in tags:
        j.append(tag.text)
    return j

In [16]:
# create new column to store list of shop names
catdf['shops'] = ""

In [17]:
# populate column with list of shop names
for i in range(len(catdf)):
    l = get_shop_names(catdf['source'][i])
    catdf['shops'][i] = l

In [18]:
# takes raw list containing shop names
#returns clean list with shop name only

import re

def clean_name(messy_name_list):
    new_list = []
    for name in messy_name_list:
        finder = re.compile("\n" + "(.*?)" + "\n", re.IGNORECASE)
        matches = finder.findall(name)
        no_replace = re.compile("\s+")
        p = no_replace.sub("", matches[0])
        new_list.append(p)
    return new_list

In [19]:
# apply function to every list in column 'shops'
for i in range(len(catdf)):
    catdf['shops'][i] = clean_name(catdf['shops'][i])

In [20]:
catdf.head(5)

Unnamed: 0,top_category,sub_category,link,source,shops
0,accessories,hair-accessories,https://www.etsy.com/c/accessories/hair-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ThinkPinkBows, LillyBowPeep, CrystalEclipseCr..."
1,accessories,hats-and-caps,https://www.etsy.com/c/accessories/hats-and-ca...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[VintageLovers4, BeanieVille, CaporCrap, Pinka..."
2,accessories,baby-accessories,https://www.etsy.com/c/accessories/baby-access...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[HenleysHeadbands, SkylarnMe, ThinkPinkBows, B..."
3,accessories,scarves-and-wraps,https://www.etsy.com/c/accessories/scarves-and...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[storiarts, MiracleShine, Futurerity2, MelScar..."
4,accessories,keychains-and-lanyards,https://www.etsy.com/c/accessories/keychains-a...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[artbymorgie, MelissasMonograms, Radmarkers, G..."


In [114]:
categories = catdf

In [115]:
del categories['source']

In [116]:
del categories['link']

In [117]:
categories.to_csv(path_or_buf = "category_data")

In [22]:
all_shops = []

for i in range(len(catdf['shops'])):
    for x in catdf['shops'][i]:
        all_shops.append(x)

In [23]:
shop_df = pd.DataFrame(data = all_shops)

In [24]:
shop_df['shop_name'] = shop_df[0]

In [25]:
shop_df['review_links'] = ""

In [26]:
len(shop_df)

6762

In [27]:
# generate urls for each shop name in the list

for i in range(len(shop_df)):
    x = shop_df['shop_name'][i]
    url = ("https://www.etsy.com/shop/"+ x +"/reviews")
    shop_df['review_links'][i] = url

In [28]:
shop_df.head(5)

Unnamed: 0,0,shop_name,review_links
0,ThinkPinkBows,ThinkPinkBows,https://www.etsy.com/shop/ThinkPinkBows/reviews
1,LillyBowPeep,LillyBowPeep,https://www.etsy.com/shop/LillyBowPeep/reviews
2,CrystalEclipseCrowns,CrystalEclipseCrowns,https://www.etsy.com/shop/CrystalEclipseCrowns...
3,LeChaim,LeChaim,https://www.etsy.com/shop/LeChaim/reviews
4,jahannamartinez,jahannamartinez,https://www.etsy.com/shop/jahannamartinez/reviews


In [29]:
# get source code for every review link and append to the shop_df dataframe
# try first 200 shops

shop_df['review_source'] = ""

for i in range(0,200):
    content = shop_df['review_links'][i]
    source_code = requests.get(content)
    source_string = source_code.text.encode("utf-8")
    shop_df['review_source'][i] = source_string
    print i

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [None]:
shop_df['input'] = shop_df.index

In [30]:
shop_df.head(2)

Unnamed: 0,0,shop_name,review_links,review_source
0,ThinkPinkBows,ThinkPinkBows,https://www.etsy.com/shop/ThinkPinkBows/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."
1,LillyBowPeep,LillyBowPeep,https://www.etsy.com/shop/LillyBowPeep/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."


In [101]:
# function to get list of item review dates from each shop review page

def get_shop_dates(source_string):
    p_elements = []
    a_elements = []
    list_dates = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//div[@class='mt-xs-2 mb-xs-2']")
    
    for tag in tags:
        p_elements.append(tag.getchildren())
    for p in p_elements:
        for x in p:
            a_elements.append(x.getchildren())
    for element in a_elements:
        for x in element:
            list_dates.append(x.tail)
    return list_dates

In [32]:
shop_df['review_dates'] = ""

In [102]:
# populate column with list of review dates
# apply function to first 200 shops

for i in range(200):
    shop_df['review_dates'][i] = get_shop_dates(shop_df['review_source'][i])

In [104]:
# function to get list of item ratings from each shop review page

def get_review_ratings(source_string):
    span_elements = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//span[@class='screen-reader-only']")
    
    for tag in tags:
        span_elements.append(tag.text)
    return span_elements

In [63]:
shop_df['ratings'] = ""

In [105]:
# populate column with list of review ratings
# apply function to first 200 shops

for i in range(200):
    shop_df['ratings'][i] = get_review_ratings(shop_df['review_source'][i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [106]:
# function to get list of item review decriptions from each shop review page

def get_shop_details(source_string):
    p_elements = []
    a_elements = []
    list_details = []
    doc = lxml.html.document_fromstring(source_string)
    tags = doc.xpath("//div[@class='flag-body hide-xs hide-sm']")
    
    for tag in tags:
        p_elements.append(tag.getchildren())
    for p in p_elements:
        for x in p:
            a_elements.append(x.getchildren())
    for a in a_elements:
        for x in a:
            list_details.append(x.text)
    return list_details

In [94]:
shop_df['item_details'] = ""

In [107]:
# populate column with list of review descriptions
# apply function to first 200 shops

for i in range(200):
    shop_df['item_details'][i] = get_shop_details(shop_df['review_source'][i])

In [108]:
shop_df.head(10)

Unnamed: 0,0,shop_name,review_links,review_source,review_dates,ratings,item_details
0,ThinkPinkBows,ThinkPinkBows,https://www.etsy.com/shop/ThinkPinkBows/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[LadyBug lace romper, lace petti romper, rompe..."
1,LillyBowPeep,LillyBowPeep,https://www.etsy.com/shop/LillyBowPeep/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[lavender Petti Lace Romper, baby girl clothes..."
2,CrystalEclipseCrowns,CrystalEclipseCrowns,https://www.etsy.com/shop/CrystalEclipseCrowns...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 13, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Dog tooth amethyst nugget bead headband wrapp...
3,LeChaim,LeChaim,https://www.etsy.com/shop/LeChaim/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Hummingbird Necklace. Teardrop Cream Pearl An...
4,jahannamartinez,jahannamartinez,https://www.etsy.com/shop/jahannamartinez/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 14, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[Unique Gifts For Nurses, Headbands For Women,..."
5,TrueNorthCollection,TrueNorthCollection,https://www.etsy.com/shop/TrueNorthCollection/...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Yoga Headband - Workout Headband - Fitness He...
6,avtboutique,avtboutique,https://www.etsy.com/shop/avtboutique/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Boot socks - women leg warmers - boot cuffs -...
7,FangirlCreation,FangirlCreation,https://www.etsy.com/shop/FangirlCreation/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 13, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 4...","[The Harley Quinn Inspired Bow, The Speedy Ins..."
8,Princessory,Princessory,https://www.etsy.com/shop/Princessory/reviews,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[Coming Home Outfit Girl, Baby Girl Outfit, Ba..."
9,tanyaslittleshop,tanyaslittleshop,https://www.etsy.com/shop/tanyaslittleshop/rev...,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...","[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[Dainty White Pip Circlet, Bridesmaid Flower C..."


In [109]:
shopdata = shop_df

In [111]:
del shopdata['review_source']

In [112]:
shopdata

Unnamed: 0,0,shop_name,review_links,review_dates,ratings,item_details
0,ThinkPinkBows,ThinkPinkBows,https://www.etsy.com/shop/ThinkPinkBows/reviews,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[LadyBug lace romper, lace petti romper, rompe..."
1,LillyBowPeep,LillyBowPeep,https://www.etsy.com/shop/LillyBowPeep/reviews,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[lavender Petti Lace Romper, baby girl clothes..."
2,CrystalEclipseCrowns,CrystalEclipseCrowns,https://www.etsy.com/shop/CrystalEclipseCrowns...,"[ on Aug 13, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Dog tooth amethyst nugget bead headband wrapp...
3,LeChaim,LeChaim,https://www.etsy.com/shop/LeChaim/reviews,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Hummingbird Necklace. Teardrop Cream Pearl An...
4,jahannamartinez,jahannamartinez,https://www.etsy.com/shop/jahannamartinez/reviews,"[ on Aug 14, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[Unique Gifts For Nurses, Headbands For Women,..."
5,TrueNorthCollection,TrueNorthCollection,https://www.etsy.com/shop/TrueNorthCollection/...,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Yoga Headband - Workout Headband - Fitness He...
6,avtboutique,avtboutique,https://www.etsy.com/shop/avtboutique/reviews,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...",[Boot socks - women leg warmers - boot cuffs -...
7,FangirlCreation,FangirlCreation,https://www.etsy.com/shop/FangirlCreation/reviews,"[ on Aug 13, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 4...","[The Harley Quinn Inspired Bow, The Speedy Ins..."
8,Princessory,Princessory,https://www.etsy.com/shop/Princessory/reviews,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[Coming Home Outfit Girl, Baby Girl Outfit, Ba..."
9,tanyaslittleshop,tanyaslittleshop,https://www.etsy.com/shop/tanyaslittleshop/rev...,"[ on Aug 15, 2016\n , on A...","[Search, 5 out of 5 stars, 5 out of 5 stars, 5...","[Dainty White Pip Circlet, Bridesmaid Flower C..."


In [113]:
shopdata.to_csv(path_or_buf = "shop_data")

UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 0: ordinal not in range(128)