# Importing Data

In [1]:
# import libraries
from bs4 import BeautifulSoup
from urllib import request
import time
import random
import numpy as np
import pandas as pd
import re

In this file, I will make the previous web scraping codes into a pipelin.

1. functionget_hyperlinks will created a csv file with the name: component_name_hyperlinks.csv. This file stores all the hyperlinks of each item in the search page of url from newegg website.

   a. component_name is the name of the computer components i.e intel_cpu
   
   b. The url should be in the following form url = "https://www.newegg.com/p/pl?N=100007671%20601306860&Page={}"
   
   c. num_pages is the total number of pages to be web scraped in the searching page of the url at newegg website
   
2. 

## Web Scraping all Hyperlinks

In [2]:
def get_hyperlinks(component_name,url,num_pages):
    ## create a csv table to store all the hyperlinks of CPU
    filename = "{}_hyperlinks.csv".format(component_name)
    f = open(filename, "w", encoding='utf-8')
    headers = "price,product_detail,hyperlink\n"
    f.write(headers)
    
    
    # Scraping num_pages of my_url from newegg
    print('Scraping {} searching pages:'.format(component_name))
    
    for i in range(1, num_pages+1):
        my_url = url.format(i)
        uClient = request.urlopen(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = BeautifulSoup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "item-container"})

        for container in containers:
            ## Find hyperlink that directs to the webpage of that particular product
            hyperlink = list(container.children)[3]['href']
            
            Pli=container.find('li',{"class":"price-current"})

            strong=Pli.find('strong')
            sup=Pli.find('sup')
            
            strong=strong.text if strong else ''               
            sup=sup.text if strong else ''

            price=strong+sup
            if price:
                price=price.replace(',','')
            else:
                continue
            product_detail = list(container.children)[3].img["alt"].replace(",", "  ")
            
            
            f.write(price+","+product_detail+","+hyperlink+"\n")

            time.sleep(random.random()+0.5)
        time.sleep(10+random.random())
        print(i)
    
    f.close()

## Get all specs from the hyperlink_file 

In [3]:
## return a list of specs by webscraping all the hyperlinks in the 
#  component_name+"_hyperlinks.csv" file created by get_hyperlinks function 
def get_specs(component_name):
    hyperlinks=pd.read_csv(component_name+"_hyperlinks.csv")
    
    ## all cpu specs will be stored in the list total_specs
    ## then total_specs will be stored in a csv table
    total_specs=[]
    i=0
    print('Scraping each item page of {} for specs:'.format(component_name))
    
    for i in range(len(hyperlinks['hyperlink'])):
        url=hyperlinks['hyperlink'][i]
        uClient = request.urlopen(url)
        page_html = uClient.read()
        uClient.close()
        page_soup = BeautifulSoup(page_html, "lxml")  

        Specs={'price':hyperlinks['price'][i]}
        specs = page_soup.find("div", {"id": "Specs"})
        fieldsets=specs.findAll("fieldset")
        for fieldset in fieldsets:
            dls=fieldset.findAll("dl")
            for dl in dls:
                Specs[dl.find("dt").text]=dl.find("dd").text

        total_specs.append(Specs)
        i+=1
        print(i)

        time.sleep(10+random.random())
        
    return total_specs

In [4]:
## write all the specs into a csv file called component_name+"_specs.csv"
def write_specs(total_specs,component_name):
    ## find the maximal number of header titles for the csv file
    headers=total_specs[0]
    for i in total_specs[1:]:
        for j in i.keys():
            if j not in headers:
                headers[j]=None    
    
    ## make a string file for the header
    headerstr=''
    for i in list(headers.keys()):
        headerstr=headerstr+i+','
    headerstr=headerstr+'\n'      
    
    ## We create a csv table to store all the hyperlinks of CPU
    filename = component_name+"_specs.csv"
    f = open(filename, "w", encoding='utf-8')
    f.write(headerstr)
    
    ## write specs information into the csv file
    headers_list=list(headers.keys())
    for item in total_specs:
        item_str=''
        for col in headers_list:
            item_str=item_str+str(item.get(col,'None')).replace(',',' ')+','
        item_str=item_str[:-1]
        item_str=item_str+'\n'
        f.write(item_str) 
    
    f.close()

In [5]:
## Here is the total pipeline
def pipeline(component_name,url,num_pages):
    get_hyperlinks(component_name,url,num_pages)
    total_specs=get_specs(component_name)
    write_specs(total_specs,component_name)

In [None]:
write_specs(total_specs,component_name)

In [6]:
url='https://www.newegg.com/p/pl?N=100007709%20600030348&page={}'
component_name='Nvidia_GPU'
num_pages=45 
pipeline(component_name,url,num_pages)

Scraping Nvidia_GPU searching pages:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
Scraping each item page of Nvidia_GPU for specs:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224


AttributeError: 'float' object has no attribute 'timeout'

In [8]:
case=pd.read_csv('Nvidia_GPU_hyperlinks.csv')

In [10]:
case[673:676]

Unnamed: 0,price,product_detail,hyperlink
673,54.99,EVGA 200 GeForce 210 DirectX 10.1 1GB 64-Bit D...,
674,01G-P3-1312-LR,https://www.newegg.com/evga-geforce-210-01g-p3...,
675,59.99,EVGA GeForce 210 DirectX 10.1 01G-P3-1313-KR 1...,https://www.newegg.com/evga-geforce-210-01g-p3...


In [12]:
case['product_detail'][673]

'EVGA 200 GeForce 210 DirectX 10.1 1GB 64-Bit DDR3 PCI Express 2.0 x16 HDCP Ready Low Profile Video Card'

In [13]:
case['price'][674]

' 01G-P3-1312-LR'

In [15]:
case['product_detail'][674]

'https://www.newegg.com/evga-geforce-210-01g-p3-1312-lr/p/1FT-001K-00418'