This project builds a web scraping tool that extracts the data of a table from a website, transforms it into a CSV file, and then saves it to disk.

The repository Web_Scraping_Tool.git stores all the files of the project.

The target website to scrape data from is: https://en.wikipedia.org/wiki/List_of_French_cheeses

In [None]:
#Create and activate the virtual environment for this notebook (only run it once in the shell).
#python -m venv venv
#source venv/bin/activate
#%pip install -r requirements.txt

In [None]:
# The website data was previously downloaded into the file List_of_French_cheeses.
# This cell reads that saved page and wraps it in a Scrapy TextResponse (the
# variable `response`) so that it can be queried with XPath.
import scrapy
import os

# os.path.abspath('') resolves to the current working directory.
current_dir = os.path.abspath('')
# NOTE: despite its name, `url` holds a local file path; it is also passed on
# as the URL of the response object below.
url = os.path.join(current_dir, "List_of_French_cheeses")
# Read with an explicit encoding so the decoded text does not depend on the
# platform default and agrees with the encoding declared for the response.
with open(url, encoding="utf-8") as _f:
    url_data = _f.read()

response = scrapy.http.TextResponse(url, body=url_data, encoding='utf-8')

In [None]:
# First, count the <table> elements that the page contains.
len(response.xpath('//table'))

In [None]:
# Each table index is inspected to find the one corresponding to the table we
# want to scrape; this cell displays the table at index 0.
response.xpath('//table')[0]

In [None]:
# Once the table is found, its <tbody> is stored in the variable `table`.
# The <td> text of the row at index 1 is then displayed to see how the data is
# presented; the unnecessary '\n' strings show up in this output.
table = response.xpath('//table')[0].xpath('tbody')
sample_row = table.xpath('tr')[1]
sample_row.xpath('td//text()').extract()

In [None]:
# This loop prints the five columns of every row, with the '\n' removed, to
# check that all the wanted information is available.
for tr in table.xpath('tr'):
    try:
        # For each of the five columns, keep the first text node found in the cell.
        print([
            tr.xpath(f'td[{col}]//text()').extract()[0].replace('\n', "")
            for col in range(1, 6)
        ])
    except IndexError:
        # extract() returned an empty list for one of the cells, so the row
        # does not have text in all five <td> columns (presumably the header
        # row, which would have no <td> cells). Skip it, but let any other
        # error surface instead of silently swallowing it.
        continue

In [None]:
# The names of the columns and the data of the rows are saved in lists.
column_names = ["Cheese", "Year_of_designated_appellation", "Producing_region", "Type_of_milk", "Designation"]
rows = []
for tr in table.xpath('tr'):
    try:
        # One value per column, in the order given by column_names: the first
        # text node of each <td>, with '\n' removed.
        row = [
            tr.xpath(f'td[{col}]//text()').extract()[0].replace('\n', "")
            for col in range(1, len(column_names) + 1)
        ]
    except IndexError:
        # A cell without text makes extract() return an empty list; such rows
        # are skipped. Only this expected error is caught, so real problems
        # are no longer hidden.
        continue
    rows.append(row)

# Quick check: second column of the first stored row.
print(rows[0][1])
 


In [None]:
# Now persist it to disk as a CSV file.
import csv

# newline="" is required by the csv module so that it controls line endings
# itself (otherwise extra blank lines can appear on some platforms); the
# explicit encoding keeps non-ASCII characters independent of the platform default.
with open("French_Cheeses.csv", "w", newline="", encoding="utf-8") as _f:
    writer = csv.writer(_f)

    # write the column names
    writer.writerow(column_names)

    # now write the rows
    writer.writerows(rows)

In [None]:
# The data is saved in a dictionary whose keys are the names of the cheeses and
# whose values are lists:
# [Year_of_designated_appellation, Producing_region, Type_of_milk, Designation].
scrapped_data = {}
for tr in table.xpath('tr'):
    try:
        # First text node of each of the five <td> cells, with '\n' removed.
        cheese, *details = [
            tr.xpath(f'td[{col}]//text()').extract()[0].replace('\n', "")
            for col in range(1, 6)
        ]
    except IndexError:
        # A cell without text makes extract() return an empty list; such rows
        # are skipped. Other errors are allowed to surface.
        continue
    scrapped_data[cheese] = details
print (scrapped_data['Abondance'])
print (scrapped_data['Abondance'][0])

In [None]:
# Print every entry of the dictionary (key followed by its four stored values)
# to check the data before it is saved to a JSON file.
for key, fields in scrapped_data.items():
    print(key, fields[0], fields[1], fields[2], fields[3])

In [None]:
# Store the dictionary in a JSON file.
import json

# JSON text of the dictionary held in memory; the file below is written
# directly from the dictionary, not from this string.
json_data = json.dumps(scrapped_data)

with open("French_Cheeses.json", "w") as json_file:
    json.dump(scrapped_data, json_file)

In [None]:
# The data is also saved in a SQLite database and different queries are performed.
import sqlite3

# Opens (or creates) the database file "French_Cheeses" in the working directory.
connection = sqlite3.connect("French_Cheeses")
cursor = connection.cursor()
# IF NOT EXISTS lets this cell be run again: the database file persists on
# disk, so a plain CREATE TABLE would fail on the second run because the
# table already exists.
cursor.execute('CREATE TABLE IF NOT EXISTS cheeses (id integer primary key, Cheese text, Year_of_designated_appellation text, Producing_region text, Type_of_milk text, Designation text)')
connection.commit()

In [None]:
# Insert one database row per table row, using a parameterized statement.
query = 'INSERT INTO cheeses (Cheese, Year_of_designated_appellation, Producing_region, Type_of_milk, Designation)VALUES(?,?,?,?,?)'
for tr in table.xpath('tr'):
    try:
        # First text node of each of the five <td> cells, with '\n' removed.
        values = tuple(
            tr.xpath(f'td[{col}]//text()').extract()[0].replace('\n', "")
            for col in range(1, 6)
        )
    except IndexError:
        # A cell without text makes extract() return an empty list; such rows
        # are skipped.
        continue
    # Executed outside the try block so that database errors are reported
    # instead of being silently ignored.
    cursor.execute(query, values)

# Commit once, after all rows have been inserted.
connection.commit()



In [None]:
# Count the rows stored in the cheeses table.
select_query = 'SELECT COUNT(id) from cheeses'
count_rows = cursor.execute(select_query)
for row in count_rows:
    print(row)

In [None]:
# Print every row stored in the cheeses table.
select_query = 'SELECT * from cheeses'
all_rows = cursor.execute(select_query)
for row in all_rows:
    print(row)

In [None]:
# Print the cheeses whose type of milk is exactly 'Goat'.
# The value is written as a single-quoted SQL string literal: in SQL, double
# quotes denote identifiers, and SQLite only treats "Goat" as a string through
# a legacy fallback that can be disabled.
select_query = "SELECT * from cheeses WHERE type_of_milk = 'Goat'"
for i in cursor.execute(select_query):
    print(i)

In [None]:
# Print the cheeses whose producing region is exactly 'Burgundy'.
# The value is written as a single-quoted SQL string literal: in SQL, double
# quotes denote identifiers, and SQLite only treats "Burgundy" as a string
# through a legacy fallback that can be disabled.
select_query = "SELECT * from cheeses WHERE Producing_region = 'Burgundy'"
for i in cursor.execute(select_query):
    print(i)

In [None]:
# Print the cheeses whose year of designated appellation is later than 2010.
# The column is declared as text, so comparing it directly with 2010 is done
# as a text comparison; the CAST makes the comparison numeric.
select_query = 'SELECT * from cheeses WHERE CAST(Year_of_designated_appellation AS INTEGER) > 2010'
for i in cursor.execute(select_query):
    print(i)

In [None]:
# Close the database connection.
connection.close()

In [None]:
# The connection is opened once again to perform another query: the cheeses
# whose year of designated appellation is later than 2010.
connection = sqlite3.connect("French_Cheeses")
cursor = connection.cursor()
# The column is declared as text, so comparing it directly with 2010 is done
# as a text comparison; the CAST makes the comparison numeric.
select_query = 'SELECT * from cheeses WHERE CAST(Year_of_designated_appellation AS INTEGER) > 2010'
for i in cursor.execute(select_query):
    print(i)

In [None]:
# Close the database connection.
connection.close()