In [None]:
!pip show SPARQLWrapper

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Client for the AGROVOC SPARQL endpoint; the following cells set its query and return format.
agrovocEndpointURL = "https://agrovoc.fao.org/sparql"
sparqlEndpoint = SPARQLWrapper(agrovocEndpointURL)

In [None]:
# Select 20 subjects that have an English skos:prefLabel and a dcterms:created value.
# A RAND() value is bound to ?sortKey and used as the sort order before LIMIT is applied,
# so the 20 rows returned are not expected to be the same on every run.
conceptSampleQuery = '''
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX agrovoc: <http://aims.fao.org/aos/agrovoc/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX dcterms: <http://purl.org/dc/terms/>

    SELECT ?subj ?obj ?dateCreated WHERE {
      ?subj skos:prefLabel ?obj ;
            dcterms:created ?created .
      FILTER(lang(?obj) = "en")
      BIND(STR(?created) AS ?dateCreated)
      BIND(RAND() AS ?sortKey)
} ORDER BY ?sortKey
LIMIT 20
'''
sparqlEndpoint.setQuery(conceptSampleQuery)

In [None]:
# Request JSON results so the response can be converted and indexed like a dict below.
sparqlEndpoint.setReturnFormat(JSON)

In [None]:
# Send the query to the endpoint, then decode the response using the configured return format.
queryResponse = sparqlEndpoint.query()
queryOutput = queryResponse.convert()

In [None]:
# Accumulator for (concept URI, label, creation year) tuples built from the query output.
resultList = []

In [None]:
# Convert every SPARQL binding into a (concept URI, label, creation year) tuple.
#
# The bindings list is walked exactly once. Looping over the keys of queryOutput["results"]
# first would re-process all bindings once per key, duplicating every tuple whenever the
# endpoint returns more keys than just "bindings".
# resultList is rebuilt here so that re-running this cell does not append duplicates.
resultList = []
for binding in queryOutput["results"]["bindings"]:
    conceptURI = binding["subj"]["value"]
    conceptLabel = binding["obj"]["value"].lower()
    # Drop a trailing parenthesised qualifier ("foo (bar)" -> "foo"). Splitting at "(" and
    # stripping whitespace does not depend on exactly one space preceding the parenthesis;
    # a label that starts with "(" is kept unchanged rather than being emptied.
    labelHead = conceptLabel.partition("(")[0].rstrip()
    if labelHead:
        conceptLabel = labelHead
    # The first four characters of the creation timestamp string are taken as the year.
    conceptCreationDate = binding["dateCreated"]["value"][:4]
    resultTuple = (conceptURI, conceptLabel, conceptCreationDate)
    print(conceptURI,"-", conceptLabel, "-", conceptCreationDate)
    resultList.append(resultTuple)

In [None]:
# Inspect the collected (URI, label, year) tuples.
resultList

In [None]:
# Number of tuples collected.
len(resultList)

In [None]:
# Address of the AGRIS search page; the query string built below is appended to it.
baseURL = "https://agris.fao.org/agris-search/biblio.do?"

In [None]:
# Term to search for in AGRIS.
string = "tomatoes"

In [None]:
# Join the whitespace-separated tokens of the term with "+" for use in the query string.
# str.split() without arguments also discards leading, trailing and repeated whitespace,
# so a single-word term is normalised the same way as a multi-word one.
searchValue = "+".join(string.split())

In [None]:
# Check the term as it will be inserted into the query string.
searchValue

In [None]:
# Query string for the AGRIS search request. Only advQuery varies: it carries the search
# term wrapped in double quotes; every other parameter is a fixed value.
searchString = f'agrovocString=&agrovocToAdd=&agrovocToRemove=&advQuery="{searchValue}"&centerString=&centerToRemove=&onlyFullText=false&filterString=&filterToRemove=&typeString=&typeToRemove=&filterQuery=&operator=Required&field=0&typeresultsField=Publications&fromDate=0&toDate=0&country=0&lang=0&typeToAdd=0&enableField=Disable&aggregatorField=Disable'

In [None]:
# Inspect the assembled query string.
searchString

In [None]:
# Full search address: the base URL immediately followed by the query string.
searchURL = f"{baseURL}{searchString}"

In [None]:
# Fetch the first result page. requests applies no timeout by default, so an explicit one
# keeps the cell from hanging forever when the server does not answer.
response = requests.get(searchURL, timeout=60)

In [None]:
# HTTP status code of the search request.
response.status_code

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the result page and locate the <div> from which the next cell reads the result count.
soup = BeautifulSoup(response.content, "html.parser")
numOfResultsRecord = soup.find("div", attrs={"class": "pull-left grey-scale-1 last"})

In [None]:
# The last whitespace-separated token of the <strong> text holds the result count. It may
# contain thousands separators ("1,234"); removing commas is a no-op when there are none.
resultCountText = numOfResultsRecord.find("p").find("strong").text.split()[-1]
numOfResults = int(resultCountText.replace(",", ""))

In [None]:
# Result count parsed from the page.
numOfResults

In [None]:
# Number of result pages to request at ten results per page, i.e. ceil(numOfResults / 10)
# computed with integer arithmetic: 0 -> 0, 1..10 -> 1, 11..20 -> 2, 21..30 -> 3, ...
# (11..19 results need a second page; treating them as a single page loses the remainder.)
numOfIterations = (numOfResults + 9) // 10

In [None]:
# Number of result pages that will be requested.
numOfIterations

In [None]:
# Output file for the result links; opened (and truncated) for writing here and closed
# explicitly by a later cell once all links have been written.
file = open ("search_results.txt", "w")

In [None]:
def _writeResultLinks(pageURL, outFile, maxLinks=None):
    """Fetch one search-result page and write the link of each result to outFile.

    pageURL  -- complete URL of the result page, including the startIndexSearch parameter.
    outFile  -- open, writable text file; one href is written per line.
    maxLinks -- consider at most this many result containers; None means all on the page.
    """
    # Explicit timeout: requests would otherwise wait indefinitely for an unresponsive server.
    pageResponse = requests.get(pageURL, timeout=60)
    pageSoup = BeautifulSoup(pageResponse.content, "html.parser")
    resultDivs = pageSoup.find_all("div", class_="col-md-10 col-sm-10 col-xs-12 inner")
    if maxLinks is not None:
        resultDivs = resultDivs[:maxLinks]
    for resultDiv in resultDivs:
        anchor = resultDiv.find("a")
        outFile.write(anchor["href"] + "\n")


# Every page URL is the search URL plus a start index; the first page uses an empty index.
pageURLPrefix = searchURL + "&" + "startIndexSearch="
if numOfResults >= 10:
    _writeResultLinks(pageURLPrefix, file)
    # Remaining pages start at 10, 20, ... up to (numOfIterations - 1) * 10.
    for pageNumber in range(1, numOfIterations):
        _writeResultLinks(pageURLPrefix + str(pageNumber * 10), file)
else:
    # Fewer than ten results: keep at most numOfResults links from the single page.
    # (The limit must be numOfResults; no other "results to keep" value is defined.)
    _writeResultLinks(pageURLPrefix, file, maxLinks=numOfResults)

In [None]:
# Close the link file so everything written is flushed before it is re-opened for reading.
file.close()

In [None]:
# Will map each time-window label to the number of publication dates counted for it.
temporalDistributionDict = {}

In [None]:
# Inclusive publication-year windows. Years before 1975 or after 2022 fall into no window
# and are not counted.
timeWindows = [
    (1975, 1979), (1980, 1984), (1985, 1989), (1990, 1994), (1995, 1999),
    (2000, 2004), (2005, 2009), (2010, 2014), (2015, 2019), (2020, 2022),
]
windowCounts = [0] * len(timeWindows)

with open("search_results.txt", "r") as file:
    for line in file:
        recordPath = line.rstrip()
        # Lines read from a file keep their newline, so a length check never filters a
        # blank line; test the stripped text to avoid requesting the bare base URL.
        if not recordPath:
            continue
        url = "https://agris.fao.org/agris-search/" + recordPath
        # Explicit timeout: requests would otherwise wait indefinitely.
        page = requests.get(url, timeout=60)
        soup = BeautifulSoup(page.content, "html.parser")
        metaElements = soup.find_all("meta", attrs = {"name": "citation_publication_date"})
        for metaElement in metaElements:
            if metaElement["content"] == "":
                continue
            # Parse the year once; the content is expected to be a plain integer year.
            publicationYear = int(metaElement["content"])
            for windowIndex, (firstYear, lastYear) in enumerate(timeWindows):
                if firstYear <= publicationYear <= lastYear:
                    windowCounts[windowIndex] += 1
                    break

# Expose the totals under the per-window names that the following cells read.
(
    counter_1975_to_1979,
    counter_1980_to_1984,
    counter_1985_to_1989,
    counter_1990_to_1994,
    counter_1995_to_1999,
    counter_2000_to_2004,
    counter_2005_to_2009,
    counter_2010_to_2014,
    counter_2015_to_2019,
    counter_2020_to_2022,
) = windowCounts

In [None]:
# Redundant: the file was already closed when the `with` block in the previous cell ended.
# Calling close() on an already-closed file has no effect.
file.close()

In [None]:
# Store the per-window totals under their display labels, oldest window first.
temporalDistributionDict.update({
    "1975 to 1979": counter_1975_to_1979,
    "1980 to 1984": counter_1980_to_1984,
    "1985 to 1989": counter_1985_to_1989,
    "1990 to 1994": counter_1990_to_1994,
    "1995 to 1999": counter_1995_to_1999,
    "2000 to 2004": counter_2000_to_2004,
    "2005 to 2009": counter_2005_to_2009,
    "2010 to 2014": counter_2010_to_2014,
    "2015 to 2019": counter_2015_to_2019,
    "2020 to 2022": counter_2020_to_2022,
})

In [None]:
# Inspect the counts per time window.
temporalDistributionDict

In [None]:
# One row per (label, count) pair, giving a two-column frame with default column names.
windowRows = list(temporalDistributionDict.items())
df = pd.DataFrame(windowRows)

In [None]:
# Name the two columns; "time window" is used as the x axis of the plot below.
df.columns = ["time window", "term occurrence frequency"]

In [None]:
# Show the table without its index column. Styler.hide_index() was deprecated in pandas 1.4
# and removed in pandas 2.0 in favour of Styler.hide(axis="index"); use whichever method
# the installed pandas provides so the cell runs on both old and new versions.
tableStyler = df.style
tableStyler.hide(axis="index") if hasattr(tableStyler, "hide") else tableStyler.hide_index()

In [None]:
# Bar chart of the term frequency per time window.
plotOptions = dict(
    kind = "bar",
    x = "time window",
    title = f"Appearance of the term {searchValue} in literature",
    fontsize = 12,
    figsize = (10, 8),
    color = ['#A0E8AF'],
)
barplot = df.plot(**plotOptions)
plt.show()

In [None]:
# Get the Figure that the bar plot was drawn on and save it as image.jpg
# (relative path, so it is written to the current working directory).
fig = barplot.get_figure()
fig.savefig("image.jpg")