## First Part: Data Retrieval

This section proposes a method for retrieving the number of search results from Google Scholar. It is done using BeautifulSoup and web scraping. The method is not fully functional, though, because responses to the requests quickly start returning status code 429 (Too Many Requests).

#### Usage Instructions

1. Create a JSON file whose name matches the value of the variable *'software_names_filename'*. The variable has a default value, which can be changed. Add an array of software names to the file.
2. Similarly, create a JSON file whose name matches the value of the variable *'sim_names_filename'*. Add an array of simulators' names to the file.
3. Similarly, create a JSON file whose name matches the value of the variable *'data_filename'*. Add a structure similar to the one given in "flight_software_sims_data_template.json".

Note: Clear the fields given in "flight_software_sims_data_template.json".

In [None]:
# https://scholar.google.com/scholar?q=PX4&hl=en&as_sdt=0%2C5&as_ylo=2020&as_yhi=2021
from bs4 import BeautifulSoup
import requests
import re
import time
import json
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))
from src.utils import valid_years_range, valid_year
from dotenv import load_dotenv
from datetime import datetime
import serpapi
load_dotenv(override=True)

True

In [2]:
from bs4 import BeautifulSoup
import requests
import re
import time
import json
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))
from src.utils import now_timestamp
from dotenv import load_dotenv
from datetime import datetime

In [3]:
# Read the notebook's settings from the environment after loading the .env file.
load_dotenv(override=True)
# The flag is enabled only by the exact lowercase string "true".
from_semantic_scholar = os.environ.get("FROM_SEMANTIC_SCHOLAR") == "true"
output_folderpath = os.environ.get("OUTPUT_JSON_PATH")
sim_names_filepath = os.environ.get("SIM_LIST_PATH")
software_names_filepath = os.environ.get("SOFT_LIST_PATH")

In [11]:
def get_results_amount_from_google_scholar(query: str, year: int = None) -> int | None:
    """Return the number of Google Scholar results for *query*, fetched via SerpApi.

    Args:
        query: Search phrase. It is passed as a request parameter, so spaces
            and special characters are URL-encoded by ``requests``.
        year: Optional publication year. When given, both ``as_ylo`` and
            ``as_yhi`` are set to it, limiting the search to that year.

    Returns:
        The ``total_results`` value from the response's ``search_information``
        section, or ``None`` when ``valid_year`` rejects the year, the request
        fails, or the response carries no result count.
    """
    if year and not valid_year(year):
        return None

    endpoint = "https://serpapi.com/search.json"
    params = {
        "q": query,
        "engine": "google_scholar",
        "as_vis": 1,
        "num": 1,
        "api_key": os.getenv("SERPAPI_API_KEY"),
    }
    if year:
        params["as_ylo"] = year
        params["as_yhi"] = year

    try:
        response = requests.get(endpoint, params=params, timeout=30)
    except requests.RequestException as error:
        # Only the exception type is printed: the exception text can embed
        # the request URL, which contains the API key.
        print(f"Request to {endpoint} failed: {type(error).__name__}")
        print(f"Given request: '{query}'")
        return None

    if response.status_code != 200:
        # The endpoint is printed without its query string so that the API
        # key does not end up in the notebook output.
        print(f"Could not find url: {endpoint}, response: {response}, {response.text}")
        print(f"Given request: '{query}'")
        return None

    # Use .get() so a response without a result count yields None instead of
    # raising KeyError in the middle of a long retrieval run.
    return response.json().get("search_information", {}).get("total_results")

In [12]:
def get_results_amount_from_semantic_scholar(query: str, year: int = None) -> int | None:
    """Return the number of Semantic Scholar papers matching *query*.

    Args:
        query: Search phrase. It is passed as a request parameter, so spaces
            and special characters are URL-encoded by ``requests``.
        year: Optional publication year sent as the ``year`` parameter.

    Returns:
        The ``total`` field of the response (0 when the field is absent), or
        ``None`` when ``valid_year`` rejects the year or the request fails.
    """
    if year and not valid_year(year):
        return None

    # Fixed pause before every request; presumably meant to avoid the
    # 429 (Too Many Requests) responses — confirm against the API limits.
    time.sleep(2)

    params = {"query": query, "limit": 1}
    if year:
        params["year"] = year

    try:
        response = requests.get(
            "https://api.semanticscholar.org/graph/v1/paper/search",
            params=params,
            timeout=30,
        )
    except requests.RequestException as error:
        # A single network failure should not abort the whole retrieval run.
        print(f"Request to Semantic Scholar with query '{query}' failed: {error}")
        return None

    if response.status_code != 200:
        print(f"Request to Semantic Scholar with query '{query}' failed with status code {response.status_code}: {response.text}")
        return None

    total_results = response.json().get("total", 0)
    print(f"Total results for '{query} at Semantic Search': {total_results}")
    return total_results


In [13]:
def get_results_amount(query: str, year: int = None):
    """Return the result count for *query* from the configured source.

    Semantic Scholar is used when ``from_semantic_scholar`` is true,
    otherwise Google Scholar.

    Args:
        query: Search phrase.
        year: Optional publication year forwarded to the selected backend.

    Returns:
        Whatever the selected backend returns: a result count, or ``None``
        on failure.
    """
    if from_semantic_scholar:
        return get_results_amount_from_semantic_scholar(query, year)
    # Forward the year itself. The Google Scholar helper takes a single int;
    # a (year, year) tuple is always truthy, even when year is None.
    return get_results_amount_from_google_scholar(query, year)

In [14]:
def get_flight_software_names():
    """Load and return the JSON content of the file at ``software_names_filepath``."""
    with open(software_names_filepath) as names_file:
        loaded_names = json.load(names_file)
    print("Loaded flight software names successfully")
    return loaded_names
    
def get_simulators_names():
    """Load and return the JSON content of the file at ``sim_names_filepath``."""
    with open(sim_names_filepath) as names_file:
        loaded_names = json.load(names_file)
    print("Loaded simulators names successfully")
    return loaded_names

def generate_output_file() -> str:
    """Create an empty, timestamped output file and return its path.

    The file is placed under ``output_folderpath`` and named with the value
    returned by ``now_timestamp()``.
    """
    file_name = f"flight_soft_sims_data_{now_timestamp()}.json"
    target_path = f"{output_folderpath}/{file_name}"
    # Opening in 'w+' mode creates the file (or truncates an existing one).
    open(target_path, 'w+').close()
    return target_path

In [15]:
def add_popularity(name: str, is_software: bool, data: dict, year: int = None):
    """Fetch and store the overall and UAV-specific result counts for *name*.

    Args:
        name: Flight software or simulator name to search for.
        is_software: Selects the target buckets: ``software_popularities`` /
            ``uav_software_popularities`` when true, ``sim_popularities`` /
            ``uav_sim_popularities`` otherwise.
        data: Result dictionary that already contains those buckets; it is
            updated in place.
        year: Optional publication year forwarded to ``get_results_amount``.
    """
    entity = "software" if is_software else "sim"

    # Compare with None rather than truthiness: a count of 0 is a valid
    # result and must be recorded, not reported as a failure.
    overall_result = get_results_amount(name, year)
    overall_found = overall_result is not None
    if overall_found:
        data[f"{entity}_popularities"][name] = overall_result
        print("Added overall popularity of " + name)

    uav_result = get_results_amount(name + " uav", year)
    uav_found = uav_result is not None
    if uav_found:
        data[f"uav_{entity}_popularities"][name] = uav_result
        print("Added UAV popularity of " + name)

    if overall_found and uav_found:
        return

    # Report a failure of either lookup, not only of the UAV one.
    print("An error occured while adding popularity of " + name)
    if not overall_found:
        print(f"Overall result of {name} not found")
    if not uav_found:
        print(f"UAV result of {name} not found")

In [9]:
def add_software_and_sim_usage(sim: str, software: str, data: dict):
    """Fetch and store how often *software* and *sim* appear together.

    The query is ``"UAV simulation {software} {sim}"``; the count is stored at
    ``data["soft_sim_distributions"][software][sim]``.

    Args:
        sim: Simulator name.
        software: Flight software name.
        data: Result dictionary containing ``soft_sim_distributions``; it is
            updated in place.
    """
    res = get_results_amount(f"UAV simulation {software} {sim}")
    # A count of 0 is a valid answer; only None signals a failed lookup.
    if res is None:
        print(f"Could not add distibution of {software} with {sim}")
        return

    # setdefault keeps the function usable even when the caller has not
    # created the per-software bucket beforehand.
    data["soft_sim_distributions"].setdefault(software, {})[sim] = res
    print(f"Added distibution of {software} with {sim}")

In [16]:
def retrieve_data():
    """Collect popularity and software/simulator co-usage counts and save them.

    For every flight software name the overall and UAV popularity are
    fetched, then the co-usage count with every simulator. Simulator
    popularities are fetched as well. The result is written as JSON to a
    freshly generated output file, together with the name of the data source.
    """
    software = get_flight_software_names()
    sims = get_simulators_names()
    output_filepath = generate_output_file()
    data = {
        "software_popularities": {},
        "sim_popularities": {},
        "uav_software_popularities": {},
        "uav_sim_popularities": {},
        "soft_sim_distributions": {},
    }

    for soft in software:
        add_popularity(soft, is_software=True, data=data)
        data["soft_sim_distributions"][soft] = {}

        for sim in sims:
            # A simulator's popularity does not depend on the software, so it
            # is queried again only while one of its two counts is missing.
            # This avoids repeating the same requests for every software.
            sim_already_known = (
                sim in data["sim_popularities"]
                and sim in data["uav_sim_popularities"]
            )
            if not sim_already_known:
                add_popularity(sim, is_software=False, data=data)
            add_software_and_sim_usage(sim, soft, data)

    with open(output_filepath, "w") as f:
        total = {
            "from": "Semantic Scholar" if from_semantic_scholar else "Google Scholar",
            "data": data,
        }
        json.dump(total, f, indent=2)

    print(f"Saved received data to '{output_folderpath}'")
    print("End")

In [17]:
def retrieve_years_data(years_range: tuple[int, int] = None):
    """Collect per-year popularity counts and save them as JSON.

    Args:
        years_range: ``(start, end)`` pair of years. Nothing is retrieved when
            it is missing or rejected by ``valid_years_range``.

    For each year in the range, the overall and UAV popularity of every
    flight software and simulator is fetched. The results are keyed by year
    (as a string) and written to a freshly generated output file, together
    with the name of the data source.
    """
    # Validate before touching the file system, so that a missing or invalid
    # range neither crashes on indexing nor leaves an empty output file.
    if not years_range:
        print("No years range given, nothing to retrieve")
        return
    if not valid_years_range(years_range):
        return

    software = get_flight_software_names()
    sims = get_simulators_names()
    output_filepath = generate_output_file()

    result_data = {}

    # NOTE(review): range() excludes the end year, so (2020, 2022) covers
    # 2020 and 2021 only — confirm whether the end year should be included.
    for year in range(years_range[0], years_range[1]):
        data = {
            "software_popularities": {},
            "sim_popularities": {},
            "uav_software_popularities": {},
            "uav_sim_popularities": {},
            "soft_sim_distributions": {},
        }

        for soft in software:
            add_popularity(soft, is_software=True, data=data, year=year)

        for sim in sims:
            add_popularity(sim, is_software=False, data=data, year=year)

        result_data[f"{year}"] = data

    with open(output_filepath, "w") as f:
        total = {
            "from": "Semantic Scholar" if from_semantic_scholar else "Google Scholar",
            "data": result_data,
        }
        json.dump(total, f, indent=2)

    print(f"Saved received data to '{output_folderpath}'")
    print("End")

In [143]:
# Run the retrieval without a year filter; this sends live API requests and writes the output file.
retrieve_data()

Loaded flight software names successfully
Loaded simulators names successfully
Total results for 'PX4 at Semantic Search': 414
Added overall popularity of PX4
Total results for 'PX4 uav at Semantic Search': 115206
Added UAV popularity of PX4
Total results for 'Gazebo at Semantic Search': 3423
Added overall popularity of Gazebo
Total results for 'Gazebo uav at Semantic Search': 117937
Added UAV popularity of Gazebo
Total results for 'UAV simulation PX4 Gazebo at Semantic Search': 413
Added distibution of PX4 with Gazebo
Total results for 'Flightmare at Semantic Search': 5
Added overall popularity of Flightmare
Total results for 'Flightmare uav at Semantic Search': 114961
Added UAV popularity of Flightmare
Total results for 'UAV simulation PX4 Flightmare at Semantic Search': 94
Added distibution of PX4 with Flightmare
Total results for 'AirSim at Semantic Search': 387
Added overall popularity of AirSim
Total results for 'AirSim uav at Semantic Search': 115207
Added UAV popularity of AirS