In [225]:
import os
import json
import time
import math
import requests
import re

import pandas as pd
from serpapi.google_scholar_search_results import GoogleScholarSearchResults
from serpapi.google_search_results import GoogleSearchResults

# SerpApi credentials are taken from the environment; indexing os.environ
# raises KeyError when SERP_API_KEY is not set.
SERP_API_KEY = os.environ['SERP_API_KEY']
# The key is stored as a class attribute on both client classes, so every
# client instance created by the helper functions below shares it.
GoogleScholarSearchResults.SERP_API_KEY = SERP_API_KEY
GoogleSearchResults.SERP_API_KEY = SERP_API_KEY
# intermediate csvs
# SPECIES_FILE is not referenced by any cell visible in this notebook.
SPECIES_FILE = "data/common_names.csv"
# Read by the "filter down dupes" cell and by the insects/common-name join cell.
MAIN_NAMES_FILE = "data/main_common_names.csv"
# Written by the cell that filters the raw summary down to class INSECTA.
INSECTS_FILE = "data/insects.csv"
# Input of the INSECTA filter cell.
RAW_FILE = "redlist_data/simple_summary.csv"
# Written by the "filter down dupes" cell (unique scientificName/name pairs).
INPUT_FILE = "data/cleaned_common_names.csv"
# Read by the IUCN search/scholar scraping cell and the common-name filter cell.
INSECTS_WITH_COMMON_NAME_FILE = "data/insects_with_common_name.csv"
# Per-insect search/scholar counts; appended to by the IUCN scraping cell and
# read back by the IUCN Phrasefinder cell.
OUTPUT_FILE = "data/insect_search_and_scholar_counts.csv"
# Serialized (JSON text) API responses for the IUCN insects.
SEARCH_RESULT_DUMP = "data/insect_search_result_dump.csv"
# Input of the family-level search/scholar scraping cell.
TWITTER_RESULTS = "data/twitter_results.csv"
# Family-level counts; written by the family scraping cell and read by the
# family Phrasefinder cell.
TWITTER_RESULTS_WITH_GOOGLE = "data/family_with_twitter_and_google_results.csv"

# result csvs
# Output of the IUCN Phrasefinder cell.
IUCN_INSECTS_WITH_BOOKS = "data/insect_search_and_scholar_counts_and_books.csv"
# Output of the family Phrasefinder cell.
TWITTER_RESULTS_WITH_BOOKS = "data/family_with_twitter_google_search_and_books.csv"
# Serialized (JSON text) API responses for the families; written by the family
# scraping cell and deleted again by the clean-up cell.
FAMILY_GOOGLE_DUMP = "data/family_google_dump.csv"

In [None]:
# Make API calls to SERP API for Google Search and Google Scholar

def get_num_scholar_results(name):
    """Return the Google Scholar hit count reported for ``name`` as an exact phrase.

    The name is wrapped in double quotes and sent through the
    ``GoogleScholarSearchResults`` client with ``num`` set to 20.

    Returns:
        The ``search_information.total_results`` value of the response, or 0
        when either key is missing or the value is ``None``.
    """
    query = {"q": '"{}"'.format(name), "num": 20}
    response = GoogleScholarSearchResults(query).get_json()
    try:
        hit_count = response['search_information']['total_results']
    except KeyError:
        # Responses without a total are treated as "no results".
        hit_count = None
    return 0 if hit_count is None else hit_count

def get_num_cited_in_first_20_results(name):
    """Sum the "cited by" totals over the Scholar results returned for ``name``.

    The name is searched as an exact (double-quoted) phrase with ``num`` set
    to 20. Results lacking ``inline_links``/``cited_by`` are skipped, and a
    missing or falsy ``total`` counts as 0.

    Returns:
        The summed citation count, or 0 when the response has no
        ``organic_results`` entry.
    """
    response = GoogleScholarSearchResults({"q": '"{}"'.format(name), "num": 20}).get_json()
    try:
        hits = response['organic_results']
    except KeyError:
        return 0

    total_citations = 0
    for hit in hits:
        try:
            total_citations += hit['inline_links']['cited_by'].get('total') or 0
        except KeyError:
            # This result carries no citation information.
            pass
        except Exception as err:
            # Anything else is unexpected: report it and move on to the next result.
            print('error adding scholar cited by')
            print(err)
    return total_citations

def get_num_google_search_results(name):
    """Return the Google Search hit count reported for ``name`` as an exact phrase.

    A NaN ``name`` is handed back unchanged without calling the API, so a
    missing name yields a missing count.

    Returns:
        ``name`` itself when it is NaN; otherwise the response's
        ``search_information.total_results`` value, or 0 when either key is
        missing or the value is ``None``.
    """
    is_missing = name != name  # NaN is the only value that differs from itself
    if is_missing:
        return name

    payload = GoogleSearchResults({"q": '"{}"'.format(name), "num": 20}).get_json()
    try:
        total = payload['search_information']['total_results']
    except KeyError:
        return 0
    else:
        return total if total is not None else 0

# Raw responses are kept as JSON text so they can be written to disk; a later
# re-analysis can then work from the saved copy instead of paying for more
# API usage.
def get_serialized_search_results(name):
    """Fetch the Google Search response for ``name`` and return it as JSON text.

    The name is searched as an exact (double-quoted) phrase. A NaN ``name``
    yields an empty string without calling the API. A failing call is retried
    up to five times with a 90 second pause between attempts; only the first
    failure is printed, and the exception of the final attempt is re-raised.
    """
    if name != name:
        return ""

    max_retries = 5
    for attempt in range(max_retries + 1):
        try:
            response = GoogleSearchResults({"q": '"{}"'.format(name), "num": 20}).get_json()
            return json.dumps(response)
        except Exception as e:
            if attempt == 0:
                print('error getting serialized search results for {}, retrying'.format(name))
                print(e)
            if attempt == max_retries:
                raise
            time.sleep(90)
            
def get_serialized_scholar_results(name):
    """Fetch the Google Scholar response for ``name`` and return it as JSON text.

    The name is searched as an exact (double-quoted) phrase with ``num`` set
    to 20.

    Args:
        name: Phrase to search for. NaN marks a missing name.

    Returns:
        The response serialized with ``json.dumps``, or ``""`` when ``name``
        is NaN (same convention as ``get_serialized_search_results``).

    Raises:
        Exception: whatever the final attempt raised, once the initial call
            and five retries have all failed.
    """
    # NaN is the only value that differs from itself. Without this guard a
    # missing name would be formatted into the literal phrase "nan" and sent
    # to the API as a real query.
    if name != name:
        return ""
    retries = 0
    while True:
        try:
            search_data = GoogleScholarSearchResults({"q": '"{}"'.format(name), "num": 20}).get_json()
            serialized = json.dumps(search_data)
            return serialized
        except Exception as e:
            # Only the first failure is reported, to keep retry noise down.
            if retries == 0:
                print('error getting serialized scholar search results for {}, retrying'.format(name))
                print(e)
            # Give up after the initial attempt plus five retries.
            if retries == 5:
                raise
            time.sleep(90)
            retries += 1

In [None]:
# Make API calls to Phrasefinder to get absolute frequency and number of books for a given list of phrases
# The Phrasefinder key is taken from the environment; indexing os.environ
# raises KeyError when PF_API_KEY is not set.
PF_API_KEY = os.environ['PF_API_KEY']

# URL that get_phrase_frequencies POSTs each batch of queries to.
BATCH_ENDPOINT = "https://api.phrasefinder.io/batch"
# Headers sent with every batch request; they carry the API key.
HEADERS = {
    'X-API-Key': PF_API_KEY
}
# Sent as the 'topk' request parameter by get_phrase_frequencies.
MAX_PHRASES = 5
# Placeholder query that get_phrase_frequencies substitutes for NaN names.
# NOTE(review): presumably chosen because it never occurs in the corpus, so
# missing names come back with zero counts - confirm.
GARBAGE_PHRASE = 'arstarstarstarst'


def get_phrase_frequencies(names):
    """Look up book frequencies for a batch of phrases via Phrasefinder.

    One POST is sent to ``BATCH_ENDPOINT`` with one query per name, using the
    ``eng-us`` corpus and ``MAX_PHRASES`` as ``topk``. NaN names are replaced
    by ``GARBAGE_PHRASE`` so the batch keeps one query per input.

    The response text is read as a sequence of per-query sections. Each
    section starts with a status line whose first token is ``OK``, followed by
    the number of tab-separated result lines in that section. For each result
    line the second field is added to the absolute frequency and the third
    field to the number of books.

    Args:
        names: Iterable of phrases; NaN entries are allowed.

    Returns:
        A list with one ``{'absolute_frequency': int, 'num_books': int}`` dict
        per status line in the response. A section whose status is not ``OK``
        contributes zero counts, so later sections stay aligned with their
        queries. Empty input or an empty response gives ``[]``.

    Raises:
        requests.HTTPError: if the service answers with an HTTP error status.
    """
    queries = [{'query': name if name == name else GARBAGE_PHRASE} for name in names]
    if not queries:
        # Nothing to look up, so skip the network round trip.
        return []
    top_level_params = {
        'corpus': 'eng-us',
        'topk': MAX_PHRASES,
        'batch': queries
    }
    response = requests.post(BATCH_ENDPOINT, headers=HEADERS, json=top_level_params)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    # Drop blank and whitespace-only rows (e.g. a trailing newline or a lone "\r").
    rows = [row for row in response.text.split("\n") if row.strip()]

    phrase_frequencies = []
    i = 0
    while i < len(rows):
        status_parts = rows[i].split(None, 1)
        status = status_parts[0]
        i += 1
        if status != "OK":
            # The rest of a non-OK status line is not a line count, so the
            # section is taken to have no result lines.
            print('error getting book freq')
            phrase_frequencies.append({'absolute_frequency': 0, 'num_books': 0})
            continue
        num_lines = int(status_parts[1])
        absolute_frequency_of_phrase = 0
        num_books_containing_phrase = 0
        for j in range(i, i + num_lines):
            fields = rows[j].split("\t")
            absolute_frequency_of_phrase += int(fields[1])
            num_books_containing_phrase += int(fields[2])
        phrase_frequencies.append({
            'absolute_frequency': absolute_frequency_of_phrase,
            'num_books': num_books_containing_phrase})
        i += num_lines
    return phrase_frequencies

In [None]:
# Get phrasefinder results for families
# Reads the family-level search/scholar table in chunks of 100 rows, looks up
# each family name with get_phrase_frequencies and appends the enriched chunk
# to TWITTER_RESULTS_WITH_BOOKS. Only the first chunk writes a header row.
for iteration, family_chunk in enumerate(pd.read_csv(TWITTER_RESULTS_WITH_GOOGLE, chunksize=100)):
    print("On iteration ", iteration)
    family_frequencies = get_phrase_frequencies(family_chunk.family)
    enriched_chunk = family_chunk.assign(
        absolute_frequency_of_family_in_books=[entry['absolute_frequency'] for entry in family_frequencies],
        num_books_containing_family=[entry['num_books'] for entry in family_frequencies],
    )
    # mode="a": the output file is appended to, not overwritten.
    enriched_chunk.to_csv(TWITTER_RESULTS_WITH_BOOKS, header=(iteration == 0), mode="a")

In [None]:
# Get google search and scholar results for families
# Processes TWITTER_RESULTS in chunks of 10 rows. For every chunk:
#   1. the raw search and scholar responses for each family name are saved as
#      JSON text in FAMILY_GOOGLE_DUMP, so the analysis can be redone from disk;
#   2. the aggregate counts are added as columns and the chunk is appended to
#      TWITTER_RESULTS_WITH_GOOGLE.
# Both outputs are opened in append mode and only the first chunk writes a
# header row.
iteration = 0
header = True
for chunk in pd.read_csv(TWITTER_RESULTS, chunksize=10):
    print("On iteration ", iteration)
    dump = chunk[['family']].copy()
    dump['serialized_family_name_search_results'] = dump['family'].map(get_serialized_search_results)
    dump['serialized_family_name_scholar_results'] = dump['family'].map(get_serialized_scholar_results)
    dump.to_csv(FAMILY_GOOGLE_DUMP, header=header, mode="a")

    chunk['num_family_name_google_scholar_results'] = chunk['family'].map(get_num_scholar_results)
    chunk['num_family_name_cited_by_in_first_20_results'] = chunk['family'].map(get_num_cited_in_first_20_results)
    # Computed exactly once: every .map() call here issues one paid API
    # request per family, so repeating the assignment would double the cost.
    chunk['num_family_name_google_search_results'] = chunk['family'].map(get_num_google_search_results)
    chunk.to_csv(TWITTER_RESULTS_WITH_GOOGLE, header=header, mode="a")
    header = False
    iteration += 1

In [None]:
# Check on output of serp for families
# Load both family-level outputs, then decode the Scholar response stored in
# the first row of the dump so it can be inspected as a Python object.
df = pd.read_csv(TWITTER_RESULTS_WITH_GOOGLE)
df
dump = pd.read_csv(FAMILY_GOOGLE_DUMP)
first_dump_row = dump.iloc[0]
result = json.loads(first_dump_row['serialized_family_name_scholar_results'])
result


In [None]:
# Clean up output of google scraping
# Deletes both family-level scraping outputs; os.remove raises
# FileNotFoundError when a file is already gone.
for scraped_file in (FAMILY_GOOGLE_DUMP, TWITTER_RESULTS_WITH_GOOGLE):
    os.remove(scraped_file)

In [None]:
# Get phrasefinder results for IUCN insects
# Reads the per-insect search/scholar table in chunks of 100 rows, looks up
# the scientific and the common name of each row with get_phrase_frequencies
# and appends the enriched chunk to IUCN_INSECTS_WITH_BOOKS. Only the first
# chunk writes a header row.
for iteration, insect_chunk in enumerate(pd.read_csv(OUTPUT_FILE, chunksize=100)):
    print("On iteration ", iteration)
    scientific_frequencies = get_phrase_frequencies(insect_chunk.scientificName)
    common_frequencies = get_phrase_frequencies(insect_chunk.commonName)
    enriched_chunk = insect_chunk.assign(
        absolute_frequency_of_scientific_name_in_books=[entry['absolute_frequency'] for entry in scientific_frequencies],
        num_books_containing_scientific_name=[entry['num_books'] for entry in scientific_frequencies],
        absolute_frequency_of_common_name_in_books=[entry['absolute_frequency'] for entry in common_frequencies],
        num_books_containing_common_name=[entry['num_books'] for entry in common_frequencies],
    )
    # mode="a": the output file is appended to, not overwritten.
    enriched_chunk.to_csv(IUCN_INSECTS_WITH_BOOKS, header=(iteration == 0), mode="a")

In [None]:
# Inspect phrasefinder results for IUCN insects
# Keep the rows whose common / scientific name occurs in at least one book,
# then show how many insects have a common name that does.
df = pd.read_csv(IUCN_INSECTS_WITH_BOOKS)
with_common_name = df.loc[df['absolute_frequency_of_common_name_in_books'].gt(0)]
with_scientific_name = df.loc[df['absolute_frequency_of_scientific_name_in_books'].gt(0)]
len(with_common_name)

In [None]:
# Get google search and scholar results for IUCN insects
# Processes INSECTS_WITH_COMMON_NAME_FILE in chunks of 10 rows. For every chunk:
#   1. the raw search/scholar responses are saved as JSON text in
#      SEARCH_RESULT_DUMP, so aggregate results can be recomputed from disk;
#   2. the aggregate counts are added as columns and the chunk is appended to
#      OUTPUT_FILE.
# Both outputs are opened in append mode. A header row is written only when
# the target file does not exist yet or is empty: a fresh run then produces a
# CSV that later cells can read by column name, while a run that appends to
# existing output does not inject a second header.
iteration = 0
dump_header = not (os.path.isfile(SEARCH_RESULT_DUMP) and os.path.getsize(SEARCH_RESULT_DUMP) > 0)
header = not (os.path.isfile(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 0)
for chunk in pd.read_csv(INSECTS_WITH_COMMON_NAME_FILE, chunksize=10):
    print("On iteration ", iteration)
    dump = chunk[['scientificName', 'commonName']].copy()
    dump['serialized_common_name_search_results'] = dump['commonName'].map(get_serialized_search_results)
    dump['serialized_scientific_name_search_results'] = dump['scientificName'].map(get_serialized_search_results)
    dump['serialized_scholar_results'] = dump['scientificName'].map(get_serialized_scholar_results)
    dump.to_csv(SEARCH_RESULT_DUMP, header=dump_header, mode="a")
    dump_header = False

    chunk['num_google_scholar_results'] = chunk['scientificName'].map(get_num_scholar_results)
    chunk['num_cited_by_in_first_20_results'] = chunk['scientificName'].map(get_num_cited_in_first_20_results)
    chunk['num_common_name_google_search_results'] = chunk['commonName'].map(get_num_google_search_results)
    chunk['num_scientific_name_google_search_results'] = chunk['scientificName'].map(get_num_google_search_results)
    chunk.to_csv(OUTPUT_FILE, header=header, mode="a")
    header = False
    iteration += 1

In [None]:
# Filter down dupes from common_names file
df = pd.read_csv(MAIN_NAMES_FILE)
# Rows flagged by the `main` column; kept for inspection, not used below.
main = df[df.main]
# One row per distinct (scientificName, name) pair: group on both columns and
# keep only the group keys.
grouped = df.groupby(['scientificName', 'name']).size().reset_index()[['scientificName', 'name']]
grouped.to_csv(INPUT_FILE)

# Sanity check: total number of rows vs. number of distinct scientific names.
names = df.scientificName
unique_names = names.unique()
print(len(names), len(unique_names))

In [None]:
# Filter down simple summary of all species to just insects
# Keeps the rows whose className is exactly "INSECTA" and saves them; the
# `insects` frame is also used by the join cell further down.
df = pd.read_csv(RAW_FILE)
is_insect = df["className"] == "INSECTA"
insects = df[is_insect]
insects.to_csv(INSECTS_FILE)

In [None]:
# Join insects with common names and insert column to describe common name
# Left-joins the `insects` frame (built by the INSECTA filter cell) with the
# main common names on internalTaxonId, so insects without a common name are
# kept with a missing commonName.
main_common_names = pd.read_csv(MAIN_NAMES_FILE)
merged = pd.merge(
    insects,
    main_common_names,
    on='internalTaxonId',
    how='left',
    suffixes=('_insects_data', '_common_names_data'),
)
# Columns present in both inputs get a suffix; drop the copies coming from the
# common-names file as well as the "Unnamed" columns left over from CSV
# index round-trips.
from_common_names = merged.columns.str.contains('_common_names_data')
leftover_index = merged.columns.str.contains('^Unnamed')
insects_with_common_name = (
    merged.loc[:, ~(from_common_names | leftover_index)]
    .drop(['main'], axis=1)
    .rename({'scientificName_insects_data': 'scientificName', 'name': 'commonName'}, axis=1)
)
# Write to the configured path: this is the file the scraping cell and the
# common-name filter cell read back.
insects_with_common_name.to_csv(INSECTS_WITH_COMMON_NAME_FILE)

In [None]:
# Filter down to insects with common names
# Prints the number of insects overall next to the number that have a
# non-missing commonName.
df = pd.read_csv(INSECTS_WITH_COMMON_NAME_FILE)
has_common_name = df["commonName"].notnull()
with_cn = df[has_common_name]
print(len(df), len(with_cn))