In [3]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import pprint
import pandas as pd
import os
import string
import re
import spacy
import pycountry
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from geopy.geocoders import Nominatim


In [None]:
# One-time setup: download the spaCy "en_core_web_md" model via a shell command.
# This only needs to be re-run when the model is not installed in the kernel's environment.
!python -m spacy download en_core_web_md


In [2]:
# Load spaCy's pre-trained "en_core_web_md" pipeline once; extract_city_country()
# reads this module-level `nlp` object to run named-entity recognition.
nlp = spacy.load("en_core_web_md")

In [4]:
# Base name (without ".csv") of the city table; the scraping cell builds the
# path as f"../CSVs/{CITY_FILE_NAME}.csv".
CITY_FILE_NAME = 'city'
# Base name of the co-city link file — presumably combined with the same
# "../CSVs/" prefix and ".csv" suffix; TODO confirm once it is actually used.
COCITY_FILE_NAME = 'cocity'
# Shared Nominatim geocoder client used by get_geolocation().
# NOTE(review): the user_agent is an opaque string — consider a descriptive,
# application-identifying value.
GEOLOCATOR = Nominatim(user_agent="isdjuiodfgdfjnjf847hn5")

In [6]:
def is_country(name):
    """Resolve a place name to a country name via pycountry's fuzzy search.

    Args:
        name: Text to look up.

    Returns:
        The ``name`` attribute of the first fuzzy-search hit, or ``False``
        when the lookup raises ``LookupError`` (which also covers an empty
        result list, because ``IndexError`` is a ``LookupError``).
    """
    try:
        candidates = pycountry.countries.search_fuzzy(name)
        best_hit = candidates[0]
        resolved = best_hit.name
    except LookupError:
        resolved = False
    return resolved

def extract_city_country(text):
    """Extract one city and one country from free text using spaCy NER.

    Entities labelled ``GPE`` are scanned from the END of the text backwards.
    The first one that ``is_country`` resolves becomes the country; the first
    one that is neither a country nor purely numeric becomes the city. Because
    of the reverse scan, both are the matches closest to the end of ``text``.

    Args:
        text: Raw text to analyse (the caller passes an affiliation address).

    Returns:
        A ``(city, country)`` tuple. ``city`` is the entity text as written;
        ``country`` is the name returned by ``is_country``. Either element is
        ``None`` when no suitable entity was found.
    """
    doc = nlp(text)
    city = None
    country = None
    for ent in reversed(doc.ents):  # Iterate in reverse
        if ent.label_ != "GPE":
            continue
        # One fuzzy lookup per entity: the result is reused for both the
        # "is a country" and "is not a country" decisions.
        resolved_country = is_country(ent.text)
        if resolved_country:
            if not country:
                country = resolved_country
        elif city is None and not ent.text.isdigit():
            # Purely numeric GPE hits are skipped as city candidates.
            city = ent.text
        if city is not None and country:
            # Both slots are filled and neither can change any more.
            break
    return city, country

# Utility functions
def find_similar_city(city, city_dict, threshold=90):
    """Return the key of ``city_dict`` that best fuzzy-matches ``city``.

    Args:
        city: City name to look up.
        city_dict: Mapping whose keys are the known city names.
        threshold: Minimum ``fuzz.token_sort_ratio`` score (inclusive) for the
            best candidate to count as a match.

    Returns:
        The best-matching key, or ``None`` when ``city_dict`` is empty or the
        best score is below ``threshold``.
    """
    if city_dict:
        best_name, best_score = process.extractOne(
            city, city_dict.keys(), scorer=fuzz.token_sort_ratio
        )
        if best_score >= threshold:
            return best_name
    # Either nothing to compare against or no candidate was close enough.
    return None


def initialize_city_file(city_file):
    """Load the city CSV, creating an empty one first if it does not exist.

    A missing file (and any missing parent directory) is created with only a
    header row, using the column set that ``update_city_data`` writes for new
    cities. Previously a missing file raised ``FileNotFoundError``.

    Args:
        city_file: Path to the city CSV.

    Returns:
        A ``(city_df, city_dict)`` tuple: the loaded DataFrame and a dict
        mapping each value of the ``city`` column to its ``city_id``. If a
        city name occurs more than once, the last row wins.
    """
    city_columns = [
        "city_id", "city", "country", "citation_sum", "p_count", "lat", "lon",
    ]
    if not os.path.exists(city_file):
        parent_dir = os.path.dirname(city_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Header-only CSV so the read below yields an empty, correctly shaped frame.
        pd.DataFrame(columns=city_columns).to_csv(city_file, index=False)

    city_df = pd.read_csv(city_file)
    # Column-wise zip avoids the per-row Series construction of iterrows().
    city_dict = dict(zip(city_df["city"], city_df["city_id"]))
    return city_df, city_dict


def get_geolocation(city, country):
    """Look up coordinates for a city through the shared ``GEOLOCATOR``.

    Args:
        city: City name.
        country: Country name; joined to ``city`` as ``"<city>, <country>"``.

    Returns:
        ``(latitude, longitude)`` of the geocoder result, or ``(None, None)``
        when the geocoder returns a falsy result.
    """
    query = f"{city}, {country}"
    location = GEOLOCATOR.geocode(query)
    if not location:
        return None, None
    return location.latitude, location.longitude

def write_cocity_links(cocity_file, city_country_set, city_dict, filename):
    """Append one CSV row per ordered pair of distinct cities.

    Each row is ``[city_id_a, city_id_b, filename]``. Every pair is written in
    both directions, and two entries sharing the same city name (even with
    different countries) are never linked to each other.

    Args:
        cocity_file: Path of the CSV to append to (opened in append mode).
        city_country_set: Iterable of ``(city, country)`` tuples.
        city_dict: Mapping from city name to city id.
        filename: Value written in the third column of every row.

    Raises:
        KeyError: If a city in ``city_country_set`` is missing from ``city_dict``.
    """
    with open(cocity_file, "a", newline="", encoding="utf-8") as out_file:
        link_writer = csv.writer(out_file)
        link_writer.writerows(
            [city_dict[source_city], city_dict[target_city], filename]
            for source_city, _source_country in city_country_set
            for target_city, _target_country in city_country_set
            if source_city != target_city  # Avoid linking same city
        )

def _increment_city_counters(city_df, city_id, citedby_count):
    """Add one paper and its citations to the row(s) matching ``city_id`` (in place)."""
    row_mask = city_df["city_id"] == city_id
    city_df.loc[row_mask, "citation_sum"] += int(citedby_count)
    city_df.loc[row_mask, "p_count"] += 1


def update_city_data(city_df, city_dict, city, country, citedby_count):
    """Record one paper for ``city`` and return the updated table and lookup.

    Resolution order:
      1. ``city`` is already a key of ``city_dict``: its row is incremented.
      2. ``find_similar_city`` returns a known name: that row is incremented
         and ``city`` is stored in ``city_dict`` as an alias for the same id,
         so later lookups by this spelling (e.g. ``city_dict[city]``) work.
      3. Otherwise a new row is appended, geocoded via ``get_geolocation``.

    Args:
        city_df: DataFrame with at least ``city_id``, ``citation_sum`` and
            ``p_count`` columns; new rows are aligned to its columns.
        city_dict: Mapping from city name to city id; mutated in place.
        city: City name for this paper.
        country: Country name, used for geocoding and stored on new rows.
        citedby_count: Citation count of the paper; converted with ``int()``.

    Returns:
        ``(city_df, city_dict)``. Existing rows are updated in place; adding
        a new city returns a new DataFrame object.
    """
    if city in city_dict:
        _increment_city_counters(city_df, city_dict[city], citedby_count)
        return city_df, city_dict

    similar_city = find_similar_city(city, city_dict)
    if similar_city:
        city_id = city_dict[similar_city]
        # Register the variant spelling so downstream id lookups do not fail.
        city_dict[city] = city_id
        _increment_city_counters(city_df, city_id, citedby_count)
        return city_df, city_dict

    # No similar city found, add as a new entry.
    # Aliases make len(city_dict) larger than the number of ids in use, so
    # the next id is derived from the largest id already assigned.
    city_id = int(max(city_dict.values(), default=0)) + 1
    # Geocode before touching city_dict: if the lookup raises, the dict and
    # the DataFrame are still consistent with each other.
    lat, lon = get_geolocation(city, country)
    city_dict[city] = city_id
    new_row = pd.DataFrame(
        [{
            "city_id": city_id,
            "city": city,
            "country": country,
            "citation_sum": int(citedby_count),
            "p_count": 1,
            "lat": lat,
            "lon": lon,
        }],
        columns=city_df.columns  # Ensure alignment with city_df structure
    )
    city_df = pd.concat([city_df, new_row], ignore_index=True)

    return city_df, city_dict


In [7]:
# Smoke test: run the extractor on a minimal "<city> <country>" string.
print(extract_city_country("Moscow Russia"))

('Moscow', 'Russian Federation')


In [8]:

# Crawl the Nature 2024 article listing, open every linked article, and
# collect the (city, country) pairs found in its author affiliation addresses.
num_pages = 1
num_papers = 10  # NOTE(review): not referenced anywhere in this cell
current_directory = os.getcwd()  # NOTE(review): not referenced anywhere in this cell

# Seconds to wait for nature.com before giving up; requests applies no
# timeout by default and would block indefinitely on a stalled connection.
request_timeout = 30

# Nothing inside the loops writes the city table, so load it once up front
# instead of re-reading the CSV for every article.
city_file = f"../CSVs/{CITY_FILE_NAME}.csv"
city_df, city_dict = initialize_city_file(city_file)

try:
    for page in range(1, num_pages + 1):
        current_page = str(page)
        url = f'https://www.nature.com/nature/articles?sort=PubDate&year=2024&page={current_page}'
        r = requests.get(
            url,
            headers={'Accept-Language': 'en-US,en;q=0.5'},
            timeout=request_timeout,
        )

        soup = BeautifulSoup(r.content, 'html.parser')

        for article in soup.find_all('article'):
            link_tag = article.find(
                'a', {'data-track-action': 'view article'})
            if link_tag is None or not link_tag.get('href'):
                # Subscripting a missing tag raises TypeError, which the
                # AttributeError handler below would not catch; skip instead.
                continue
            rel_link = link_tag['href']
            abs_link = 'https://nature.com' + rel_link
            r2 = requests.get(abs_link, timeout=request_timeout)
            soup_2 = BeautifulSoup(r2.content, 'html.parser')

            extracted_data = []  # placeholder: not populated yet
            affi_info = soup_2.select('.c-article-author-affiliation__address')
            city_country_set = set()
            for affi in affi_info:
                city, country = extract_city_country(affi.text)
                if city and country:
                    city_country_set.add((city, country))

            # Printed for every article, including empty sets, as a progress trace.
            print(city_country_set)

            if not city_country_set:
                # No usable affiliation for this article: nothing to record.
                continue

            # TODO: update city data
            # TODO: write cocity links
            #     (get a set of cities and countries)

    print("Saved all articles")

except AttributeError:
    print('Invalid entry!')

set()
set()
set()
{('New York City', 'United States')}
{('Boston', 'United States')}
{('Riyadh', 'Saudi Arabia'), ('Berkeley', 'United States'), ('Leipzig', 'Germany'), ('Chicago', 'United States')}
set()
{('Moscow', 'Russian Federation'), ('Bozeman', 'United States'), ('St. Petersburg', 'Russian Federation'), ('Piscataway', 'United States')}
{('Atlanta', 'United States'), ('Princeton', 'United States'), ('Chicago', 'United States')}
set()
{('Cape Town', 'South Africa')}
set()
set()
{('Munich', 'Germany'), ('Martinsried', 'Germany')}
{('East Lansing', 'United States')}
set()
set()
{('Epalinges', 'Switzerland')}
set()
set()
Saved all articles
