In [2]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import pprint
import pandas as pd
import os
import string
import re
import spacy
import pycountry
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from geopy.geocoders import Nominatim


In [None]:
# Install the spaCy model into the environment of the running kernel, and only
# when it is missing, so "Restart & Run All" does not re-download it each time.
# (`!python ...` may resolve to a different interpreter than this kernel's.)
# If spacy.load() still cannot find the model right after a fresh install,
# restart the kernel once.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

In [3]:
# Shared spaCy pipeline (pre-trained medium English model). It is loaded once
# at module level; the entity-extraction helpers call this `nlp` object.
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

In [4]:
# Presumably the base names (without extension) of the city and co-city CSV files.
# NOTE(review): neither constant is referenced in the cells visible here — confirm they are still needed.
CITY_FILE_NAME = 'city'
COCITY_FILE_NAME = 'cocity_collab'
# Shared Nominatim geocoding client, used by get_geolocation().
# NOTE(review): Nominatim's usage policy asks for a user agent that identifies the
# application; this value looks like a random placeholder — consider a descriptive one.
GEOLOCATOR = Nominatim(user_agent="isdjuiodfgdfjnjf847hn5")

In [5]:
def is_country(name):
    """Resolve ``name`` to a canonical country name via pycountry.

    Returns:
        The ``name`` attribute of the first ``search_fuzzy`` hit, or ``False``
        when the lookup fails with ``LookupError`` (which also covers an empty
        result list, since ``IndexError`` is a ``LookupError``).
    """
    try:
        # Indexing stays inside the try so an empty hit list is reported as
        # "not a country" rather than raising.
        best_hit = pycountry.countries.search_fuzzy(name)[0]
    except LookupError:
        return False
    return best_hit.name

def extract_city_country(text):
    """Extract one city and one country from free text using spaCy GPE entities.

    Entities are scanned from the END of the text backwards, so the city and
    country returned are the last ones mentioned (affiliation strings usually
    end with "..., city, country").

    Args:
        text: Free text, e.g. an affiliation address.

    Returns:
        A ``(city, country)`` tuple. ``country`` is the canonical name returned
        by ``is_country`` for the last GPE entity that resolves to a country;
        ``city`` is the text of the last GPE entity that is neither a country
        nor purely numeric. Either element is ``None`` when nothing was found.
    """
    city = None
    country = None
    for ent in reversed(nlp(text).ents):
        if ent.label_ != "GPE":
            continue
        # The country lookup is a fuzzy search; do it once per entity.
        resolved = is_country(ent.text)
        if resolved:
            if not country:
                country = resolved
        elif city is None and not ent.text.isdigit():
            # Purely numeric GPE hits (e.g. postal codes) are not cities.
            city = ent.text
        if city is not None and country:
            # Both answers are fixed now; later entities cannot change them.
            break
    return city, country

# Utility functions
def find_similar_city(city, city_dict, threshold=90):
    """Return the key of ``city_dict`` that best fuzzy-matches ``city``.

    Candidates are scored with ``fuzz.token_sort_ratio``. The best candidate is
    returned only when its score is at least ``threshold``; otherwise, or when
    ``city_dict`` is empty, the result is ``None``.
    """
    if city_dict:
        best_name, score = process.extractOne(
            city, city_dict.keys(), scorer=fuzz.token_sort_ratio
        )
        if score >= threshold:
            return best_name
    return None


def initialize_city_file(city_file):
    """Load the city CSV, creating an empty one first if it does not exist.

    Args:
        city_file: Path of the city CSV. Anything else ``pandas.read_csv``
            accepts (URL, open buffer) is passed through unchanged.

    Returns:
        ``(city_df, city_dict)`` where ``city_df`` is the loaded DataFrame and
        ``city_dict`` maps each ``city`` value to its ``city_id`` (for a
        repeated city name the last row wins).
    """
    # Only local filesystem paths can be "initialized"; URLs and buffers are
    # handed straight to read_csv.
    is_local_path = isinstance(city_file, (str, os.PathLike)) and "://" not in str(city_file)
    if is_local_path and not os.path.exists(city_file):
        # Header matches the rows that update_city_data appends.
        columns = ["city_id", "city", "country", "citation_sum", "p_count", "lat", "lon"]
        parent_dir = os.path.dirname(city_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        pd.DataFrame(columns=columns).to_csv(city_file, index=False)

    city_df = pd.read_csv(city_file)
    city_dict = dict(zip(city_df["city"], city_df["city_id"]))
    return city_df, city_dict


def get_geolocation(city, country):
    """Fetch latitude and longitude for a city via the shared geocoder.

    Args:
        city: City name. When missing or empty, no request is made.
        country: Country name, or a falsy value when unknown; an unknown
            country is left out of the query instead of being sent as the
            literal text "None".

    Returns:
        ``(latitude, longitude)`` of the geocoder's result, or ``(None, None)``
        when there is no city to look up or the geocoder finds nothing.
    """
    if not city:
        return None, None
    query = f"{city}, {country}" if country else f"{city}"
    geo = GEOLOCATOR.geocode(query)
    if geo:
        return geo.latitude, geo.longitude
    return None, None

def write_cocity_links(cocity_file, city_country_set, city_dict, filename):
    """Append one CSV row per ordered pair of different cities.

    Args:
        cocity_file: Path of the CSV file, opened in append mode.
        city_country_set: Iterable of ``(city, country)`` pairs; only the city
            part is used.
        city_dict: Mapping from city name to city id. A city missing from it
            raises ``KeyError``.
        filename: Value written as the third column of every row.

    Each row is ``[id_of_first_city, id_of_second_city, filename]``. Both
    directions of a pair are written; pairs with equal city names are skipped.
    """
    with open(cocity_file, "a", newline="", encoding="utf-8") as out:
        # A generator keeps the writes incremental, row by row.
        csv.writer(out).writerows(
            [city_dict[first], city_dict[second], filename]
            for first, _ in city_country_set
            for second, _ in city_country_set
            if first != second
        )

def update_city_data(city_df, city_dict, city, country, citedby_count):
    """Record one paper for ``city`` and return the updated frame and mapping.

    The city is resolved by exact name first and by fuzzy match
    (``find_similar_city``) otherwise. A resolved city has its ``citation_sum``
    increased by ``citedby_count`` and its ``p_count`` by one; this mutates
    ``city_df`` in place. An unresolved city is geocoded and appended as a new
    row, in which case a NEW DataFrame is returned, so callers must always use
    the returned objects.

    Args:
        city_df: DataFrame with at least ``city_id``, ``citation_sum`` and
            ``p_count`` columns.
        city_dict: Mapping of city name -> city id; gains an entry when a new
            city is added.
        city: City name to record.
        country: Country of the city (used for geocoding and the new row).
        citedby_count: Citation count of the paper; anything ``int()`` accepts.

    Returns:
        ``(city_df, city_dict)``.

    Raises:
        ValueError/TypeError: if ``citedby_count`` cannot be converted to int
            (raised before anything is modified).
    """
    citations = int(citedby_count)

    # Exact name first; fall back to the closest fuzzy match, if any.
    if city in city_dict:
        matched_city = city
    else:
        matched_city = find_similar_city(city, city_dict) or None

    if matched_city is not None:
        row_mask = city_df["city_id"] == city_dict[matched_city]
        city_df.loc[row_mask, "citation_sum"] += citations
        city_df.loc[row_mask, "p_count"] += 1
        return city_df, city_dict

    # New city. Derive the id from the largest id already in use rather than
    # from len(city_dict): duplicate city names or gaps in the id sequence make
    # the dict smaller than the highest id, which would hand out a colliding id.
    candidate_ids = list(city_dict.values())
    if "city_id" in city_df.columns:
        candidate_ids.extend(city_df["city_id"].tolist())
    numeric_ids = pd.to_numeric(
        pd.Series(candidate_ids, dtype="object"), errors="coerce"
    ).dropna()
    city_id = int(numeric_ids.max()) + 1 if len(numeric_ids) else 1

    # Geocode before touching city_dict so a failed lookup cannot leave the
    # mapping and the frame out of sync.
    lat, lon = get_geolocation(city, country)
    city_dict[city] = city_id

    new_row = pd.DataFrame(
        [{
            "city_id": city_id,
            "city": city,
            "country": country,
            "citation_sum": citations,
            "p_count": 1,
            "lat": lat,
            "lon": lon,
        }],
        columns=city_df.columns  # Ensure alignment with city_df structure
    )
    city_df = pd.concat([city_df, new_row], ignore_index=True)
    return city_df, city_dict


In [6]:
# Sanity check: run the extractor on a simple "city country" string.
print(extract_city_country("Moscow Russia"))

('Moscow', 'Russian Federation')


In [12]:
#extract city
def extract_cityid(text, dict, threshold=90):
    """Return the id of the last place mentioned in ``text`` that is a known city.

    GPE entities found by spaCy are scanned from the end of the text backwards.
    An exact name match against ``dict`` wins over any fuzzy match; only when no
    entity matches exactly is each entity fuzzy-matched against the known names.

    Args:
        text: Free text, e.g. an affiliation address.
        dict: Mapping of city name -> city id. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        threshold: Minimum fuzzy score (0-100) for a non-exact match.

    Returns:
        The matching city id, or ``None`` when nothing matches or ``dict`` is
        empty.
    """
    if not dict:
        # Nothing to match against; extractOne would return None here and the
        # tuple unpacking used to crash with TypeError.
        return None

    # Run the NLP pipeline once and reuse the entity list for both passes.
    places = [ent.text for ent in reversed(nlp(text).ents) if ent.label_ == "GPE"]

    for place in places:
        if place in dict:
            return dict[place]

    # No exact match, perform fuzzy matching
    for place in places:
        best = process.extractOne(place, dict.keys())
        if best is not None and best[1] >= threshold:
            return dict[best[0]]

    return None

In [17]:
import itertools


In [31]:
# Scrape limits and progress counters read and updated by the scraping cell.
# max_pages: listing pages requested per year at most.
# max_papers: scraping stops once this many papers with more than one matched city were counted.
max_pages = 200
max_papers = 1000
# Progress counters; the scraping cell overwrites current_page and increments current_paper.
current_page = 0
current_paper = 0
# NOTE(review): current_directory is not used in the cells visible here — confirm it is still needed.
current_directory = os.getcwd()
# Publication years to scrape, both ends inclusive.
start_year = 2011
end_year = 2016

# City name -> city_id lookup used to resolve affiliation text to known cities.
map_df = pd.read_csv('../CSVs/city.csv')
city_map = dict(zip(map_df['city'], map_df['city_id']))

# Existing collaboration counts; the scraping cell reads and updates the
# city_id1, city_id2 and colab_count columns of this frame.
cocity_df = pd.read_csv('../CSVs/cocity_collab.csv')

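Scrape up to 1000 multi-city papers from Nature published from 2011 through 2016 (`start_year` to `end_year`, inclusive). Only papers whose affiliations resolve to more than one known city count towards the limit.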

In [32]:
# Walk Nature's article listing year by year and page by page, resolve every
# article's affiliation addresses to known city ids, and count one
# collaboration per unordered pair of cities on the same paper. The running
# result is written to disk after every listing page, so an interrupted run
# loses at most one page of work.
REQUEST_TIMEOUT = 30  # seconds; requests has no default timeout and would otherwise wait forever
OUTPUT_CSV = 'cocity_collab_added.csv'

try:
    for year in range(start_year, end_year + 1):
        for page in range(1, max_pages + 1):
            if current_paper >= max_papers:
                break

            current_page = str(page)
            url = f'https://www.nature.com/nature/articles?sort=PubDate&year={year}&page={current_page}'

            # Handle potential HTTP request errors
            try:
                r = requests.get(
                    url,
                    headers={'Accept-Language': 'en-US,en;q=0.5'},
                    timeout=REQUEST_TIMEOUT,
                )
                r.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch page {current_page} for year {year}: {e}")
                continue

            soup = BeautifulSoup(r.content, 'html.parser')

            for article in soup.find_all('article'):
                if current_paper >= max_papers:
                    break

                # Some <article> cards carry no article link; skip them instead
                # of failing on None['href'] and aborting the whole crawl.
                link_tag = article.find('a', {'data-track-action': 'view article'})
                rel_link = link_tag.get('href') if link_tag is not None else None
                if not rel_link:
                    continue
                abs_link = 'https://nature.com' + rel_link

                # Fetch individual article page
                try:
                    r2 = requests.get(abs_link, timeout=REQUEST_TIMEOUT)
                    r2.raise_for_status()
                except requests.exceptions.RequestException as e:
                    print(f"Failed to fetch article: {abs_link} - {e}")
                    continue

                soup_2 = BeautifulSoup(r2.content, 'html.parser')

                # Resolve each affiliation address to a known city id.
                # (Despite its name, this set holds city ids, not city/country pairs.)
                affi_info = soup_2.select('.c-article-author-affiliation__address')
                city_country_set = set()
                for affi in affi_info:
                    city_id = extract_cityid(affi.text, city_map)
                    if city_id is not None:  # an id of 0 is still a valid match
                        city_country_set.add(city_id)

                # Process collaborations if multiple cities are involved
                if len(city_country_set) > 1:
                    print(f"City-country set: {city_country_set}")
                    current_paper += 1

                    # Sorting first means every pair comes out as (smaller id, larger id),
                    # which keeps city_id1 < city_id2 for every stored connection.
                    for num1, num2 in itertools.combinations(sorted(city_country_set), 2):
                        # Check if the connection already exists
                        row_mask = (cocity_df['city_id1'] == num1) & (cocity_df['city_id2'] == num2)
                        if row_mask.any():
                            # Increment colab_count for existing connection
                            cocity_df.loc[row_mask, 'colab_count'] += 1
                        else:
                            # Add a new connection
                            new_row = pd.DataFrame({'city_id1': [num1], 'city_id2': [num2], 'colab_count': [1]})
                            cocity_df = pd.concat([cocity_df, new_row], ignore_index=True)

            # Save the data frame after every listing page
            cocity_df.to_csv(OUTPUT_CSV, index=False)
            print(f"Year {year}, Page {current_page} processed. Total papers scraped: {current_paper}")
        if current_paper >= max_papers:
            break

    # Save the final DataFrame to a CSV file
    cocity_df.to_csv(OUTPUT_CSV, index=False)
    print("Scraping completed and data saved to 'cocity_collab_added.csv'.")

except Exception as e:
    # Best effort: keep whatever was collected before the failure, then report
    # the error together with its type so it can be diagnosed.
    cocity_df.to_csv(OUTPUT_CSV, index=False)
    print(f"An error occurred: {type(e).__name__}: {e}")

Year 2011, Page 1 processed. Total papers scraped: 0
City-country set: {211, 31}
Year 2011, Page 2 processed. Total papers scraped: 1
Year 2011, Page 3 processed. Total papers scraped: 1
City-country set: {864, 834, 74, 587, 659, 917, 541}
City-country set: {44, 351}
City-country set: {1115, 1838}
City-country set: {98, 110, 2772, 70}
City-country set: {128, 2772}
City-country set: {43, 111}
City-country set: {89, 98, 2772}
Year 2011, Page 4 processed. Total papers scraped: 8
City-country set: {250, 2772, 111}
City-country set: {265, 250, 315, 2772}
City-country set: {1463, 834, 287}
City-country set: {111, 2654, 855}
Year 2011, Page 5 processed. Total papers scraped: 12
Year 2011, Page 6 processed. Total papers scraped: 12
Year 2011, Page 7 processed. Total papers scraped: 12
Year 2011, Page 8 processed. Total papers scraped: 12
City-country set: {41, 1130, 58, 2654}
City-country set: {176, 250, 1594}
City-country set: {2656, 1541, 134, 2441, 2540, 431, 178, 2772, 1463, 126}
City-coun