In [15]:
import json
import os
from pprint import pprint
import csv
import pandas as pd
import re
from geopy.geocoders import Nominatim
from pprint import pprint

Helper Functions

In [2]:
def safe_get(data, path, default=None):
    for key in path:
        if isinstance(data, dict):
            data = data.get(key, default)
        elif isinstance(data, list) and isinstance(key, int) and 0 <= key < len(data):
            data = data[key]
        else:
            return default
    return data

Extracting Authors' info

In [None]:
#Extract author info
data_dir = 'Project_data'
output_file = 'authors.csv'

# Use a set to track unique author IDs
unique_authors = set()

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header once
    writer.writerow(['at_id', 'name', 'degree'])

    # Walk through all subdirectories and files
    for root, _, files in os.walk(data_dir):
        for filename in files:
            if filename.startswith('20'):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r') as file:
                    print(file_path)
                    try:
                        data = json.load(file)
                        # Access authors
                        authors = data.get('abstracts-retrieval-response', {}).get('authors', {}).get('author', [])
                        for author in authors:
                            at_id = author.get("@auid")
                            if at_id and at_id not in unique_authors:  # Avoid duplicates
                                name = f"{author.get('ce:given-name', '')} {author.get('ce:surname', '')}".strip()
                                degree = author.get('ce:degrees', 'NA')
                                writer.writerow([at_id, name, degree])
                                unique_authors.add(at_id)  # Add the ID to the set
                    except (json.JSONDecodeError, KeyError) as e:
                        print(f"Error processing file {file_path}: {e}")


Extracting Affiliations Info

In [None]:
#Extract affiliation info
data_dir = 'Sample_data'
output_file = 'affiliations.csv'
unique_affiliations = set()

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['af_id', 'name', 'organization', 'country', 'city'])
    # Walk through all subdirectories and files
    for root, _, files in os.walk(data_dir):
        for filename in files:
            if filename.startswith('20'):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r') as file:
                    
                    try:
                        data = json.load(file)
                        # Access authors
                        auth_groups = data.get('abstracts-retrieval-response', {}).get('item', {}).get('bibrecord', {}).get('head', {}).get('author-group', [])

                        if isinstance(auth_groups, list):
                            for auth_group in auth_groups:
                                affiliation = auth_group.get('affiliation', {})   
                                af_id = affiliation.get("@afid")
                                if af_id and af_id not in unique_affiliations:  # Avoid duplicates
                                    org_list = affiliation.get('organization', 'NA')
                                    if isinstance(org_list, list):
                                        organization = ', '.join([org['$'] for org in org_list])
                                        name = org_list[-1]['$']
                                    elif isinstance(org_list, dict):
                                        organization = org_list.get('$', 'NA')
                                        name = organization
                                    else:
                                        organization = affiliation.get('ce:text', 'NA')
                                        name = organization
                                    country = affiliation.get('country', 'NA')
                                    city = affiliation.get('city', 'NA')
                                    writer.writerow([af_id, name, organization, country, city])
                                    unique_affiliations.add(af_id)  # Add the ID to the set
                        else:
                            affiliation = auth_groups.get('affiliation', {}) 
                            af_id = affiliation.get("@afid")
                            if af_id and af_id not in unique_affiliations:  # Avoid duplicates
                                org_list = affiliation.get('organization', 'NA')
                                if isinstance(org_list, list):
                                    organization = ', '.join([org['$'] for org in org_list])
                                    name = org_list[-1]['$']
                                elif isinstance(org_list, dict):
                                    organization = org_list.get('$', 'NA')
                                    name = organization
                                else:
                                    organization = affiliation.get('ce:text', 'NA')
                                    name = organization
                            country = affiliation.get('country', 'NA')
                            city = affiliation.get('city', 'NA')
                            writer.writerow([af_id, name, organization, country, city])
                            unique_affiliations.add(af_id)  # Add the ID to the set
                    except (json.JSONDecodeError, KeyError) as e:
                        print(f"Error processing file {file_path}: {e}")

Author to Affiliations

In [6]:
data_dir = 'Project_data'
output_file = 'author_to_affi.csv'
columns = ['pid', 'at_id', 'af_id']
with open(output_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    # Walk through all subdirectories and files
    for root, _, files in os.walk(data_dir):
        for filename in files:
            if filename.startswith('20'):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r') as file:
                    
                    try:
                        data = json.load(file)
                        auth_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], None)
                        if isinstance(auth_groups, list):
                            for auth_group in auth_groups:
                                affiliation = auth_group.get('affiliation', {})   
                                af_id = affiliation.get("@afid")
                                authors = auth_group.get('author', [])
                                for author in authors:
                                    at_id = author.get('@auid')
                                    writer.writerow({'pid': filename, 'at_id': at_id, 'af_id': af_id})
                        else:
                            affiliation = auth_groups.get('affiliation', {})
                            af_id = affiliation.get("@afid")
                            authors = auth_groups.get('author', [])
                            for author in authors:
                                at_id = author.get('@auid')
                                writer.writerow({'pid': filename, 'at_id': at_id, 'af_id': af_id})
                    except Exception as e:
                            print(f"Error processing file {file_path}: {e}")

Extract subject area data

In [26]:
data_dir = 'Project_data'
output_file = 'subject_areas.csv'

columns = [
    'subject_area_id', 'subject_area_name',
]

unique_subject = set()


with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    # Walk through all subdirectories and files
    for root, _, files in os.walk(data_dir):
        for filename in files:
            if filename.startswith('20'):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    try:
                        data = json.load(file)
                        subject_areas_list = safe_get(data, ['abstracts-retrieval-response', 'subject-areas', 'subject-area'], None)
                        for subject in subject_areas_list:
                            subject_id = subject.get('@code', None)
                            subject_name = subject.get('$', None)
                            if subject_id not in unique_subject:
                                writer.writerow({
                                    'subject_area_id': subject_id,
                                    'subject_area_name': subject_name
                                })
                                unique_subject.add(subject_id)
                    except Exception as e:
                            print(f"Error processing file {file_path}: {e}")



Extracting Paper Data

In [4]:
def get_classification_codes(classification_list):
    """
    Processes the classification list and extracts the relevant classification code.
    
    :param classification_list: List of classifications, can be of different types.
    :return: List of classification codes.
    """
    classification_codes = []
    
    for classification in classification_list:
        if isinstance(classification, dict):
            classification_type = classification.get('@type')
            classification_data = classification.get('classification')
            
            if classification_type == 'SUBJABBR' or classification_type == 'ASJC':
                # For SUBJABBR and AJSC, classification might be a single string or a list
                if isinstance(classification_data, list):
                    classification_codes.extend([item.get('$') for item in classification_data if isinstance(item, dict)])
                else:
                    classification_codes.append(classification_data)  # Single string classification
            
            elif classification_type in ['CPXCLASS', 'FLXCLASS']:
                if isinstance(classification_data, list):
                    classification_codes.extend([item.get('classification-code') for item in classification_data if isinstance(item, dict)])
                elif isinstance(classification_data, dict):
                    classification_codes.append(classification_data.get('classification-code'))
    
    return classification_codes

In [3]:
def check_slice_or_single(data):
    #Ensures that the returned data is always a list, even if it's a single item.
    if data is None:
        return None
    if isinstance(data, list):
        return data
    return [data]

In [4]:
def handleidx(idxterms_data):
    if idxterms_data:
        if isinstance(idxterms_data, dict):  # If idxterms is a dictionary
            mainterm = idxterms_data.get('mainterm')
            if isinstance(mainterm, list):
                idxterms = [i.get('$', None) for i in mainterm]  # Extract the '$' value from each item in the list
            elif isinstance(mainterm, dict):
                idxterms = [mainterm.get('$', None)]  # If 'mainterm' is a single dict, extract the '$'
            else:
                idxterms = None
        elif isinstance(idxterms_data, list):  # If idxterms is a list
            idxterms = [i.get('$', None) for i in idxterms_data if isinstance(i, dict)]  # Loop through and extract '$'
        else:
            idxterms = None
    else:
        idxterms = None
    return idxterms

In [44]:
data_dir = './Sample_data'
output_file = 'papers.csv'

columns = [
    'pid','title', 'pub_date', 'abstract', 'language', 'ref_count',
    'citedby_count', 'corresponding_author', 'author_id', 'subject_areas_id', 'keywords',
    'idxterms', 'classification_code'
]
cnt = 0
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    # Walk through all subdirectories and files
    for root, _, files in os.walk(data_dir):
        for filename in files:
            if cnt == 40: break
            cnt+=1
            if filename.startswith('20'):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    try:
                        data = json.load(file)
                        title = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'citation-title'], None)
                        pub_year = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'source', 'publicationdate', 'year'], None)
                        pub_month = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'source', 'publicationdate', 'month'], None)
                        pub_day = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'source', 'publicationdate', 'day'], None)
                        pub_date = f"{pub_day}/{pub_month}/{pub_year}" if pub_year and pub_month and pub_day else None
                        abstract = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'abstracts'], None)
                        language = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'citation-info', 'citation-language', '@language'], None)
                        ref_count = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'tail', 'bibliography', '@refcount'], None)
                        citedby_count = safe_get(data, ['abstracts-retrieval-response', 'coredata', 'citedby-count'], None)
                        
                        
                        authors_list = safe_get(data, ['abstracts-retrieval-response', 'authors', 'author'], None)
                        author_dict= {f"{author.get('ce:given-name', '')} {author.get('ce:surname', '')}".strip() : author.get('@auid', None) for author in authors_list if isinstance(author, dict)} if authors_list else None
                        author_id = list(author_dict.values())
                        cor_author = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'correspondence', 'person'], None)
                        ca_name = f"{safe_get(cor_author, ['ce:given-name'], '')} {safe_get(cor_author, ['ce:surname'], '')}".strip()
                        
                        coressponding_auid = author_dict.get(ca_name, None)

                        subject_areas_list = safe_get(data, ['abstracts-retrieval-response', 'subject-areas', 'subject-area'], None)
                        subject_areas_id = [subject_area.get('@code', None) for subject_area in subject_areas_list if isinstance(subject_area, dict)] if subject_areas_list else None
                        
                        authkeywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords'], None)
                        keywords = None
                        if authkeywords and 'author-keyword' in authkeywords:
                            authkeyword = authkeywords.get('author-keyword', None)
                            if(isinstance(authkeyword, dict)):
                                keywords = re.sub(r'(?<=[\s])([A-Z])', r' \1', authkeyword.get('$', None)).split('  ')
                            else:
                                keywords = [k['$'] for k in authkeyword if isinstance(k, dict)]                            
                        keywords = check_slice_or_single(keywords)
                        
                        idxterms_data = safe_get(data, ['abstracts-retrieval-response', 'idxterms'], None)
                        idxterms = handleidx(idxterms_data)
                        
                        classification_list = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'enhancement', 'classificationgroup', 'classifications'], None)
                        classification_code = get_classification_codes(classification_list)
                        classification_code = check_slice_or_single(classification_code)
                        
                        writer.writerow({
                            'pid': filename,
                            'corresponding_author': coressponding_auid,
                            'title': title,
                            'pub_date': pub_date,
                            'abstract': abstract,
                            'language': language,
                            'ref_count': ref_count,
                            'citedby_count': citedby_count,
                            'author_id': author_id,
                            'subject_areas_id': subject_areas_id,
                            'keywords': keywords,
                            'idxterms': idxterms,
                            'classification_code': classification_code
                        })
                    except Exception as e:
                            print(f"Error processing file {file_path}: {e}")



Extracting city and city network data

In [54]:
# Constants
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
CITY_FILE_NAME = 'city'
COCITY_FILE_NAME = 'cocity'
DATA_DIR = "./Project_data"
GEOLOCATOR = Nominatim(user_agent="isdjuiodfgdfjnjf847hn5")

In [35]:
x = GEOLOCATOR.geocode("Batticolo Sri Lanka").raw
if x:
    print(x)

AttributeError: 'NoneType' object has no attribute 'raw'

In [55]:
# Utility functions
def find_similar_city(city, city_dict, threshold=90):
    """Find a similar city in city_dict using fuzzy matching."""
    if not city_dict:
        return None  # No cities to compare
    closest_match, similarity = process.extractOne(city, city_dict.keys(), scorer=fuzz.token_sort_ratio)
    return closest_match if similarity >= threshold else None

def initialize_city_file(city_file):
    """Initialize city CSV if it does not exist."""
    if not os.path.exists(city_file):
        city_df = pd.DataFrame(
            columns=["city_id", "city", "country", "citation_sum", "p_count", "lat", "lon"]
        )
        return city_df, {}
    city_df = pd.read_csv(city_file)
    city_dict = {row["city"]: row["city_id"] for _, row in city_df.iterrows()}
    return city_df, city_dict

def initialize_cocity_file(cocity_file):
    """Initialize cocity CSV if it does not exist."""
    if not os.path.exists(cocity_file):
        with open(cocity_file, "w", newline="", encoding="utf-8") as cocity_csvfile:
            writer = csv.writer(cocity_csvfile)
            writer.writerow(["city_id1", "city_id2", "filename"])

def get_geolocation(city, country):
    """Fetch latitude and longitude for a city."""
    geo = GEOLOCATOR.geocode(f"{city}, {country}")
    if geo:
        return geo.latitude, geo.longitude
    return None, None

def write_cocity_links(cocity_file, city_country_set, city_dict, filename):
    """Write co-city links to the cocity file."""
    with open(cocity_file, "a", newline="", encoding="utf-8") as cocity_csvfile:
        writer = csv.writer(cocity_csvfile)
        for city1, country1 in city_country_set:
            for city2, country2 in city_country_set:
                if city1 != city2:  # Avoid linking same city
                    city_id1 = city_dict[city1]
                    city_id2 = city_dict[city2]
                    writer.writerow([city_id1, city_id2, filename])

def update_city_data(city_df, city_dict, city, country, citedby_count):
    """Update city data and return the modified city_df and city_dict."""
    if city in city_dict:
        city_id = city_dict[city]
        city_df.loc[city_df["city_id"] == city_id, "citation_sum"] += int(citedby_count)
        city_df.loc[city_df["city_id"] == city_id, "p_count"] += 1
    else:
        # similar_city = find_similar_city(city, city_dict)
        # if similar_city:
        #     city_id = city_dict[similar_city]
        #     city_df.loc[city_df["city_id"] == city_id, "citation_sum"] += int(citedby_count)
        #     city_df.loc[city_df["city_id"] == city_id, "p_count"] += 1
        # else:
            # No similar city found, add as a new entry
        city_id = len(city_dict) + 1
        city_dict[city] = city_id
        lat, lon = get_geolocation(city, country)
        # Ensure new_row has the correct structure
        new_row = pd.DataFrame(
            [{
                "city_id": city_id,
                "city": city,
                "country": country,
                "citation_sum": int(citedby_count),
                "p_count": 1,
                "lat": lat,
                "lon": lon,
            }],
            columns=city_df.columns  # Ensure alignment with city_df structure
        )
        city_df = pd.concat([city_df, new_row], ignore_index=True)
    
    return city_df, city_dict


In [None]:
# Extraction non-verbose #took 50 minutes to compute lol
for root, _, files in os.walk(DATA_DIR):
    for filename in files:
        if filename.startswith("20"):
            file_path = os.path.join(root, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                try:
                    data = json.load(file)
                    # city and cocity
                    city_file = f"../CSVs/{CITY_FILE_NAME}.csv"
                    #! Create new city.csv if it doesn't exist
                    city_df, city_dict = initialize_city_file(city_file)

                    citedby_count = safe_get(
                        data,
                        ["abstracts-retrieval-response", "coredata", "citedby-count"],
                        None,
                    )
                    citedby_count = int(citedby_count) if citedby_count else 0
                    city_country_set = set()

                    affiliations = safe_get(
                        data, ["abstracts-retrieval-response", "affiliation"], None
                    )
                    # always return list or None
                    affiliation_list = check_slice_or_single(affiliations)

                    for affiliation in affiliation_list:
                        city = affiliation.get("affiliation-city", None)
                        country = affiliation.get("affiliation-country", None)
                        # print(city, country)
                        if city and country:
                            city_country_set.add((city, country))

                    # Handle I. Single City, II. Multiple Cities
                    if len(city_country_set) == 1:
                        #! Case I: Single City
                        city, country = list(city_country_set)[0]
                        city_df, city_dict = update_city_data(city_df, city_dict, city, country, citedby_count)
                    else:
                        #! Case II: Multiple Cities
                        for city, country in city_country_set:
                            city_df, city_dict = update_city_data(city_df, city_dict, city, country, citedby_count)
                        # create cocity.csv and link the cities
                        cocity_file = f"../CSVs/{COCITY_FILE_NAME}.csv"
                        initialize_cocity_file(cocity_file)
                        write_cocity_links(cocity_file, city_country_set, city_dict, filename)

                    city_df.to_csv(city_file, index=False)

                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")

In [58]:
def geocode_country(row):
    if pd.isnull(row['lat']) and pd.isnull(row['lon']):
        geo = GEOLOCATOR.geocode(f"{row['country']}")
        if geo:
            return pd.Series({'lat': geo.latitude, 'lon': geo.longitude})
    return pd.Series({'lat': row['lat'], 'lon': row['lon']})



In [59]:
df = pd.read_csv('../CSVs/city.csv')

df[['lat', 'lon']] = df.apply(geocode_country, axis=1)
print(df[df.isnull().any(axis=1)])
#impute missing coordinate value base on country coordinates

Empty DataFrame
Columns: [city_id, city, country, citation_sum, p_count, lat, lon]
Index: []


In [61]:
print(df.to_string())
df.to_csv('city_imputed.csv', index=False)


      city_id                                city                         country  citation_sum  p_count        lat         lon
0           1                             Bangkok                        Thailand        186763    20196  13.752494  100.493509
1           2                       Nakhon Pathom                        Thailand         12503     1411  13.891842  100.016566
2           3                               Suita                           Japan          2079       99  34.759419  135.516835
3           4                           Rochester                   United States         21240      687  43.157285  -77.615214
4           5                           Riverside                   United States         17453      555  33.982495 -117.374238
5           6                           San Diego                   United States         20689      609  32.717420 -117.162772
6           7                     University Park                   United States          1917       75

Phuree Code

In [14]:
#--- Phuree---#

geolocator = Nominatim(user_agent="isdjuiodfgdfjnjf847hn5")

data_dir = './Sample_data'
output_file = '../CSVs/papers-mock.csv'

cnt = 0
for root, _, files in os.walk(data_dir):
    for filename in files:
        if cnt == 50: break
        cnt+=1
        if filename.startswith('20'):
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                try:
                    data = json.load(file)
                    
                    # Extract organization (institution) information
                    org_list = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'correspondence','affiliation','organization'], None)
                    organization = None
                    if isinstance(org_list, list):
                        organization = org_list[-1]['$'] if org_list else None
                    elif isinstance(org_list, dict):
                        organization = org_list.get('$', None)

                    # city and cocity
                    city_file = '../CSVs/city1.csv'
                    #! Create new city.csv if it doesn't exist
                    if not os.path.exists(city_file):
                        city_df = pd.DataFrame(columns=['city_id', 'city', 'country', 'citation_sum', 'p_count','lat','lon'])
                        city_dict = {}
                        # Add a mock row
                        mock_row = {
                            'city_id': 1,
                            'city': 'Mock City eiei',
                            'country': 'Mock Country eiei',
                            'citation_sum': 0,
                            'p_count': 0,
                            'lat': 0,
                            'lon': 0
                        }
                        city_df.loc[-1] = mock_row
                    else:
                        city_df = pd.read_csv(city_file)

                    city_dict = {}
                    if city_df.shape[0] != 0:
                        city_dict = {row['city']: row['city_id'] for _, row in city_df.iterrows()}

                    citedby_count = safe_get(data, ['abstracts-retrieval-response','coredata','citedby-count'], None)
                    citedby_count = int(citedby_count) if citedby_count else 0
                    # print("count",citedby_count)
                    # set (city, country)
                    city_country_set = set()
                    
                    affiliations = safe_get(data, ['abstracts-retrieval-response', 'affiliation'], None)
                    # always return list or None
                    affiliation_list = check_slice_or_single(affiliations) 
            
                    for affiliation in affiliation_list:
                        city = affiliation.get('affiliation-city', None)
                        country = affiliation.get('affiliation-country', None)
                        # print(city, country)
                        if city and country:
                            city_country_set.add((city, country))

                    # Handle I. Single City, II. Multiple Cities
                    if len(city_country_set) == 1:
                        #! Case I: Single City
                        city, country = list(city_country_set)[0]
                        
                        if city in city_dict:
                            city_id = city_dict[city] 
                            city_df.loc[city_df['city_id'] == city_id, 'citation_sum'] += int(citedby_count)
                            city_df.loc[city_df['city_id'] == city_id, 'p_count'] += 1
                        else:
                            city_id = len(city_dict) + 1
                            city_dict[city] = city_id
                            #geolocation 
                            # geo = geolocator.geocode(f"{city}, {country}")
                            lat, lon = None, None
                            # if geo:
                            #     lat, lon = geo.latitude, geo.longitude
                                
                            new_row = pd.DataFrame([{
                                'city_id': city_id,
                                'city': city,
                                'country': country,
                                'citation_sum': int(citedby_count),
                                'p_count': 1,
                                'lat': lat,
                                'lon': lon
                            }])
                            city_df = pd.concat([city_df, new_row], ignore_index=True)
                    else:
                        #! Case II: Multiple Cities
                        for city, country in city_country_set:
                            if city in city_dict:
                                city_id = city_dict[city]
                                city_df.loc[city_df['city_id'] == city_id, 'citation_sum'] += int(citedby_count)
                                city_df.loc[city_df['city_id'] == city_id, 'p_count'] += 1
                            else:
                                city_id = len(city_dict) + 1
                                city_dict[city] = city_id
                                #geolocation 
                                # geo = geolocator.geocode(f"{city}, {country}")
                                lat, lon = None, None
                                # if geo:
                                #     lat, lon = geo.latitude, geo.longitude
                                new_row = pd.DataFrame([{
                                    'city_id': city_id,
                                    'city': city,
                                    'country': country,
                                    'citation_sum': int(citedby_count),
                                    'p_count': 1,
                                    'lat': lat,
                                    'lon': lon
                                }])
                                city_df = pd.concat([city_df, new_row], ignore_index=True)

                        # create cocity.csv and link the cities
                        cocity_file = '../CSVs/cocity.csv'
                        with open(cocity_file, 'a', newline='', encoding='utf-8') as cocity_csvfile:
                            cocity_writer = csv.writer(cocity_csvfile)
                            # Write header if file is empty
                            if os.stat(cocity_file).st_size == 0:
                                cocity_writer.writerow(['city_id1', 'city_id2', 'filename'])
                            for city1, country1 in city_country_set:
                                for city2, country2 in city_country_set:
                                    if city1 != city2: # Avoid linking same city
                                        city_id1 = city_dict[city1]
                                        city_id2 = city_dict[city2]
                                        cocity_writer.writerow([city_id1, city_id2, filename])


                    city_df.to_csv(city_file, index=False)
                     
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")


  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([city_df, new_row], ignore_index=True)
  city_df = pd.concat([c