# affiliations

INDEX

In [1]:
# Libraries
import pandas as pd

# Abstract Syntax Trees
import ast

# Functions
import sys
sys.path.append('../src')
from support_clean import *

# Coordinates
from geopy.geocoders import GoogleV3

### Import affiliations_df

In [2]:
# Load the per-publication affiliation table and preview its first two rows
affiliations = pd.read_csv('../data/affiliations_df.csv')
affiliations.head(2)

Unnamed: 0.1,Unnamed: 0,pub_id,authors,auth_aff_list,first_auth_aff,last_auth_aff
0,0,38012702,"['Wu, Anbiao', 'Zhang, Jiyan']","['Beijing Institute of Basic Medical Sciences,...","Beijing Institute of Basic Medical Sciences, B...","Beijing Institute of Basic Medical Sciences, B..."
1,1,38012669,"['Zheng, Shengnan', 'Li, Yiquan', 'Song, Xiaom...","['Department of Pharmacology, School of Basic ...","Department of Pharmacology, School of Basic Me...","Department of Rehabilitation Medicine, The Sec..."


## Data Cleansing

In [3]:
# Work on a copy so the DataFrame loaded from disk stays untouched
df = affiliations.copy()

In [4]:
# Discard the 'Unnamed: 0' column (presumably an index saved by an earlier to_csv);
# the shape is shown before and after to confirm that one column was removed.
to_drop = ['Unnamed: 0']
display(df.shape)
df = df.drop(to_drop, axis='columns')
display(df.shape)

(61981, 6)

(61981, 5)

In [5]:
# Normalise the publication identifier column name to 'pub_id'.
# NOTE(review): the preview of the loaded file already shows a 'pub_id' column,
# so this looks like a no-op unless the CSV carries an 'ID' column; confirm.
df = df.rename({'ID': 'pub_id'}, axis='columns')
df.head(2)

Unnamed: 0,pub_id,authors,auth_aff_list,first_auth_aff,last_auth_aff
0,38012702,"['Wu, Anbiao', 'Zhang, Jiyan']","['Beijing Institute of Basic Medical Sciences,...","Beijing Institute of Basic Medical Sciences, B...","Beijing Institute of Basic Medical Sciences, B..."
1,38012669,"['Zheng, Shengnan', 'Li, Yiquan', 'Song, Xiaom...","['Department of Pharmacology, School of Basic ...","Department of Pharmacology, School of Basic Me...","Department of Rehabilitation Medicine, The Sec..."


### Create a list of affiliations

In [6]:
# Replace missing affiliation lists with the string '[]' so that every cell
# can later be parsed as a Python list literal
df = df.fillna({'auth_aff_list': '[]'})

In [7]:
# Parse each stringified list back into a real Python list
# (ast.literal_eval only evaluates literals, it does not execute code)
df['auth_aff_list'] = df['auth_aff_list'].map(ast.literal_eval)

In [8]:
# One row per (publication, affiliation) pair; the original index value is
# repeated for every affiliation belonging to the same publication
df_aff = df.explode(column='auth_aff_list')

In [11]:
# Inspect the exploded affiliation column (the index repeats once per affiliation of a publication)
df_aff['auth_aff_list']

0        Beijing Institute of Basic Medical Sciences, B...
0        Beijing Institute of Basic Medical Sciences, B...
1        Department of Pharmacology, School of Basic Me...
1        Department of Pharmacology, School of Basic Me...
1        Department of Pharmacology, School of Basic Me...
                               ...                        
61980    Department of Radiology, Liyuan Hospital, Tong...
61980    Department of Radiology, Liyuan Hospital, Tong...
61980    State Key Laboratory of Brain and Cognitive Sc...
61980    Sino-Danish College, University of Chinese Aca...
61980    Center for Excellence in Brain and Science and...
Name: auth_aff_list, Length: 374373, dtype: object

In [13]:
# Create a list of unique affiliations
display(df_aff.shape)
# Series.unique() keeps first-appearance order, so the list (and any id derived
# from its position) is reproducible between runs; list(set(...)) is not, because
# string hashing is randomised per interpreter session.
# dropna() removes the NaN placeholders that explode() emits for empty lists,
# which are not affiliations.
aff_unique = df_aff['auth_aff_list'].dropna().unique().tolist()
len(aff_unique)

(374373, 5)

133868

In [15]:
# One row per unique affiliation string
aff_df = pd.DataFrame({'affiliation_names': aff_unique})

In [17]:
# Use the row index as the affiliation identifier (column aff_id)
aff_df = aff_df.assign(aff_id=aff_df.index)

In [18]:
from geopy.geocoders import Nominatim

In [19]:
# Nominatim geocoder instance; get_country reads this global at call time.
# NOTE(review): "my_geocoder" looks like a placeholder user agent; check the Nominatim usage
# policy, which presumably expects an application-specific value.
geolocator = Nominatim(user_agent="my_geocoder")

In [29]:
# Locations whose geocoding request raised an error, kept for inspection or retry
location_error = []


def get_country(location):
    '''
    Return the country of a free-text location using the global `geolocator`.

    The country is taken as the last comma-separated component of the address
    returned by the geocoder (requested in English).

    Parameters:
    - location (str): Free-text location, e.g. an affiliation string.

    Returns:
    - str or None: The country name, or None when the input is missing/blank,
      the geocoder finds no match, or the lookup raises an error. Only lookups
      that raise are recorded in `location_error`.
    '''
    # Missing values (NaN from exploded empty lists) and blank strings are not
    # locations; skip them instead of sending them to the geocoder as a query.
    if not isinstance(location, str) or not location.strip():
        return None
    try:
        location_info = geolocator.geocode(location, language='en')
        if not location_info:
            return None
        return location_info.address.split(",")[-1].strip()
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit can still stop a long-running apply over many rows.
        location_error.append(location)
        return None

In [None]:
# Resolve a country for every affiliation: get_country is called once per row, sequentially.
# NOTE(review): no rate limiting is applied here; geopy provides
# geopy.extra.rate_limiter.RateLimiter for this. Check the geocoding service's usage policy.
aff_df['country'] = aff_df['affiliation_names'].apply(get_country)

In [None]:
# Inspect the resolved countries
aff_df['country']

In [None]:
# Inspect the locations whose geocoding lookup raised an exception
location_error

In [None]:
# Create a list with all the affiliations: flatten the per-publication lists
# into a single list, keeping duplicates and the original order
affiliations_list = [
    aff_name
    for aff_names in df['auth_aff_list']
    for aff_name in aff_names
]

In [None]:
# Create a list with the unique affiliations.
# The two displayed lengths are the counts before and after de-duplication;
# the order of the resulting list is arbitrary because it comes from a set.
display(len(affiliations_list))
affiliations_list = list(set(affiliations_list))
display(len(affiliations_list))

In [None]:
# Rebuild aff_df from the de-duplicated list (single column named 0).
# NOTE: this rebinds aff_df, discarding the frame built earlier in the notebook
# together with its 'aff_id' and 'country' columns.
aff_df = pd.DataFrame(affiliations_list)

In [None]:
# Give the single default-named column (0) a meaningful name and preview the frame
aff_df = aff_df.rename({0: 'aff_name'}, axis='columns')
aff_df.head()

### Merge columns to have the pub_id

In [None]:
# Attach publication data to each affiliation that appears as a LAST-author
# affiliation. A left join keeps unmatched affiliations (with NaN publication
# fields); the shapes before/after show how many rows the join added.
display(aff_df.shape)
aff_df = aff_df.merge(
    df,
    how='left',
    left_on='aff_name',
    right_on='last_auth_aff',
)
display(aff_df.shape)

In [None]:
# Same join again, this time matching FIRST-author affiliations. Column names
# shared by both frames receive pandas' default '_x' (left) / '_y' (right)
# suffixes, which the following cells rely on.
display(aff_df.shape)
aff_df = aff_df.merge(
    df,
    how='left',
    left_on='aff_name',
    right_on='first_auth_aff',
)
display(aff_df.shape)

In [None]:
# Preview the merged frame
aff_df.head(2)

### Fill nan and reduce the dimensions

In [None]:
# '_x' columns (the left-hand side of the previous merge); each one has a '_y'
# twin that came from the right-hand side of that merge.
cols_to_fill = ['pub_id_x',
                'authors_x',
                'auth_aff_list_x',
                'first_auth_aff_x',
                'last_auth_aff_x']
cols_y = [col.replace('_x', '_y') for col in cols_to_fill]

for col, col_y in zip(cols_to_fill, cols_y):
    # Assign the result back instead of calling fillna(inplace=True) on
    # aff_df[col]: that chained in-place call acts on an intermediate object,
    # raises a FutureWarning on pandas 2.x and leaves aff_df unchanged under
    # Copy-on-Write.
    aff_df[col] = aff_df[col].fillna(aff_df[col_y])

# The '_y' twins are redundant once their values have been folded into '_x'
aff_df = aff_df.drop(columns=cols_y)

In [None]:
# Strip the '_x' merge suffix from every filled column in a single rename
aff_df.rename(
    columns={suffixed: suffixed.replace('_x', '') for suffixed in cols_to_fill},
    inplace=True,
)

In [None]:
# Reduce the table to the columns still needed; the shape is shown before
# and after to confirm that three columns were removed
to_drop = ['authors', 'auth_aff_list', 'first_auth_aff']
display(aff_df.shape)
aff_df = aff_df.drop(to_drop, axis='columns')
display(aff_df.shape)

In [None]:
# view_nan is not defined in this notebook; presumably it comes from the support_clean
# star import and reports missing values. Confirm in ../src/support_clean.py.
view_nan(aff_df)

## Determine the geographical locations of the affiliations

In [None]:
# Read the Google API key from a local text file (surrounding whitespace stripped)
with open('../google_api_key.txt', 'r') as file:
    google_api_key = file.read().strip()

# NOTE: this rebinds the global `geolocator` (a Nominatim instance earlier in the notebook),
# so every function that looks up `geolocator` at call time uses GoogleV3 from here on.
geolocator = GoogleV3(api_key=google_api_key)

In [None]:
def obtain_long_lat(loc):
    '''
    Obtain the longitude and latitude coordinates for a given location.

    The lookup is delegated to the global `geolocator`.

    Parameters:
    - loc (str): The name of the location for which coordinates are desired.

    Returns:
    - Tuple[float, float]: A tuple containing the obtained longitude and latitude
      coordinates, in that order (longitude first).
      If the input is missing/blank, the geocoder finds no match, or the lookup
      raises an error, (None, None) is returned.
    '''
    # Missing values (e.g. NaN) and blank strings are not locations; skip them
    # without spending a geocoding request.
    if not isinstance(loc, str) or not loc.strip():
        return None, None
    try:
        location = geolocator.geocode(loc)
        # geocode() returns None when nothing matches; handle it explicitly
        # instead of relying on an AttributeError being swallowed.
        if location is None:
            return None, None
        return location.longitude, location.latitude
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit can still stop a long-running apply over many rows.
        return None, None

In [None]:
# SLOW CELL
# Disabled: the statement below is wrapped in a string literal, so it is not executed.
# When enabled it calls obtain_long_lat once per row of aff_df['aff_name'] and unpacks
# the returned (longitude, latitude) pairs into two new columns.
'''
aff_df['longitude'], aff_df['latitude'] = zip(*aff_df['aff_name'].apply(obtain_long_lat))
''';

In [None]:
# Persist the affiliation table. With pandas' default index=True the row index is written
# as an unnamed first column, which appears as 'Unnamed: 0' when the file is read back.
# While the geocoding cell above stays disabled, no longitude/latitude columns are saved.
aff_df.to_csv('../data/aff_db.csv')

In [None]:
# Display the final affiliation table
aff_df

In [None]:
# Fill the null values in 'pub_id' based on the match between 'aff_name' and 'auth_aff_list'
# aff_df['pub_id'] = aff_df.apply(lambda row: df.loc[df['auth_aff_list'].apply(lambda x: row['aff_name'] in x), 'other_data'].iloc[0] if pd.isnull(row['pub_id']) else row['pub_id'], axis=1)