Simple cleaning for initial exploration.

1. Convert all letters to lowercase, remove punctuation.
2. Keep only ID fields, organisation name fields, and Postcode fields

In [7]:
import os, sys, json
import numpy as np
import pandas as pd
import re

In [8]:
def move_working_dir_to_repo_root(repo_name="orgsync"):
    current_dir = os.getcwd()
    #conver to lowercase
    while os.path.basename(current_dir).lower() != repo_name:
        current_dir = os.path.dirname(current_dir)
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

move_working_dir_to_repo_root(repo_name="orgsync")

Current working directory:  c:\Users\dec2g\GitHub\OrgSync


In [9]:
cordis_orgs_fp7 = "data/raw/all_scraped/cordis/2024_07/FP7/organization.json"

cordis_orgs_horizon = "data/raw/all_scraped/cordis/2024_07/Horizon 2020/organization.json"
cordis_orgs_horizon_eu = "data/raw/all_scraped/cordis/2024_07/Horizon Europe/organization.json"

gtr_orgs = "data/raw/all_scraped/gtr/scraped/2024_07/organisations.json"
gtr_orgs_2 = "data/raw/all_scraped/gtr/scraped/2024_07/organisations_2.json"

cordis_orgs = [cordis_orgs_fp7, cordis_orgs_horizon, cordis_orgs_horizon_eu]
gtr_orgs = [gtr_orgs] # , gtr_orgs_2] # gtr_orgs_2 is identical

Merge gtr data into one dataframe.

Merge cordis data into one dataframe.

In [10]:
org_dfs = []

for data_dir in cordis_orgs:
    with open(data_dir, 'r') as f:
        orgs = json.load(f)
        org_dfs.append(pd.DataFrame(orgs))

df_cordis_base = pd.concat(org_dfs)


In [11]:
org_dfs = []
for data_dir in gtr_orgs:
    with open(data_dir, 'r') as f:
        orgs = json.load(f)
        org_dfs.append(pd.DataFrame(orgs))

if len(org_dfs) > 1:
    df_gtr_base = pd.concat(org_dfs)
else: 
    df_gtr_base = org_dfs[0]

In [12]:
# get stats like size, cols, unique names, etc for df_gtr and df_cordis
print("GTR")
print(df_gtr_base.shape)
print(df_gtr_base.describe())
print(df_gtr_base.columns)
print(df_gtr_base['name'].nunique())

print("CORDIS")
print(df_cordis_base.shape)
print(df_cordis_base.describe())
print(df_cordis_base.columns)
print(df_cordis_base['name'].nunique())

GTR
(69067, 11)
            created
count  6.906700e+04
mean   1.704914e+12
std    1.333261e+09
min    1.704383e+12
25%    1.704710e+12
50%    1.704710e+12
75%    1.704710e+12
max    1.720391e+12
Index(['links', 'ext', 'id', 'outcomeid', 'href', 'created', 'updated', 'name',
       'regNumber', 'website', 'addresses'],
      dtype='object')
67463
CORDIS
(382461, 25)
               order     projectID
count  382461.000000  3.824610e+05
mean       11.240804  2.689151e+07
std        51.645603  4.441291e+07
min      -999.000000  1.000160e+05
25%         2.000000  3.183060e+05
50%         6.000000  7.543160e+05
75%        11.000000  1.010044e+08
max      1001.000000  1.901999e+08
Index(['SME', 'active', 'activityType', 'city', 'contactForm',
       'contentUpdateDate', 'country', 'ecContribution', 'endOfParticipation',
       'geolocation', 'name', 'netEcContribution', 'nutsCode', 'order',
       'organisationID', 'organizationURL', 'postCode', 'projectAcronym',
       'projectID', 'rcn', '

keep only relevant columns for analysis.

In [13]:
# for gtr, keep only id, name, addresses columns
df_gtr = df_gtr_base[['id', 'name', 'addresses']]

# for cordis, keep city, country, geolocation, name, nutscode, postcode, organizationURL, projectID, shortName, street columns
df_cordis = df_cordis_base[['city', 'country', 'geolocation', 'name', 'nutsCode', 'postCode', 'organizationURL', 'projectID', 'shortName', 'street']]

expand the nested data in the gtr dataset

In [14]:
df_gtr = df_gtr_base
def expand_addresses(df):
    # Function to extract values from the nested dictionary
    def extract_address_info(address_dict):
        if address_dict and 'address' in address_dict and address_dict['address']:
            return address_dict['address'][0]
        return {}

    # Apply the extraction function to the 'addresses' column
    expanded = df['addresses'].apply(extract_address_info).apply(pd.Series)
    
    # Prefix the new column names with 'address_' to avoid conflicts
    expanded = expanded.add_prefix('address_')
    
    # Concatenate the original dataframe with the expanded address data
    result = pd.concat([df.drop('addresses', axis=1), expanded], axis=1)
    
    return result

df_gtr_expand = expand_addresses(df_gtr)
df_gtr_expand.head()

Unnamed: 0,links,ext,id,outcomeid,href,created,updated,name,regNumber,website,...,address_line2,address_line3,address_line4,address_line5,address_city,address_county,address_postCode,address_region,address_country,address_type
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,...,,,,,,,NE1 8QH,North East,,MAIN_ADDRESS
1,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,53331120-0290-49FA-A513-0286A214AF7A,,http://gtr.ukri.org/gtr/api/organisations/5333...,1704709432000,,VALERANN UK LIMITED,,,...,,,,,,,MK14 6GD,South East,,MAIN_ADDRESS
2,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,77874202-2018-4677-8CFF-0868CD838659,,http://gtr.ukri.org/gtr/api/organisations/7787...,1704709432000,,Baltic Sea Cultural Centre in Gdansk,,,...,,,,,,,,,,
3,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,77908BF8-1B2D-4D26-9119-155100E8B9C5,,http://gtr.ukri.org/gtr/api/organisations/7790...,1704709432000,,Mindray,,,...,,,,,,,PE29 6FN,East of England,,MAIN_ADDRESS
4,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,7794C645-9CC7-4913-A8DC-103AE0EFDD4B,,http://gtr.ukri.org/gtr/api/organisations/7794...,1704709432000,,Democracy International,,,...,,,,,,,20814,Unknown,,MAIN_ADDRESS


In [15]:
df_gtr = df_gtr_expand[[
    "id", 
    "name", 
    "href", 
    "address_id", 
    "address_postCode", 
    "address_region", 
    # "address_country", # all nans
    # "address_type"
    ]]

In [19]:
df_gtr.head()

Unnamed: 0,id,name,href,address_id,address_postCode,address_region
0,5331B126-3AB4-4412-B56D-00E8F2796556,NEWCASTLE CITY COUNCIL,http://gtr.ukri.org/gtr/api/organisations/5331...,C20B6399-DBC6-4523-9C69-946A304A37D7,NE1 8QH,North East
1,53331120-0290-49FA-A513-0286A214AF7A,VALERANN UK LIMITED,http://gtr.ukri.org/gtr/api/organisations/5333...,AF7F7686-3DE3-475B-BCED-201E39037299,MK14 6GD,South East
2,77874202-2018-4677-8CFF-0868CD838659,Baltic Sea Cultural Centre in Gdansk,http://gtr.ukri.org/gtr/api/organisations/7787...,,,
3,77908BF8-1B2D-4D26-9119-155100E8B9C5,Mindray,http://gtr.ukri.org/gtr/api/organisations/7790...,51E6A87A-834A-4596-9054-C2D76E37A389,PE29 6FN,East of England
4,7794C645-9CC7-4913-A8DC-103AE0EFDD4B,Democracy International,http://gtr.ukri.org/gtr/api/organisations/7794...,61C06F28-0616-4787-8974-781CD5997724,20814,Unknown


remove non-uk countries, keeping orgs with no country information

In [17]:
def remove_non_uk_countries_cordis(df_cordis):
    # also keep empty country values
    uk_codes = ["UK", "GB", ""]
    # remove rows with df_cordis["country"] not in uk_codes
    df_cordis = df_cordis[df_cordis["country"].isin(uk_codes)]
    return df_cordis

df_cordis = remove_non_uk_countries_cordis(df_cordis)

In [18]:
# get value of all unique entries in df_cordis[country]
print(df_cordis['country'].unique())

['UK' '']


In [20]:
df_cordis.head()

Unnamed: 0,city,country,geolocation,name,nutsCode,postCode,organizationURL,projectID,shortName,street
6,MOOR ROW,UK,,NUCLEAR DECOMMISSIONING AUTHORITY - NDA,,CA24 3HU,http://www.nda.gov.uk,323260,NDA,Westlakes Science PArk - Herdus House
23,EXETER,UK,,SOUTH WEST TOURISM LIMITED,,EX2 5WT,http://www.swtourism.co.uk,219438,South West Tourism,Woodwater Park Pynes Hill
27,Cardiff,UK,"51.4816546,-3.1791934",WELSH GOVERNMENT,UKL22,CF10 3NQ,,219438,Welsh Government,CATHAYS PARK WELSH ASSEMBLY GOVERNMENT
37,OAKHAM,UK,,TERRASALUS LIMITED,,LE15 9EL,http://www.terrasalus.co.uk,226103,TerraSalus,CHURCH LANE 3A
46,WESTON HERTS,UK,,Ol PHARMA PARTNERS LTD,,SG4 7DP,,282558,Pharmivation,FAIRCLOUGH HALL RED SKY HOUSE


In [21]:
# create new df with columns "dataset": cordis or gtr, and column "Org" with "name" values from both
df_cordis['dataset'] = 'cordis'
df_gtr['dataset'] = 'gtr'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gtr['dataset'] = 'gtr'


In [24]:
# keep only select fields for initial analysis
df_cordis_save = df_cordis[["dataset", "projectID", "name", "shortName", "postCode", "city"]]
df_gtr_save = df_gtr[["dataset", "id", "name", "address_postCode"]]



KeyError: "['projectID', 'shortName', 'postCode'] not in index"

In [None]:
# rename columns to match
df_cordis = df_cordis.rename(columns={"projectID": "id", "shortName": "short_name", "postCode": "postcode"})
df_gtr = df_gtr.rename(columns={"address_postCode": "postcode"})

# Removed information
Cordis
* Street - could be used for postcode lookup or organisation address lookup.
* organizationURL - could be used for lookup on database. 
* Geolocation
* nutscode
GtR
* Address region - could be used for coarse postcode lookup. 
* href - contains the organisations unique identifier on the GtR database (still may have multiple entries for the same organisation)