In [1]:
import os, sys, json
import numpy as np
import pandas as pd
import re
import ast
from pprint import pprint

# Move the current working directory to the root of the repository for consistency in file paths
def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.
    """
    current_dir = os.getcwd()
    while os.path.basename(current_dir).lower() != repo_name:
        current_dir = os.path.dirname(current_dir)
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

move_working_dir_to_repo_root(repo_name="orgsync")

# Define the base path and the file paths 
org_data_paths_head = os.path.join("data", "raw", "all_scraped")
file_paths = {
    "cordis FP7": os.path.join(org_data_paths_head, "cordis/2024_07/FP7/organization.json"),
    "cordis Horizon 2020": os.path.join(org_data_paths_head, "cordis/2024_07/Horizon 2020/organization.json"),
    "cordis Horizon Europe": os.path.join(org_data_paths_head, "cordis/2024_07/Horizon Europe/organization.json"),
    "gtr": os.path.join(org_data_paths_head, "gtr/scraped/2024_07/organisations.json")
}   

Current working directory:  c:\Users\dec2g\GitHub\OrgSync


In [2]:
def json_to_df(path:str):
    with open(path, 'r') as f:
        data = json.load(f)
    return pd.DataFrame(data)

In [3]:
def print_all_unique_vals_in_column(df, col_name):
    print("Unique values in column '{}':".format(col_name))
    unique_cols = df[col_name].unique()
    print(unique_cols)
    return unique_cols



Use the function below to verify that the included columns all have no unique values (all None).

In [4]:
def redundant_columns_gtr():
    df = json_to_df(file_paths["gtr"])
    print_all_unique_vals_in_column(df, "ext")
    print_all_unique_vals_in_column(df, "website")
    print_all_unique_vals_in_column(df, "regNumber")
    print_all_unique_vals_in_column(df, "outcomeid")

redundant_columns_gtr()

Unique values in column 'ext':
[None]
Unique values in column 'website':
[None]
Unique values in column 'regNumber':
[None]
Unique values in column 'outcomeid':
[None]


### Methods for extracting links from GtR data

In [7]:
def expand_dictionary_column(df, column_name:str):
    
    original_column = df[column_name].copy()
    # get new columns from dictionary
    new_columns = pd.json_normalize(original_column)

    df_level_one = pd.concat([new_columns, original_column], axis=1)
    return df_level_one

def map_each_dict_in_list_to_new_column(df_expanded, new_col_names):
    """
    Only works for address column
    """
    df_level_one = df_expanded
    col_of_dicts = []
    # go through each new column in level one
    for col in new_col_names:
        if type(df_level_one[col].iloc[0]) == list:
            print("true")
            # move each element of list to separate column
            list_to_cols = pd.json_normalize(df_level_one.copy()[col])
            col_of_dicts.append(list_to_cols)
    expanded_dfs = []
    for series in col_of_dicts:
        column_name = series.columns[0]
        print(f"Expanding column: {column_name}")
        expanded = expand_dictionary_column(series, column_name)
        expanded.drop(column_name, axis=1, inplace=True)
        expanded_dfs.append(expanded)

    expanded_df = pd.concat(expanded_dfs, axis=1)
    level_two = pd.concat([df_level_one, expanded_df], axis=1)
    level_two.drop(new_col_names, axis=1, inplace=True)
    return level_two

def expand_addresses(df):
    """
    Only works for address column
    """
    df_level_one = expand_dictionary_column(df, "addresses")
    new_col_names = [col for col in df_level_one.columns.to_list() if col not in original_df.columns.to_list()]
    print(new_col_names)
    df_level_one.head()
    return map_each_dict_in_list_to_new_column(df_level_one, new_col_names)

### Start with GtR data only

In [5]:
df = json_to_df(file_paths["gtr"])
# leep only the columns "links", "id", "href"
df = df[["links", "id", "href", "created"]]

df.head()




Unnamed: 0,links,id,href
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...
1,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,53331120-0290-49FA-A513-0286A214AF7A,http://gtr.ukri.org/gtr/api/organisations/5333...
2,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,77874202-2018-4677-8CFF-0868CD838659,http://gtr.ukri.org/gtr/api/organisations/7787...
3,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,77908BF8-1B2D-4D26-9119-155100E8B9C5,http://gtr.ukri.org/gtr/api/organisations/7790...
4,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,7794C645-9CC7-4913-A8DC-103AE0EFDD4B,http://gtr.ukri.org/gtr/api/organisations/7794...


In [75]:
def expand_dictionary_column(df, column_name:str):
    
    original_column = df[column_name].copy()
    # get new columns from dictionary
    new_columns = pd.json_normalize(original_column)

    df_level_one = pd.concat([new_columns, original_column], axis=1).drop(column_name, axis=1)
    df_result = pd.concat([df, df_level_one], axis=1).drop(column_name, axis=1)
    return df_result

def expand_links_gtr(df):
    #! add dropna first?
    
    print(f"original shape: {df.shape}")
    df = df[["links", "id", "href"]]
    df.rename(columns={"href": "href_org"}, inplace=True)
    df = expand_dictionary_column(df, "links") # get nested dict
    df.rename(columns={"link": "links"}, inplace=True)
    df = df.explode("links") # get lists from dictionary. 
    df.reset_index(drop=True, inplace=True)
    #? We now have a dataframe where each link dict is in a separate row, and each ID appears multiple times if it has multiple links
    # df.dropna(inplace=True) #! may not be needed?
    # df.reset_index(drop=True, inplace=True) #! may not be needed?

    # normlise to get the fields of the link dict in separate columns
    normalised = pd.json_normalize(df["links"])
    normalised.rename(columns={"href": "href_link"}, inplace=True)
    df = pd.concat([df.reset_index(drop=True), normalised.reset_index(drop=True)], axis=1)
    return df

def pprint_first_row_for_column(df, column_name):
    print(f"First row for column '{column_name}':")
    pprint(df[column_name].iloc[0])

In [76]:
df = json_to_df(file_paths["gtr"])
# remove 6000 rows from end
df = df[:-60000]
df = expand_links_gtr(df)
df.head()

original shape: (9067, 11)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"href": "href_org"}, inplace=True)


Unnamed: 0,id,href_org,links,href,rel,start,end
0,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,{'href': 'http://gtr.ukri.org/gtr/api/projects...,http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-...,PROJECT,,
1,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,{'href': 'http://gtr.ukri.org/gtr/api/projects...,http://gtr.ukri.org/gtr/api/projects/0D0F72CC-...,PROJECT,,
2,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,{'href': 'http://gtr.ukri.org/gtr/api/projects...,http://gtr.ukri.org/gtr/api/projects/0C6849FD-...,PROJECT,,
3,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,{'href': 'http://gtr.ukri.org/gtr/api/projects...,http://gtr.ukri.org/gtr/api/projects/C4059685-...,PROJECT,,
4,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,{'href': 'http://gtr.ukri.org/gtr/api/projects...,http://gtr.ukri.org/gtr/api/projects/F0E04953-...,PROJECT,,
