## 1. Open GtR Dataset and Expand Nested Fields

In [1]:
import os, sys, json
import numpy as np
import pandas as pd
import re
import ast
from pprint import pprint

def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.
    """
    current_dir = os.getcwd()
    while os.path.basename(current_dir).lower() != repo_name:
        current_dir = os.path.dirname(current_dir)
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

move_working_dir_to_repo_root(repo_name="orgsync")

Current working directory:  c:\Users\dec2g\GitHub\OrgSync


Methods

In [2]:
def json_to_df(path:str):
    with open(path, 'r') as f:
        data = json.load(f)
    return pd.DataFrame(data)

def df_column_has_unique_values(df: pd.DataFrame, column:str, return_unique_values: bool=True): 
    """
    Prints summary of unique values in each column of the dataframe

    Printed outputs:
        - False: Column has the same value across all rows
        - True: Column has unique values
        - None: Column has unhashable type (e.g. list, dict, etc.)
        - (if return_unique_values=True), all unique values in the column printed
    """
    try:
        unique_values = df[column].unique()
        if len(unique_values) > 1:
            if return_unique_values:
                return unique_values
            else:
                return True
        else:
            return False
    except Exception as e:
        # print(e)
        return None
    
def check_columns_are_unique(df: pd.DataFrame):
    for col in df.columns:
        has_unique = df_column_has_unique_values(df, col, return_unique_values=False)
        if has_unique is not None:
            print(col, f" has unique values: {has_unique}")
        else:
            print(col, f" unhashable type: {df[col].dtype}")


def remove_columns_with_single_unique_value(df: pd.DataFrame):
    """
    Some columns have the same value across all rows. These columns are not useful for analysis.

    This function removes columns that have the same value across all rows.
    """
    for col in df.columns:
        has_unique = df_column_has_unique_values(df, col, return_unique_values=False)
        if has_unique is not None and not has_unique:
            df.drop(col, axis=1, inplace=True)
    return df


    



    


In [3]:
def expand_columns_containing_dictionaries(df, column_name:str):
    """
    Expand columns that contain nested dictionaries into separate columns.

    Structure of the nested dictionary:
    {
        key1:
            [
                {key2: value2},
                ...,
                {keyn: valuen}
            ]
    }

    We will ignore key1, and extract the values of key2, ..., keyn into separate columns.
    """
    df_dicts = df[[column_name]].copy()
    series = df_dicts[column_name]

    def get_series_type(series):
        column_values_type = series.apply(lambda x: type(x)).unique()
        if len(column_values_type) > 1:
            print(f"Column {column_name} contains multiple types: {column_values_type}")
            return None
        else:
            return column_values_type[0]

    def extract_series(df_dicts, column_name):
        new_columns = []
        series = df_dicts[column_name]
        series_type = get_series_type(series)
        if series_type[0] == dict:
            # map each key to a new column and values to the new column
            for key in series[0].keys():
                df_dicts[key] = series.apply(lambda x: x.get(key))
                new_columns.append(key)
        
        if series_type[0] == list:
            all_dicts = all(series.apply(lambda x: all([type(y) == dict for y in x])).to_list())
            if not all_dicts:
                print(f"Column {column_name} contains a list of non-dictionary values.")
                return df_dicts
            else: 
                df_dicts_exploded = df_dicts.explode(column_name)
             
        return df_dicts

            

In [4]:
# raw gtr data
gtr_orgs = "data/raw/all_scraped/gtr/scraped/2024_07/organisations.json"
df = json_to_df(gtr_orgs)
df.head()


Unnamed: 0,links,ext,id,outcomeid,href,created,updated,name,regNumber,website,addresses
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'..."
1,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,53331120-0290-49FA-A513-0286A214AF7A,,http://gtr.ukri.org/gtr/api/organisations/5333...,1704709432000,,VALERANN UK LIMITED,,,"{'address': [{'links': None, 'ext': None, 'id'..."
2,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,77874202-2018-4677-8CFF-0868CD838659,,http://gtr.ukri.org/gtr/api/organisations/7787...,1704709432000,,Baltic Sea Cultural Centre in Gdansk,,,{'address': []}
3,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,77908BF8-1B2D-4D26-9119-155100E8B9C5,,http://gtr.ukri.org/gtr/api/organisations/7790...,1704709432000,,Mindray,,,"{'address': [{'links': None, 'ext': None, 'id'..."
4,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,7794C645-9CC7-4913-A8DC-103AE0EFDD4B,,http://gtr.ukri.org/gtr/api/organisations/7794...,1704709432000,,Democracy International,,,"{'address': [{'links': None, 'ext': None, 'id'..."


In [5]:
print(type(df["links"].iloc[0]))

pprint(df["links"].iloc[1])

print(type(df["addresses"].iloc[0]))

pprint(df["addresses"].iloc[1])


<class 'dict'>
{'link': [{'end': None,
           'href': 'http://gtr.ukri.org/gtr/api/projects/B012B0D9-EEBC-414F-BA4A-8AF2B1898477',
           'otherAttributes': {},
           'rel': 'PROJECT',
           'start': None},
          {'end': None,
           'href': 'http://gtr.ukri.org/gtr/api/projects/B0359A89-AE42-4B13-8B85-21671966E3F6',
           'otherAttributes': {},
           'rel': 'PROJECT',
           'start': None},
          {'end': None,
           'href': 'http://gtr.ukri.org/gtr/api/persons/951DAD79-185A-4009-B3AE-1717EC2AF063',
           'otherAttributes': {},
           'rel': 'EMPLOYEE',
           'start': None}]}
<class 'dict'>
{'address': [{'city': None,
              'country': None,
              'county': None,
              'created': 1720391292000,
              'ext': None,
              'href': None,
              'id': 'AF7F7686-3DE3-475B-BCED-201E39037299',
              'line1': None,
              'line2': None,
              'line3': None,
        

# we want to blow up the links and addresses columns
1. Input: pandas series with dictionary containing list of dictionaries
   1. Step 1: Series of dictionaries containing lists of dictionaries
      1. Each row is: {"key": [{dict}, ... {dict}]}
      2. We want to create a new for each key in the top level dictionary, with column name = key, and rows = list of dictionaries
   2. Step 2: Series of lists containing dictionaries
      1. Each row is [{dict, ..., dict}]
         1. Addresses:
            1. For addresses, we can just create new columns for each of the dicts in the list, each dict is a separate field.
         2. Links:
            1. Multiple dictionaries in the list with the same fields linking to different publications etc.... So can't just map to new columns.  
   3. Step 3: Dictionary of values
      1. Each row is {"key": value}
      2. Transform the series of dictionaries into a dataframe with column name = key and rows = values


**For addresses:**
* next just remove any columns that are all Nans

In [6]:
original_df = df.copy()
original_column = df["addresses"].copy()
original_column.head()

0    {'address': [{'links': None, 'ext': None, 'id'...
1    {'address': [{'links': None, 'ext': None, 'id'...
2                                      {'address': []}
3    {'address': [{'links': None, 'ext': None, 'id'...
4    {'address': [{'links': None, 'ext': None, 'id'...
Name: addresses, dtype: object

In [7]:
# step 1: create new columns from keys in dictionary and values in dictionary
# expand dictionary into separate columns
def expand_dictionary_column(df, column_name:str):
    
    original_column = df[column_name].copy()
    # get new columns from dictionary
    new_columns = pd.json_normalize(original_column)

    df_level_one = pd.concat([new_columns, original_column], axis=1)
    return df_level_one

def map_each_dict_in_list_to_new_column(df_expanded, new_col_names):
    """
    Only works for address column
    """
    df_level_one = df_expanded
    col_of_dicts = []
    # go through each new column in level one
    for col in new_col_names:
        if type(df_level_one[col].iloc[0]) == list:
            print("true")
            # move each element of list to separate column
            list_to_cols = pd.json_normalize(df_level_one.copy()[col])
            col_of_dicts.append(list_to_cols)
    expanded_dfs = []
    for series in col_of_dicts:
        column_name = series.columns[0]
        print(f"Expanding column: {column_name}")
        expanded = expand_dictionary_column(series, column_name)
        expanded.drop(column_name, axis=1, inplace=True)
        expanded_dfs.append(expanded)

    expanded_df = pd.concat(expanded_dfs, axis=1)
    level_two = pd.concat([df_level_one, expanded_df], axis=1)
    level_two.drop(new_col_names, axis=1, inplace=True)
    return level_two

def expand_addresses(df):
    """
    Only works for address column
    """
    df_level_one = expand_dictionary_column(df, "addresses")
    new_col_names = [col for col in df_level_one.columns.to_list() if col not in original_df.columns.to_list()]
    print(new_col_names)
    df_level_one.head()
    return map_each_dict_in_list_to_new_column(df_level_one, new_col_names)

df_level_one = expand_dictionary_column(df, "addresses")
new_col_names = [col for col in df_level_one.columns.to_list() if col not in original_df.columns.to_list()]
print(new_col_names)
df_level_one.head()
df_level_two = map_each_dict_in_list_to_new_column(df_level_one, new_col_names)
df_level_two.head()


['address']
true
Expanding column: 0


Unnamed: 0,addresses,links,ext,id,outcomeid,href,created,updated,line1,line2,line3,line4,line5,city,county,postCode,region,country,type
0,"{'address': [{'links': None, 'ext': None, 'id'...",,,C20B6399-DBC6-4523-9C69-946A304A37D7,,,1720391000000.0,,,,,,,,,NE1 8QH,North East,,MAIN_ADDRESS
1,"{'address': [{'links': None, 'ext': None, 'id'...",,,AF7F7686-3DE3-475B-BCED-201E39037299,,,1720391000000.0,,,,,,,,,MK14 6GD,South East,,MAIN_ADDRESS
2,{'address': []},,,,,,,,,,,,,,,,,,
3,"{'address': [{'links': None, 'ext': None, 'id'...",,,51E6A87A-834A-4596-9054-C2D76E37A389,,,1720391000000.0,,,,,,,,,PE29 6FN,East of England,,MAIN_ADDRESS
4,"{'address': [{'links': None, 'ext': None, 'id'...",,,61C06F28-0616-4787-8974-781CD5997724,,,1720391000000.0,,,,,,,,,20814,Unknown,,MAIN_ADDRESS


For Links: We want to get the list of dicts as new rows as there are multiple links for a given entry. 

In [8]:
original_df = df.copy()
original_column = df["links"].copy()
original_column.head()

0    {'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
1    {'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
2    {'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
3    {'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
4    {'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
Name: links, dtype: object

Lots of dictionaries for one row of the original dataframe, each with the same fields...

In [9]:
links_level_one = expand_dictionary_column(df, "links")
# print first row
pprint(links_level_one["link"].iloc[0])
links_level_one.head()

[{'end': None,
  'href': 'http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-B732-4218-B0E3-4FFBF3DDC906',
  'otherAttributes': {},
  'rel': 'PROJECT',
  'start': None},
 {'end': None,
  'href': 'http://gtr.ukri.org/gtr/api/projects/0D0F72CC-0163-47CE-A462-5FDDBA4C1C38',
  'otherAttributes': {},
  'rel': 'PROJECT',
  'start': None},
 {'end': None,
  'href': 'http://gtr.ukri.org/gtr/api/projects/0C6849FD-CA2D-49A9-80D4-75F710980208',
  'otherAttributes': {},
  'rel': 'PROJECT',
  'start': None},
 {'end': None,
  'href': 'http://gtr.ukri.org/gtr/api/projects/C4059685-9263-44E6-B89A-D8609FCC1360',
  'otherAttributes': {},
  'rel': 'PROJECT',
  'start': None},
 {'end': None,
  'href': 'http://gtr.ukri.org/gtr/api/projects/F0E04953-58F2-46FB-9837-3C341A3D3165',
  'otherAttributes': {},
  'rel': 'PROJECT',
  'start': None},
 {'end': None,
  'href': 'http://gtr.ukri.org/gtr/api/projects/D9A401D9-6D91-4DA6-9012-7137774B1DD8',
  'otherAttributes': {},
  'rel': 'PROJECT',
  'start': None},
 {'end': N

Unnamed: 0,link,links
0,[{'href': 'http://gtr.ukri.org/gtr/api/project...,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
1,[{'href': 'http://gtr.ukri.org/gtr/api/project...,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
2,[{'href': 'http://gtr.ukri.org/gtr/api/project...,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
3,[{'href': 'http://gtr.ukri.org/gtr/api/project...,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...
4,[{'href': 'http://gtr.ukri.org/gtr/api/project...,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...


For now lets just expand out into rows so that we have multiple rows per project

In [10]:
df = original_df.copy()
df_link_expanded = expand_dictionary_column(df, "links").drop("links", axis=1)
df = pd.concat([df, df_link_expanded], axis=1)
# df['link'] = df['link'].apply(ast.literal_eval)
df_exploded = df.explode('link')

df_exploded.head()
print(len(df_exploded))
print(len(df))


671161
69067


In [11]:
df_exploded.head()

Unnamed: 0,links,ext,id,outcomeid,href,created,updated,name,regNumber,website,addresses,link
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'...",{'href': 'http://gtr.ukri.org/gtr/api/projects...
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'...",{'href': 'http://gtr.ukri.org/gtr/api/projects...
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'...",{'href': 'http://gtr.ukri.org/gtr/api/projects...
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'...",{'href': 'http://gtr.ukri.org/gtr/api/projects...
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'...",{'href': 'http://gtr.ukri.org/gtr/api/projects...


In [12]:
df_id_links = df_exploded[["id", "link", "href"]].copy()
# reset index
df_id_links.reset_index(drop=True, inplace=True)
df_id_links.head()

# remove nans
df_id_links.dropna(inplace=True)
print(len(df_id_links))




670709


In [13]:
df_id_links.head()
len(df_id_links)

670709

In [14]:
df_id_links = df_exploded[["id", "link", "href"]].copy()
# rename href to href_org
df_id_links.rename(columns={"href": "href_org"}, inplace=True)
print(len(df_id_links))
df_id_links.dropna(inplace=True)
print(len(df_id_links))
normalised = pd.json_normalize(df_id_links["link"])
print(len(normalised))
df_id_links = pd.concat([df_id_links[["id", "href_org"]].reset_index(drop=True), normalised.reset_index(drop=True)], axis=1)
print(len(df_id_links))
# drop columns "index" and "index"

671161
670709
670709
670709


In [15]:

# save
df_id_links.to_csv("data/processed/gtr_organisations_links.csv", index=False)

print(df_id_links["href_org"].iloc[0])

http://gtr.ukri.org/gtr/api/organisations/5331B126-3AB4-4412-B56D-00E8F2796556


In [16]:
df_id_links.head(50)


Unnamed: 0,id,href_org,href,rel,start,end
0,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-...,PROJECT,,
1,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/0D0F72CC-...,PROJECT,,
2,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/0C6849FD-...,PROJECT,,
3,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/C4059685-...,PROJECT,,
4,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/F0E04953-...,PROJECT,,
5,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/D9A401D9-...,PROJECT,,
6,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/EC918CE3-...,PROJECT,,
7,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/ADB18319-...,PROJECT,,
8,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/DCFE5B2E-...,PROJECT,,
9,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/1E0BB71C-...,PROJECT,,


# On GTR...
Each Organisation href links to a list of associated projects and employees.
* https://gtr.ukri.org/gtr/api/organisations/5331B126-3AB4-4412-B56D-00E8F2796556

* Links to EMPLOYEE (e.g. https://gtr.ukri.org/gtr/api/persons/0CB4A538-AC14-4394-A10F-C9F955033EF3) give first name and last name. 
* Links to PROJECT give a lot of fields including "RESEARCH_PER" which gives the person associated with the project.
  * Several fields ending with PER for people associated
  * Several fields ending with ORG for organisations associated
* COLLABORATION field is an outcome thatcontains further related organisation information. 
* PARTICIPATION doesn't seem to have much
* FURTHER FUNDING lists the sector of the orgnaisation, could be useful 
* PUBLICATION field is useful - contains link to publication URL that can be used to match additional fields (organisation and authors), doi, and author name. 
  * https://gtr.ukri.org/gtr/api/outcomes/publications/0B3A64BD-9D0B-42DC-96EC-4BA34632C9C7
  *  

Projects have three additional sections
* Organisations
* People

Employee
  

In [17]:
# convert to nested dictionary
# {id: rows from link with the same id}
id_links = df_id_links.groupby("id") # .apply(lambda x: x.to_dict(orient='records')).to_dict()
id_links.head()

Unnamed: 0,id,href_org,href,rel,start,end
0,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-...,PROJECT,,
1,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/0D0F72CC-...,PROJECT,,
2,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/0C6849FD-...,PROJECT,,
3,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/C4059685-...,PROJECT,,
4,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,http://gtr.ukri.org/gtr/api/projects/F0E04953-...,PROJECT,,
...,...,...,...,...,...,...
670581,DEAD23AB-66BF-4D61-BC10-2CC8BF055693,http://gtr.ukri.org/gtr/api/organisations/DEAD...,http://gtr.ukri.org/gtr/api/projects/EE39AFFE-...,PROJECT,,
670582,DEAD23AB-66BF-4D61-BC10-2CC8BF055693,http://gtr.ukri.org/gtr/api/organisations/DEAD...,http://gtr.ukri.org/gtr/api/projects/1717F5F9-...,PROJECT,,
670583,DEAD23AB-66BF-4D61-BC10-2CC8BF055693,http://gtr.ukri.org/gtr/api/organisations/DEAD...,http://gtr.ukri.org/gtr/api/projects/E6A07D25-...,PROJECT,,
670707,0D4FDCE2-32C8-4FC1-A582-331BEE2B5C1F,http://gtr.ukri.org/gtr/api/organisations/0D4F...,http://gtr.ukri.org/gtr/api/projects/8D402FF2-...,PROJECT,,


In [18]:
# lets just look at the link column for now by converting to list
link_list = df_exploded["link"].to_list()
for link_dict in link_list[:5]:
    pprint(link_dict)

{'end': None,
 'href': 'http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-B732-4218-B0E3-4FFBF3DDC906',
 'otherAttributes': {},
 'rel': 'PROJECT',
 'start': None}
{'end': None,
 'href': 'http://gtr.ukri.org/gtr/api/projects/0D0F72CC-0163-47CE-A462-5FDDBA4C1C38',
 'otherAttributes': {},
 'rel': 'PROJECT',
 'start': None}
{'end': None,
 'href': 'http://gtr.ukri.org/gtr/api/projects/0C6849FD-CA2D-49A9-80D4-75F710980208',
 'otherAttributes': {},
 'rel': 'PROJECT',
 'start': None}
{'end': None,
 'href': 'http://gtr.ukri.org/gtr/api/projects/C4059685-9263-44E6-B89A-D8609FCC1360',
 'otherAttributes': {},
 'rel': 'PROJECT',
 'start': None}
{'end': None,
 'href': 'http://gtr.ukri.org/gtr/api/projects/F0E04953-58F2-46FB-9837-3C341A3D3165',
 'otherAttributes': {},
 'rel': 'PROJECT',
 'start': None}


In [19]:
# remove nans from link_list
link_list = [x for x in link_list if type(x) == dict]
types_bool = [x for x in link_list if not type(x) == dict]
print(types_bool)

[]


In [20]:


# 

In [21]:
df_from_list = pd.DataFrame(link_list)
df_from_list.head()sa


SyntaxError: invalid syntax (1771813794.py, line 2)

In [148]:
print(df_from_list["rel"].unique())

['PROJECT' 'EMPLOYEE']


In [131]:
from collections import defaultdict

def find_unique_values(list_of_dicts):    
    for d in list_of_dicts:
        for key, value in d.items():
            result[key].add(value)
    # Convert sets to lists for the final output
    return {k: list(v) for k, v in result.items()}

unique_values = find_unique_values(link_list)

AttributeError: 'list' object has no attribute 'add'

In [184]:
df_exploded.head()
print(df_exploded["link"].iloc[0])
print(df_exploded["link"].iloc[1])
print(df_exploded["link"].iloc[2])
print(df_exploded["link"].iloc[3])
print(df_exploded["link"].iloc[4])
print(df_exploded["link"].iloc[5])
print(df_exploded["link"].iloc[6])
print(df_exploded["link"].iloc[7])
print(df_exploded["link"].iloc[8])
print(df_exploded["link"].iloc[9])
print(df_exploded["link"].iloc[10])
print(df_exploded["link"].iloc[11])
print(df_exploded["link"].iloc[12])
print(df_exploded["link"].iloc[13])
print(df_exploded["link"].iloc[14])
print(df_exploded["link"].iloc[15])
print(df_exploded["link"].iloc[16])
print(df_exploded["link"].iloc[17])
print(df_exploded["link"].iloc[18])
print(df_exploded["link"].iloc[19])
print(df_exploded["link"].iloc[20])
print(df_exploded["link"].iloc[21])
print(df_exploded["link"].iloc[22])
print(df_exploded["link"].iloc[23])
print(df_exploded["link"].iloc[24])
print(df_exploded["link"].iloc[25])
print(df_exploded["link"].iloc[26])



{'href': 'http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-B732-4218-B0E3-4FFBF3DDC906', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}
{'href': 'http://gtr.ukri.org/gtr/api/projects/0D0F72CC-0163-47CE-A462-5FDDBA4C1C38', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}
{'href': 'http://gtr.ukri.org/gtr/api/projects/0C6849FD-CA2D-49A9-80D4-75F710980208', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}
{'href': 'http://gtr.ukri.org/gtr/api/projects/C4059685-9263-44E6-B89A-D8609FCC1360', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}
{'href': 'http://gtr.ukri.org/gtr/api/projects/F0E04953-58F2-46FB-9837-3C341A3D3165', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}
{'href': 'http://gtr.ukri.org/gtr/api/projects/D9A401D9-6D91-4DA6-9012-7137774B1DD8', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}
{'href': 'http://gtr.ukri.org/gtr/api/projects/EC918CE3-4862-4EED-884A

In [110]:
# finally, as before, blow up the link column
# def expand_link_column_dicts(df):
#     """
#     Only works for address column
#     """
#     df_level_one = expand_dictionary_column(df, "addresses")
#     new_col_names = [col for col in df_level_one.columns.to_list() if col not in original_df.columns.to_list()]
#     print(new_col_names)
#     df_level_one.head()
#     return map_each_dict_in_list_to_new_column(df_level_one, new_col_names)

df_links = expand_dictionary_column(df, "link")
df_links.head()


KeyboardInterrupt: 

In [45]:
# Step 2: (for creating rows) create new columns for each dictionary in the list of dictionaries
# collect each dictionary column in the list of dictionaries
exploded_dfs = []

# go through each new column in level one
for col in new_col_names:
    if type(df_level_one[col].iloc[0]) == list:
        # Step 2: explode the list of dictionaries into separate rows
        explode = df_level_one.copy().explode(col)[col]
        # Step 3
        exploded_dfs.append(explode)


# pprint first element
# pprint(exploded_dfs[0].iloc[0])
# for exploded_in_e
# new_columns = pd.concat(exploded_dfs, axis=1)
# new_columns.head()
        # new_columns = pd.json_normalize(df_normalised[col])

0    {'href': 'http://gtr.ukri.org/gtr/api/projects...
0    {'href': 'http://gtr.ukri.org/gtr/api/projects...
0    {'href': 'http://gtr.ukri.org/gtr/api/projects...
0    {'href': 'http://gtr.ukri.org/gtr/api/projects...
0    {'href': 'http://gtr.ukri.org/gtr/api/projects...
Name: link, dtype: object

In [4]:
def explode_iteratively(df, column_name):
    series = df[column_name]

    def get_series_type(series):
        column_values_type = series.apply(lambda x: type(x)).unique()
        if len(column_values_type) > 1:
            print(f"Column {column_name} contains multiple types: {column_values_type}")
            return None
        else:
            return column_values_type[0]
        
    series_type = get_series_type(series)

    if series_type == dict:
        

SyntaxError: incomplete input (4090969941.py, line 15)

In [10]:

df_dicts = df[["addresses"]].copy()
series = df_dicts["addresses"]
# print type of all values in the column if all same
print(series.apply(lambda x: type(x)).unique())
df_dict = df.copy()
for key in series[0].keys():
    df_dicts[key] = series.apply(lambda x: x.get(key))
df_dicts.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/orgsync.json'

In [107]:
column_name = "addresses"
df_dicts = df[[column_name]].copy()
df_dicts.head()
df_normalised = pd.concat([pd.json_normalize(df_dicts[column_name]), df_dicts], axis=1)
df_normalised.head()

drop_names = [column_name]

column_name = df_normalised.columns.to_list()[0]
df_exploded = df_normalised.explode(column_name)
df_exploded.head()

drop_names.append(column_name)

df_normalised = pd.concat([pd.json_normalize(df_exploded[column_name]), df_exploded], axis=1)
df_normalised.head()

# drop all drop names
df_normalised.drop(drop_names, axis=1, inplace=True)
df_normalised.head()

Unnamed: 0,links,ext,id,outcomeid,href,created,updated,line1,line2,line3,line4,line5,city,county,postCode,region,country,type
0,,,C20B6399-DBC6-4523-9C69-946A304A37D7,,,1720391000000.0,,,,,,,,,NE1 8QH,North East,,MAIN_ADDRESS
1,,,AF7F7686-3DE3-475B-BCED-201E39037299,,,1720391000000.0,,,,,,,,,MK14 6GD,South East,,MAIN_ADDRESS
2,,,,,,,,,,,,,,,,,,
3,,,51E6A87A-834A-4596-9054-C2D76E37A389,,,1720391000000.0,,,,,,,,,PE29 6FN,East of England,,MAIN_ADDRESS
4,,,61C06F28-0616-4787-8974-781CD5997724,,,1720391000000.0,,,,,,,,,20814,Unknown,,MAIN_ADDRESS


In [90]:
df_exploded = df_dicts.explode("address")
# df_normalised = pd.json_normalize(df_exploded["addresses"])
# df_normalised.head()
df_exploded.head()

# # if still list of dictionaries, explode again
# df_exploded = df_exploded.explode("address")
# # 
# df_exploded.head()



Unnamed: 0,addresses,address
0,"{'address': [{'links': None, 'ext': None, 'id'...","{'links': None, 'ext': None, 'id': 'C20B6399-D..."
1,"{'address': [{'links': None, 'ext': None, 'id'...","{'links': None, 'ext': None, 'id': 'AF7F7686-3..."
2,{'address': []},
3,"{'address': [{'links': None, 'ext': None, 'id'...","{'links': None, 'ext': None, 'id': '51E6A87A-8..."
4,"{'address': [{'links': None, 'ext': None, 'id'...","{'links': None, 'ext': None, 'id': '61C06F28-0..."


In [91]:
df_normalised = pd.json_normalize(df_exploded["address"])
df_normalised.head()

Unnamed: 0,links,ext,id,outcomeid,href,created,updated,line1,line2,line3,line4,line5,city,county,postCode,region,country,type
0,,,C20B6399-DBC6-4523-9C69-946A304A37D7,,,1720391000000.0,,,,,,,,,NE1 8QH,North East,,MAIN_ADDRESS
1,,,AF7F7686-3DE3-475B-BCED-201E39037299,,,1720391000000.0,,,,,,,,,MK14 6GD,South East,,MAIN_ADDRESS
2,,,,,,,,,,,,,,,,,,
3,,,51E6A87A-834A-4596-9054-C2D76E37A389,,,1720391000000.0,,,,,,,,,PE29 6FN,East of England,,MAIN_ADDRESS
4,,,61C06F28-0616-4787-8974-781CD5997724,,,1720391000000.0,,,,,,,,,20814,Unknown,,MAIN_ADDRESS


In [67]:
print(df_dicts["address"].apply(lambda x: type(x)).unique())
print(all(df_dicts["address"].apply(lambda x: all([type(y) == dict for y in x])).to_list()))
df_lists = df_dicts.explode("address")
df_normalized = pd.json_normalize(df_lists["address"])
# df_lists.head()
df_normalized.head()

[<class 'list'>]
True


Unnamed: 0,links,ext,id,outcomeid,href,created,updated,line1,line2,line3,line4,line5,city,county,postCode,region,country,type
0,,,C20B6399-DBC6-4523-9C69-946A304A37D7,,,1720391000000.0,,,,,,,,,NE1 8QH,North East,,MAIN_ADDRESS
1,,,AF7F7686-3DE3-475B-BCED-201E39037299,,,1720391000000.0,,,,,,,,,MK14 6GD,South East,,MAIN_ADDRESS
2,,,,,,,,,,,,,,,,,,
3,,,51E6A87A-834A-4596-9054-C2D76E37A389,,,1720391000000.0,,,,,,,,,PE29 6FN,East of England,,MAIN_ADDRESS
4,,,61C06F28-0616-4787-8974-781CD5997724,,,1720391000000.0,,,,,,,,,20814,Unknown,,MAIN_ADDRESS


There are two files containing the GtR data but currently these are identical, so we just need to load one of them.
* `data/raw/all_scraped/gtr/scraped/2024_07/organisations.json`
* `data/raw/all_scraped/gtr/scraped/2024_07/organisations_2.json`

In [40]:
print(df_dicts["links"][0])

{'link': [{'href': 'http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-B732-4218-B0E3-4FFBF3DDC906', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}, {'href': 'http://gtr.ukri.org/gtr/api/projects/0D0F72CC-0163-47CE-A462-5FDDBA4C1C38', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}, {'href': 'http://gtr.ukri.org/gtr/api/projects/0C6849FD-CA2D-49A9-80D4-75F710980208', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}, {'href': 'http://gtr.ukri.org/gtr/api/projects/C4059685-9263-44E6-B89A-D8609FCC1360', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}, {'href': 'http://gtr.ukri.org/gtr/api/projects/F0E04953-58F2-46FB-9837-3C341A3D3165', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}, {'href': 'http://gtr.ukri.org/gtr/api/projects/D9A401D9-6D91-4DA6-9012-7137774B1DD8', 'rel': 'PROJECT', 'start': None, 'end': None, 'otherAttributes': {}}, {'href': 'http://gtr.ukri.org/gtr/api/projects/EC918CE

In [37]:
df_ = expand_columns_containing_dictionaries(df_dicts, "links")
df_.head()

Unnamed: 0,ext,id,outcomeid,href,created,updated,name,regNumber,website,addresses,links_0
0,,5331B126-3AB4-4412-B56D-00E8F2796556,,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,,NEWCASTLE CITY COUNCIL,,,"{'address': [{'links': None, 'ext': None, 'id'...",
1,,53331120-0290-49FA-A513-0286A214AF7A,,http://gtr.ukri.org/gtr/api/organisations/5333...,1704709432000,,VALERANN UK LIMITED,,,"{'address': [{'links': None, 'ext': None, 'id'...",
2,,77874202-2018-4677-8CFF-0868CD838659,,http://gtr.ukri.org/gtr/api/organisations/7787...,1704709432000,,Baltic Sea Cultural Centre in Gdansk,,,{'address': []},
3,,77908BF8-1B2D-4D26-9119-155100E8B9C5,,http://gtr.ukri.org/gtr/api/organisations/7790...,1704709432000,,Mindray,,,"{'address': [{'links': None, 'ext': None, 'id'...",
4,,7794C645-9CC7-4913-A8DC-103AE0EFDD4B,,http://gtr.ukri.org/gtr/api/organisations/7794...,1704709432000,,Democracy International,,,"{'address': [{'links': None, 'ext': None, 'id'...",


The dataset has the following fields:
* links	
* ext	
* id	
* outcomeid	
* href	
* created	
* updated	
* name	
* regNumber	
* website	
* addresses

`links` and `addresses` are nested dictionaries and need to be expanded. 

Some columns have the same value across all rows. These columns are not useful for analysis.

Remove all columns where each row contains the same value. 

In [32]:
check_columns_are_unique(df_dicts)
df_ = remove_columns_with_single_unique_value(df_dicts)
df_.head()

links  unhashable type: object
ext  has unique values: False
id  has unique values: True
outcomeid  has unique values: False
href  has unique values: True
created  has unique values: True
updated  has unique values: False
name  has unique values: True
regNumber  has unique values: False
website  has unique values: False
addresses  unhashable type: object


Unnamed: 0,links,id,href,created,name,addresses
0,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,5331B126-3AB4-4412-B56D-00E8F2796556,http://gtr.ukri.org/gtr/api/organisations/5331...,1704709432000,NEWCASTLE CITY COUNCIL,"{'address': [{'links': None, 'ext': None, 'id'..."
1,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,53331120-0290-49FA-A513-0286A214AF7A,http://gtr.ukri.org/gtr/api/organisations/5333...,1704709432000,VALERANN UK LIMITED,"{'address': [{'links': None, 'ext': None, 'id'..."
2,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,77874202-2018-4677-8CFF-0868CD838659,http://gtr.ukri.org/gtr/api/organisations/7787...,1704709432000,Baltic Sea Cultural Centre in Gdansk,{'address': []}
3,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,77908BF8-1B2D-4D26-9119-155100E8B9C5,http://gtr.ukri.org/gtr/api/organisations/7790...,1704709432000,Mindray,"{'address': [{'links': None, 'ext': None, 'id'..."
4,{'link': [{'href': 'http://gtr.ukri.org/gtr/ap...,7794C645-9CC7-4913-A8DC-103AE0EFDD4B,http://gtr.ukri.org/gtr/api/organisations/7794...,1704709432000,Democracy International,"{'address': [{'links': None, 'ext': None, 'id'..."
