# Extract Cordis rows from df

In [1]:
# Import libraries
import pandas as pd
import ast
import numpy as np


In [2]:

# Cordis data will be in the labelled data
file_path = '../data/data.csv'

df = pd.read_csv(file_path)

# Formatting strings back into arrays
df['topics'] = df['topics'].apply(lambda x: ast.literal_eval(x) if x else [])

df.head()

Unnamed: 0,endDate,fundAmount,description,funder,url,status,name,originalID,fundAmountUK,dataSource,participants,topics,fundCurrency,grantCategory,startDate,fundScheme
0,2024-03-27 00:00:00+00:00,0,The search to understand the nature of the elu...,STFC,https://gtr.ukri.org/project/00014AFD-3C1F-410...,Active,Exploring the dark universe with quantum techn...,,,GTR,,[],GBP,Studentship,2020-09-28 00:00:00+00:00,
1,2019-05-31 00:00:00+00:00,71429,Chromition have developed innovative and disru...,HORIZON 2020,https://cordis.europa.eu/project/id/837088,CLOSED,DCDP - Digital Cancer Diagnosis Platform,837088.0,71429.0,CORDIS,,"[oncology, pathology, personalized medicine, p...",EUR,,2018-12-01 00:00:00+00:00,
2,2027-09-30 00:00:00+00:00,0,,BBSRC,https://gtr.ukri.org/project/00022364-C7A7-401...,Active,Exploring the role of vitamin transport in ins...,,,GTR,,[],GBP,Studentship,2023-10-01 00:00:00+00:00,
3,2014-09-30 00:00:00+00:00,325895,Our brains have to deal with ambiguity and unc...,BBSRC,https://gtr.ukri.org/project/0002BEF3-B070-448...,Closed,Bayesian issues in ant navigation,,,GTR,,"[Behavioural Ecology, Theoretical biology, Ani...",GBP,Research Grant,2011-08-01 00:00:00+00:00,
4,2019-02-28 00:00:00+00:00,149783,TB is transmitted via person-to-person aerosol...,HORIZON 2020,https://cordis.europa.eu/project/id/727695,CLOSED,TB-EPF - Enhanced Place Finding (EPF) of TB tr...,727695.0,31662.0,CORDIS,,"[public health, infectious diseases, tuberculo...",EUR,,2017-09-01 00:00:00+00:00,


### Using practices established in memory-test.csv

In [3]:
df = df.drop_duplicates(subset=['description', 'name'])
df = df.dropna(subset=['description'])
# Extract labelled data
df = df[df['topics'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
df.shape

(52664, 16)

In [4]:
# Extract rows where the 'dataSource' column is 'CORDIS'
df = df[df['dataSource'] == 'CORDIS']
# Filter down to description, name and topics
df = df[['description', 'name', 'topics']]
df.shape

(9839, 3)

In [5]:
df.head()

Unnamed: 0,description,name,topics
1,Chromition have developed innovative and disru...,DCDP - Digital Cancer Diagnosis Platform,"[oncology, pathology, personalized medicine, p..."
4,TB is transmitted via person-to-person aerosol...,TB-EPF - Enhanced Place Finding (EPF) of TB tr...,"[public health, infectious diseases, tuberculo..."
36,"""The origin of life is not well understood, an...",Autocat - Autocatalysis: A bottom-up approach ...,"[synthetic biology, catalysis, energy and fuels]"
70,"Functional encryption (FE), has been recently ...",FENTEC - Functional Encryption Technologies,"[internet of things, software, data protection..."
81,Project FLEX-RAIL has the vision to target a l...,FLEX-RAIL - Paradigm shifts for railway – Tech...,"[governance, business models]"


In [6]:
df.to_csv('../data/cordis-desc-name-topics.csv', index=False)

In [7]:
tel_topic_match = ["teleology","telecommunications","radio frequency","radar","mobile phones","bluetooth","WiFi","data networks","optical networks","microwave technology","radio technology","mobile radio","4G","LiFi","mobile network","radio and television","satellite radio","telecommunications networks","5G","fiber-optic network","cognitive radio","fixed wireless network",]

df["isTelecoms"] = df["topics"].apply(lambda s:  np.any([x in tel_topic_match for x in s]))

In [8]:
telecoms = df[df["isTelecoms"]]
telecoms.shape

(522, 4)

In [9]:
# Remove all topics that are not in tel_topic_match
telecoms["topics"] = telecoms["topics"].apply(lambda x: [y for y in x if y in tel_topic_match])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  telecoms["topics"] = telecoms["topics"].apply(lambda x: [y for y in x if y in tel_topic_match])


In [10]:
telecoms.head()

Unnamed: 0,description,name,topics,isTelecoms
258,Key objectives of METIS-II are to develop the ...,METIS-II - Mobile and wireless communications ...,"[5G, radio technology]",True
463,There is currently a high desire by manufactur...,interACT - Designing cooperative interaction o...,"[radar, mobile phones]",True
562,Delivering on the 5G promise of increased data...,5GCITY - 5GCITY,[5G],True
1228,From beacon fires in early civilizations to em...,Light UP - Visible Light Ultrafast Photodetect...,[radio technology],True
1389,5G-DRIVE will trial and validate the interoper...,5G-DRIVE - 5G HarmoniseD Research and TrIals f...,[5G],True


In [11]:
telecoms.to_csv('../data/cordis-telecoms.csv', index=False)

In [12]:
not_telecoms = df[~df["isTelecoms"]]

In [13]:
not_telecoms.shape

(9317, 4)

In [14]:
# Select 522 random samples from not_telecoms
not_telecoms = not_telecoms.sample(n=522, random_state=42)
not_telecoms.head()

Unnamed: 0,description,name,topics,isTelecoms
26533,Radiotherapy alone or in multimodality approac...,THERADNET - International NETwork for training...,"[immunology, oncology, personalized medicine]",False
129086,Age-related macular degeneration (AMD) is the ...,EYE-RISK - Exploring the combined role of gene...,"[epidemiology, proteins, ophthalmology, stem c...",False
120557,Rapidly rising CO2 levels threaten aquatic org...,"DOGMATiCC - Digestion, OsmoreGulation and Meta...","[fisheries, nutrition, climatic changes, homeo...",False
17245,This project aims to advance our understanding...,UnRi - Understanding the discourse-semantic sh...,[journalism],False
47958,"In today’s ageing European population, hearing...",HEAR-ECO - HEAR-ECO Innovative Hearing Aid Res...,"[public health, social psychology]",False


In [15]:
# combine with telecoms
combined = pd.concat([telecoms, not_telecoms])
# remove topics column
combined = combined.drop(columns=["topics"])
combined.shape

(1044, 3)

In [16]:
# save combined to csv
combined.to_csv('../data/cordis-binary-telecoms.csv', index=False)