In [9]:
# Import libraries
import pandas as pd
import ast


# Load data using cuDF for GPU acceleration
file_path = '../data/desc-name-topics.csv'

# Using cuDF to read the CSV file into GPU memory
df = pd.read_csv(file_path)

df['topics'] = df['topics'].apply(lambda x: ast.literal_eval(x) if x else [])

df.head()

Unnamed: 0,description,name,topics
0,The search to understand the nature of the elu...,Exploring the dark universe with quantum techn...,[]
1,Chromition have developed innovative and disru...,DCDP - Digital Cancer Diagnosis Platform,"[oncology, pathology, personalized medicine, p..."
2,,Exploring the role of vitamin transport in ins...,[]
3,Our brains have to deal with ambiguity and unc...,Bayesian issues in ant navigation,"[Behavioural Ecology, Theoretical biology, Ani..."
4,TB is transmitted via person-to-person aerosol...,TB-EPF - Enhanced Place Finding (EPF) of TB tr...,"[public health, infectious diseases, tuberculo..."


In [10]:
df.shape

(158407, 3)

### Finding repeated values so that they can be removed

In [11]:
# Print repeated values from the description and name columns
print(df['description'].value_counts())
print(df['name'].value_counts())


TBC 2024/25                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [12]:
# Lets remove all of the rows where the value in 'description' or 'name' appears more than once in the dataset
df = df.drop_duplicates(subset=['description', 'name'])
df.shape

(147475, 3)

In [13]:
# Remove rows where the 'description' column is missing a value
df = df.dropna(subset=['description'])
df.shape

(127405, 3)

### Separating into labelled and unlabelled

In [14]:
# Now we separate the data based on whether there is a value in the topics column or it is an empty list
# This will serve as our labelled and unlabelled data
labelled_data = df[df['topics'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
unlabelled_data = df[df['topics'].apply(lambda x: len(x) == 0 if isinstance(x, list) else True)]

print('labelled: \t',labelled_data.shape)
print('unlabelled: \t',unlabelled_data.shape)

labelled: 	 (52664, 3)
unlabelled: 	 (74741, 3)


In [15]:
labelled_data.to_csv('../data/labelled-desc-name-topics.csv', index=False)
unlabelled_data.to_csv('../data/unlabelled-desc-name-topics.csv', index=False)