In [1]:
import pandas as pd
import numpy as np

# Load the data
data_path = 'data/theming_data_iwgc_25_03_2024.xlsx'  # Update the path if needed
data = pd.read_excel(data_path)
print(data.head())

num_labels = 15

       Question                                           freetext  \
0  Improvements  Wash and sanitise properly instead of doing it...   
1  Improvements  Maybe have STR contacts number as only had the...   
2  Improvements  Take on the problem, rather than rushing to di...   
3  Improvements  Take on the problem, rather than rushing to di...   
4  Improvements  Your channels of communication are old fashion...   

                         Theme  
0                  Cleanliness  
1  CommunicationContactDetails  
2         TreatmentIneffective  
3              TreatmentRushed  
4  CommunicationContactDetails  


In [2]:
def prepare_dataset(dataset, new_filename, text_column_name, label_column_name):
    # dictionary of lists
    dict_dataset = {'text': dataset[text_column_name], 'label': dataset[label_column_name]}
    
    df = pd.DataFrame(dict_dataset)
        
    # saving the dataframe
    df.to_csv(new_filename)
    return df

df = prepare_dataset(data, 'theme.csv', 'freetext', 'Theme')

In [3]:
def keep_top_n_classes(df, num_labels):
    df = df.loc[df['label'].isin(df['label'].value_counts().head(num_labels).index)]
    return df
top_df = keep_top_n_classes(df, num_labels)

In [4]:
top_df.head(10)


Unnamed: 0,text,label
6,Pharmacy wait times \nWaited nearly an hour fo...,TimeWaitingUnspecified
13,to improve waiting time,TimeWaitingUnspecified
15,Giving the patient their diagnosis on paper ra...,CommunicationInformationForPatients
18,Surprised that there were only 3 appointments.,TreatmentMore
20,None really as far as District Nurses were con...,CommunicationWithPatients
21,None really as far as District Nurses were con...,TimeWaitingUnspecified
23,Have a client facilitate with staff as a volun...,StaffAvailability
24,Have a client facilitate with staff as a volun...,StaffKnowledgeTraining
30,More sessions,TreatmentMore
32,"Maurice was nosey, and overbearing,",StaffProfessionalism


In [5]:
for i, label in enumerate(top_df['label'].unique()):
        top_df['label'].replace({label:i}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_df['label'].replace({label:i}, inplace=True)


In [6]:
grouped_texts = top_df.groupby('text').agg({'label': lambda x: x.tolist()})
grouped_texts = grouped_texts.reset_index(drop=False)
grouped_texts['label'] = grouped_texts['label']
grouped_texts.head()

Unnamed: 0,text,label
0,\nStaffing shortages and cut backs are taking ...,[4]
1,#\nStaff very friendly always there if needed ...,"[10, 11]"
2,"'Cos it is good, they helped me and you have h...",[11]
3,* As someone who prefers face-to-face appointm...,"[3, 4, 8, 9]"
4,* Give use more preparation to what to bring t...,[1]


In [7]:
label_lists = grouped_texts['label'].tolist()
sparse_label_lists = []
for labels in label_lists:
    mat = np.zeros(num_labels)
    for label in labels:
        mat[label] = 1
    sparse_label_lists.append(mat)

In [8]:
grouped_texts['label'] = sparse_label_lists
grouped_texts.head()

Unnamed: 0,text,label
0,\nStaffing shortages and cut backs are taking ...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,#\nStaff very friendly always there if needed ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"'Cos it is good, they helped me and you have h...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,* As someone who prefers face-to-face appointm...,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
4,* Give use more preparation to what to bring t...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
grouped_texts.to_csv('multi-label-theme.csv')