## Loading/Preprocessing the NHS TRUD DATA into a SQLite database

In [1]:
#Loading the TRUD Data into a SQLite database
#Loading and Imporrting relavent packages
import sqlite3
import pandas as pd

# Processing and Loading the description table
decrip_df = pd.read_csv("/Users/laurakhaukha/Desktop/Diss_data/SnomedCT_UKClinicalRF2_PRODUCTION_20241120T000001Z/Snapshot/Terminology/sct2_Description_UKCLSnapshot-en_GB1000000_20241120.txt", sep="\t", dtype=str)
decrip_df.columns = map(str.lower, decrip_df.columns) #standardise column names, making everything lowercase 
decrip_df = decrip_df[decrip_df["active"] == "1"] #Filtering the active entries to include for the model, take away the inactive ones
#/Users/laurakhaukha/Desktop/Diss_data/SnomedCT_UKClinicalRefsetsRF2_PRODUCTION_20241120T000001Z/Snapshot/Terminology/sct2_Description_UKCRSnapshot-en_GB1000000_20241120.txt

# Processing and loading the relationships table
rela_df = pd.read_csv("/Users/laurakhaukha/Desktop/Diss_data/SnomedCT_UKClinicalRF2_PRODUCTION_20241120T000001Z/Snapshot/Terminology/sct2_Relationship_UKCLSnapshot_GB1000000_20241120.txt", sep="\t", dtype=str)
rela_df.columns = map(str.lower, rela_df.columns) #standardise column names, making everything lowercase 
rela_df = rela_df[rela_df["active"] == "1"]  #Filtering the active entries to include for the model, take away the inactive ones

# # Preprocessing and loading the language tables
# langu_df = pd.read_csv("/Users/laurakhaukha/Desktop/Diss_data/SnomedCT_UKClinicalRF2_PRODUCTION_20241120T000001Z/Snapshot/Refset/Language/der2_cRefset_LanguageUKCRSnapshot-en_GB1000000_20241120.txt",sep="\t", dtype=str)
# langu_df.columns = langu_df.columns.str.lower()
# langu_df = langu_df[langu_df["active"] == "1"]

# Creating the SQLite database
conn = sqlite3.connect("snomedct_data.db")

In [3]:
#Writing the description and relationship tables to the database
decrip_df.to_sql("TRUD_descriptiontable", conn, if_exists="replace", index=False)
rela_df.to_sql("TRUD_relationshiptable", conn, if_exists="replace", index=False)
conn.commit()
conn.close()

# Verifying the database 
conn = sqlite3.connect("snomedct_data.db")
cur = conn.cursor() #Establisshing the Cursor object, used to execute SQL commands
#Listing the tables 
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cur.fetchall())

#Looking at the contents
sqllite_df = pd.read_sql_query("SELECT * FROM TRUD_descriptiontable LIMIT 10", conn)
print(sqllite_df.head())
 #Closing the connecting

#I do not need moduleid and casesignificanceid for the prompts 
# sourceid and concpetid Are the same 
# relationship table descirebs the child and parent relationships 

[('TRUD_descriptiontable',), ('TRUD_relationshiptable',)]
            id effectivetime active            moduleid       conceptid  \
0  11000000117      20040131      1  999000011000000103  20881000000105   
1  21000000111      20040131      1  999000011000000103     11000000101   
2  21000237110      20230607      1  999000011000000103    861000237103   
3  31000000113      20040131      1  999000011000000103  20891000000107   
4  31000237112      20230607      1  999000011000000103   6561000237105   

  languagecode              typeid  \
0           en  900000000000013009   
1           en  900000000000013009   
2           en  900000000000003001   
3           en  900000000000013009   
4           en  900000000000013009   

                                                term  casesignificanceid  
0                           Read Code Administration  900000000000020002  
1                         Read Code Administration 1  900000000000020002  
2  Mass concentration of drug in urin

## EDA (Exploriratory Data Analysis) 

In [5]:
conn_1 = sqlite3.connect("snomedct_data.db")
pd.read_sql_query("SELECT * FROM TRUD_descriptiontable LIMIT 5", conn_1)

Unnamed: 0,id,effectivetime,active,moduleid,conceptid,languagecode,typeid,term,casesignificanceid
0,11000000117,20040131,1,999000011000000103,20881000000105,en,900000000000013009,Read Code Administration,900000000000020002
1,21000000111,20040131,1,999000011000000103,11000000101,en,900000000000013009,Read Code Administration 1,900000000000020002
2,21000237110,20230607,1,999000011000000103,861000237103,en,900000000000003001,Mass concentration of drug in urine (observabl...,900000000000020002
3,31000000113,20040131,1,999000011000000103,20891000000107,en,900000000000013009,Read Code Administration 2,900000000000020002
4,31000237112,20230607,1,999000011000000103,6561000237105,en,900000000000013009,Calcium percent in calculus,900000000000020002


In [7]:
#EDA, finding the concept id that is the most common: 
concptcounts = decrip_df["conceptid"].value_counts().reset_index()
print(F"{concptcounts}: Concpet ID counts")
concptcounts.head(43)

#253571000000106

                 conceptid  count
0          466891000000108     20
1          386721000000105     14
2          256691000000101     14
3          413681000000103     14
4          466871000000109     13
...                    ...    ...
100386     431781000000100      2
100387     390911000000104      2
100388     414021000000105      2
100389  999003041000000106      2
100390    1110451000000102      1

[100391 rows x 2 columns]: Concpet ID counts


Unnamed: 0,conceptid,count
0,466891000000108,20
1,386721000000105,14
2,256691000000101,14
3,413681000000103,14
4,466871000000109,13
5,470791000000108,12
6,368851000000106,12
7,401991000000109,12
8,468611000000102,12
9,431821000000108,12


In [9]:
#EDA
concept_id__ = "253571000000106"
query = f"""SELECT term, typeid FROM TRUD_descriptiontable WHERE conceptid = '{concept_id__}'"""
pd.read_sql_query(query, conn)

Unnamed: 0,term,typeid
0,Implantation of neurostimulator into periphera...,900000000000013009
1,Implantation of electronic stimulator into per...,900000000000013009
2,Implantation of electrode into peripheral nerve,900000000000013009
3,Insertion of electrode into peripheral nerve,900000000000013009
4,Insertion of pacemaker into peripheral nerve,900000000000013009
5,Insertion of neural pacemaker into peripheral ...,900000000000013009
6,Implantation of neuropacemaker into peripheral...,900000000000013009
7,Insertion of neurostimulator electrodes into p...,900000000000003001
8,Insertion of neurostimulator electrodes into p...,900000000000013009


In [None]:
#Preporcoessing the data 
# map type ids to the type label: 900000000000003001(FSN), 900000000000013009: (Synonym)
# Potentiallly Join relashionship and description table on concept id and source id 
#Filter to the UK- preferred terms using the language table : Filter the synonoym with typeid 
#Keep useful columns for the prompts: conceptid, term, typeid, sourceid, destinationid, typeid 
#Creating Prompts 
# Example 
# #Defining the tables used in the prompts
# schem = """ Given the tables:
#      TRUD_descriptiontable(conceptid, term, typeid) TRUD_relationshiptable(sourceid, destinationid, typeid)
#      Provide a SQL Query to answer the following question """

# # Create an expected SQL query
# # Find concept ids that are rich in info 
# Given tables : ... 
# What are the synonyms for "term" 
# What is the code for term " "
# Count the number of synonyms, concepts etc...
# Get all the descrioptions based on the type id 

## Creating the PROMPTS

In [25]:
# Category 1: Looking up concepts 
# 10 SQL-NL prompts 
#Ttypeid :900000000000003001, Fuly specified name
#Typepid: 900000000000013009, synonym 
#Typepid: 900000000000013009, synonym 
#900000000000550004	Definition

slq_promts_category1 = [
    ("What is the preferred term for concept ID 466891000000108 ?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '466891000000108' AND typeid = '900000000000003001'"),
    ("What is the full name for concept ID 386721000000105?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '386721000000105' AND typeid = '900000000000003001'"),
    ("Find the preferred name of concept ID 413681000000103.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '413681000000103' AND typeid = '900000000000013009'"),
    ("What is the English name of concept ID 466871000000109 ?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '466871000000109' AND languagecode = 'en' AND typeid = '900000000000003001'"),
    ("Give the preferred description of concept ID 470791000000108.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '470791000000108' AND typeid = '900000000000003001'"),
    ("What description is used as the preferred term for concept ID 368851000000106?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '368851000000106' AND typeid = '900000000000003001'"),
    ("What is the label of concept 401991000000109?",
     "SELECT term FROM TRUD_descriptiontable WHERE concecptid = '401991000000109' AND typeid = '900000000000003001'"),
    ("Get the preferred English label for conceptID 431821000000108.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '431821000000108	' AND typeid = '900000000000003001' AND languagecode = 'en'"),
    ("What name is preferred for the concept 443791000000100?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '443791000000100' AND typeid = '900000000000013009'"),
    ("Give me the preferred description for concept 467081000000103.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '467081000000103' AND typeid = '900000000000003001'"), #Part 2
    ("Which term is listed as preferred for concept 401881000000100?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '401881000000100' AND typeid = '900000000000003001'"),
    ("For concept ID 465351000000104 what is the preffered term ?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '465351000000104' AND typeid = '900000000000003001'"), 
    ("Show me which term corresponds to conceptid 271411000000102",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '271411000000102' AND typeid = '900000000000013009'"),
    ("Provide the english term of concept ID 441901000000108 ?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '441901000000108' AND languagecode = 'en' AND typeid = '900000000000003001'"),
    ("Provide the preferred description for the concept ID 19551000000101.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '19551000000101' AND typeid = '900000000000003001'"),
    ("Which descriptions are used for concept ID 368951000000101?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '368951000000101' AND typeid = '900000000000003001'"),
    ("Retrieve the label of concept 401761000000100?",
     "SELECT term FROM TRUD_descriptiontable WHERE concecptid = '401761000000100' AND typeid = '900000000000003001'"),
    ("I want the english term for concept ID 467211000000103	.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '467211000000103	' AND typeid = '900000000000003001' AND languagecode = 'en'"),
    ("Which preferred name corresponds to concept 20121000000101?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '20121000000101' AND typeid = '900000000000013009'"),
    ("Find the description for concept 253571000000106.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '253571000000106' AND typeid = '900000000000003001'")]

In [32]:
# Category 2: Finding Terms and Synonmonymms 

sql_promts_category2 = [
    ("What are all known synonyms for concept ID 427301000000100.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '427301000000100'"),
    ("List all the different descriptions for concept 270721000000106?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '270721000000106'"),
    ("Get all English terms for the concept ID 452161000000101.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '452161000000101' AND languagecode = 'en'"),
    ("Retrieve all labels for concept ID 411771000000105.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '411771000000105'"),
    ("Show all descriptions including synonyms for concept 441591000000105.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '441591000000105'"),
    ("What are the known terms for concept ID 470961000000100?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '470961000000100'"),
    ("Give me all descriptions linked to concept 491431000000104.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '491431000000104"),
    ("Get every English label for concept ID 401971000000105.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '401971000000105' AND languagecode = 'en'"),
    ("Return all terms used to describe concept 486081000000102.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '486081000000102'"),
    ("List every description for concept 418751000000106.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '418751000000106'")] # Part 2 
     ("Find every synonym for concept ID 487501000000104.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '487501000000104'"),
     ("Show all the descriptions for concept 271321000000100?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '271321000000100'"),
     ("Get all English terms for the concept ID 271751000000109.",
      "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '271751000000109' AND languagecode = 'en'"),
     ("I want all labels for concept ID 455621000000108.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '455621000000108'"),
    ("Display all the descriptions including synonyms for concept 368961000000103",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '368961000000103'"),   
    ("Find all the terms associated with conceptID 455821000000109?",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '455821000000109'"),
    ("Fetch all the terms linked to concept 467251000000104.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '467251000000104"),
    ("Provide each english label for concept ID 255871000000109.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '255871000000109' AND languagecode = 'en'"),
    ("Return all terms used to describe concept 443991000000102.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '443991000000102'"),
    ("List every description for concept 467591000000109.",
     "SELECT term FROM TRUD_descriptiontable WHERE conceptid = '467591000000109'")]
#NEED TO involve type id for synonsm 

IndentationError: unindent does not match any outer indentation level (<string>, line 32)

In [58]:
# Looking at the most popular terms
sqlquery = """SELECT term, COUNT(*) as frequency FROM TRUD_descriptiontable GROUP BY term ORDER BY frequency DESC LIMIT 20"""
temrs_most = pd.read_sql_query(sqlquery, conn)
temrs_most

Unnamed: 0,term,frequency
0,Hypodermic needle injury,23
1,Accident caused by sharp-edged object,22
2,High cost drugs,16
3,Maltreatment,15
4,High cost chemotherapy drugs,15
5,Transarterial approach,14
6,Reconstruction of cranial defect,14
7,Upper limb,12
8,Excision of lesion of brain tissue,12
9,Distraction osteogenesis of bones of skull,12


In [None]:
# Category 3: Finding the typeid and concept id from the of a term or concept

#Typeids
sql_promts_category3 =
    [("What is the typeid associated with 'Hypodermic needle injury'?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Hypodermic needle injury'"),
    ("Is 'Accident caused by sharp-edged object' a synonym or a preferred term?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Accident caused by sharp-edged object'")
    ("Retrieve the typeid for the term 'High cost drugs'.",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'High cost drugs'"),  
    ("What is the type for the description 'Maltreatment'?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Maltreatment'"),
    ("Provide the description type of 'High cost chemotherapy drugs'.",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'High cost chemotherapy drugs'"),
    ("What is the typeid for the term 'Transarterial approach'?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Transarterial approach'"),
    ("Which type ID corresponds to the term 'Reconstruction of cranial defect'?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Reconstruction of cranial defect'"),
    ("What typeid is 'Upper limb?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Upper limb	"),
    ("Find the typeid for 'Excision of lesion of brain tissue'.",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Excision of lesion of brain tissue"),
    ("What is the description type for 'Distraction osteogenesis of bones of skull'?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Distraction osteogenesis of bones of skull'")]
#Part 2 # Operate on concept ids here
    [("Find the conceptid for 'Operations on bones of skull'?",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Operations on bones of skull'"),
    ("Is 'Operations for disorders of sex development' a synonym or a preferred term?",
     "SELECT typeid FROM TRUD_descriptiontable WHERE term = 'Operations for disorders of sex development'")
    ("List the conceptid's associated with the term 'Destruction of fetus'.",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Destruction of fetus'"),
    ("What is the conceptid for 'Wound microscopy, culture and sensitivities'?",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Wound microscopy, culture and sensitivities'"),
    ("Display the conceptid for'Urine maltose level'.",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Urine maltose level'"),
    ("Which concpetid corresponds with the term 'Urinary microscopy, culture and sensitivities'?",
     "SELECT concpetid FROM TRUD_descriptiontable WHERE term = 'Urinary microscopy, culture and sensitivities'"),
    ("Which conceptid is asssocaited with the term 'Treponema pallidum particle agglutination test'?",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Treponema pallidum particle agglutination test'"),
    ("Under what conceptid is 'Transplantation of stomach?",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Transplantation of stomach'"),
    ("Return the conceptid for 'Therapeutic endoscopic operations on nasal cavity'.",
     "SELECT conceptid FROM TRUD_descriptiontable WHERE term = 'Therapeutic endoscopic operations on nasal cavity"),
    ("Which conceptid is 'Sickle solubility test'?",
     "SELECT conceptid  FROM TRUD_descriptiontable WHERE term = 'Sickle solubility test'")]
# Add 10 prompts

In [None]:
# 0	466891000000108	20
# 1	386721000000105	14
# 2	256691000000101	14
# 3	413681000000103	14
# 4	466871000000109	13
# 5	470791000000108	12
# 6	368851000000106	12
# 7	401991000000109	12
# 8	468611000000102	12
# 9	431821000000108	12
# 10	443791000000100	11
# 11	467081000000103	11
# 12	401881000000100	11
# 13	465351000000104	11
# 14	271411000000102	11
# 15	441901000000108	11
# 16	19551000000101	10
# 17	368951000000101	10
# 18	401761000000100	10
# 19	467211000000103	10
# 20	20121000000101	9
# 21	253571000000106	9
# 22	427301000000100	9
# 23	270721000000106	9
# 24	452161000000101	9
# 25	411771000000105	9
# 26	441591000000105	9
# 27	470961000000100	9
# 28	443761000000106	9
# 29	491431000000104	9
# 30	401971000000105	9
# 31	486081000000102	9
# 32	418751000000106	9
# 33	487501000000104	9
# 34	271321000000100	8
# 35	271751000000109	8
# 36	455621000000108	8
# 37	368961000000103	8
# 38	455821000000109	8
# 39	467251000000104	8
# 40	255871000000109	8
# 41	443991000000102	8
# 42	467591000000109	8

In [15]:
# Category 4: Finding relashionships betweeen concepts 

sql_prompts_category4 = [
    ("Which concepts are associated with concept ID 466891000000108?",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '466891000000108'"),
    ("Provide the destination for  386721000000105.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '386721000000105'"),
    ("What are the source concepts pointing to 256691000000101?",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '256691000000101'"),
    ("Provide all the sources concept 413681000000103 connects to.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '413681000000103'"),
    ("Which concepts are the targets of 466871000000109?",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '466871000000109'"),
    ("Show all concepts that link to destination 4707910000001081.",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '470791000000108'"),
    ("Identify the source concepts associated with destination ID 368851000000106.",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '368851000000106'"),
    ("Which concepts are connected to 401991000000109 as destination?",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '401991000000109'"),
    ("Show all concepts related from source 468611000000102.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '468611000000102'"),
    ("List the concepts that 431821000000108 refers to.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '431821000000108'"), #Part 2
    ("Which concepts reference 443791000000100 as a destination?",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '443791000000100'"),
    ("List all destination concepts for concept 467081000000103.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '467081000000103'"),
    ("Get all related concepts from 401881000000100.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '401881000000100'"),
    ("Find all links that point to destination 465351000000104.",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '465351000000104'"),
    ("Return all source IDs for destination concept 271411000000102.",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '271411000000102'"),
    ("Find the destination ids that link to 441901000000108",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '441901000000108'"),
    ("Find all concepts that map to 19551000000101.",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '19551000000101'"),
    ("What are the source concepts that lead to 368951000000101?",
     "SELECT sourceid FROM TRUD_relationshiptable WHERE destinationid = '368951000000101'"),
    ("Give me all destination concepts connected from 401761000000100.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '401761000000100'"),
    ("List concepts related from 467211000000103.",
     "SELECT destinationid FROM TRUD_relationshiptable WHERE sourceid = '467211000000103'")]

In [17]:
# Here include the caclulation category: COUNT, GROUP BY, MIN, MAX, SUM,  AVG

sql_prompts_category5= [
    ("How many distinct relationships exist in the table?",
     "SELECT COUNT(*) FROM TRUD_relationshiptable"),
    ("How many destination concepts are linked from concept ID 466891000000108?",
     "SELECT COUNT(destinationid) FROM TRUD_relationshiptable WHERE sourceid = '466891000000108'"),
    ("Count how many sourceids are assocaited with concept ID 368951000000101.",
     "SELECT COUNT(sourceid) FROM TRUD_relationshiptable WHERE destinationid = '368951000000101'"),
    ("Provide the total number of the unique sourceids from the relationship table.",
     "SELECT COUNT(DISTINCT sourceid) FROM TRUD_relationshiptable"),
    ("What is the maximum number of conceptids in the table?",  
     "SELECT MAX(conceptid) FROM TRUD_descriptiontable"),
    ("What is the mimimum number of conceptid in the table?",  
     "SELECT MIN(conceptid) FROM TRUD_descriptiontable"),
    ("How many destinationid's are present in the schema ?", 
     "SELECT COUNT(destinationid) FROM TRUD_descriptiontable"),
    ("Find the sum of terms associated with the concpet id 466891000000108", 
     "SELECT SUM(conceptid) FROM TRUD_relationshiptable WHERE conceptid = 466891000000108"),
    ("Provide total count of conceptids that start with the number 4",
     "SELECT COUNT(conceptid) FROM TRUD_relationshiptable WHERE conceptid LIKE '4%'"),
    ("Retrive total number of terms that start with the number heart",
     "SELECT COUNT(term) FROM TRUD_relationshiptable WHERE term LIKE = 'heart%'"),
    ("How many terms with the conceptid 466891000000108, end with injury?", 
      "SELECT terms FROM TRUD_relationshiptable WHERE term LIKE = '%injury' AND conceptid = 466891000000108"),  #11
    ("Find the maximum number of relationships per source ID.",
     "SELECT MAX(relationship_count) FROM (SELECT sourceid, COUNT(*) AS relationship_count FROM TRUD_relationshiptable GROUP BY sourceid) AS counts"),
    ("Count the number of relationships per type ID.",
     "SELECT typeid, COUNT(*) FROM TRUD_relationshiptable GROUP BY typeid"), 
    ("Count how many descriptions contain the word 'disease'",
     "SELECT COUNT(*) FROM TRUD_descriptiontable WHERE term LIKE '%disease%'"),
    ("What is the number of relationships per type ID ?.",
     "SELECT typeid, COUNT(*) FROM TRUD_relationshiptable GROUP BY typeid"),
    ("Find the source ID with the fewest destination concepts.",
     "SELECT sourceid, COUNT(destinationid) AS count_dest FROM TRUD_relationshiptable GROUP BY sourceid ORDER BY count_dest ASC LIMIT 1"),
    ("Get the total number of descriptions per concept ID.",
     "SELECT conceptid, COUNT(*) AS description_count FROM TRUD_descriptiontable GROUP BY conceptid"),
    ("How many terms are associated with each language code?",
     "SELECT languagecode, COUNT(*) FROM TRUD_descriptiontable GROUP BY languagecode"),
    ("Which language code has the highest number of terms?",
     "SELECT languagecode, COUNT(*) AS term_count FROM TRUD_descriptiontable GROUP BY languagecode ORDER BY term_count DESC LIMIT 1"),
    ("What is the maximum concept ID value in the description table?",
     "SELECT MAX(CAST(conceptid AS UNSIGNED)) FROM TRUD_descriptiontable"),
    ("What is the minimum concept ID value in the description table?",
     "SELECT MIN(CAST(conceptid AS UNSIGNED)) FROM TRUD_descriptiontable")] 

# 21 here

In [None]:
#Filtering the terms associated with specific module ids 
sql_prompts_category6 = [("What are all terms with module ID 999000041000000102?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000041000000102'"),
                         ("What are all terms with module ID 900000000000207008?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '900000000000207008'"),
                         ("What are all terms with module ID 999000031000000106?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000031000000106'"),
                         ("What are all terms with module ID 999000021000000104?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000021000000104'"),
                         ("What are all terms with module ID 999000011000000101?", 
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000011000000101'"),
                         ("What are all terms with module ID 999000001000000107?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000001000000107'"),
                         ("What are all terms with module ID 999000051000000100?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000051000000100'"),
                         ("What are all terms with module ID 999000061000000105?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000061000000105'"),
                         ("What are all terms with module ID 999000071000000108?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000071000000108'"),
                         ("What are all terms with module ID 999000081000000103?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000081000000103'),
                          ("What are all terms with module ID 999000091000000106?",
                           "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000091000000106'"),
                         ("What are all terms with module ID 999000101000000109?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000101000000109'"),
                         ("What are all terms with module ID 999000111000000105?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000111000000105'"),
                         ("What are all terms with module ID 999000121000000108?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000121000000108'"),
                         ("What are all terms with module ID 999000131000000101?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000131000000101'"),
                         ("What are all terms with module ID 999000141000000103?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000141000000103'"),
                         ("What are all terms with module ID 999000211000000107?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000211000000107'"),
                         ("What are all terms with module ID 999000221000000109?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000221000000109'"),
                         ("What are all terms with module ID 999000231000000104?",
                          "SELECT term FROM TRUD_descriptiontable WHERE moduleid = '999000231000000104'")]

## Evalutating the prompts for the T5 model 

In [None]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.30.0->tr

In [24]:
!pip install sentencepiece



In [32]:
# # Importing the relevaent libaries
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# # Loading the model and the relevant tokeniser 
# t5_tokeniser = T5Tokenizer.from_pretrained("t5-small")
# T5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
#THe Problem is I choose the wrong model, the model is translating the input no creating SQL prompts

In [33]:
# Looping through each prompt tokenisesing them and feeding them into the model to generate output
# FIX THISL: MODELS OUPUT IN GERMNAN:
for i, (propt, esql) in enumerate(slq_promts_category1 ):
    input = t5_tokeniser(propt, return_tensors="pt").input_ids
    outputs_ = T5_model.generate(input , max_length=100)
    ouput_sql = t5_tokeniser.decode(outputs_[0], skip_special_tokens=True)
    print(f"Example {i+1}")
    print("Provided Prompt: ", propt)
    print("Expected SQL: ", esql)
    print("Output SQL: ", ouput_sql)

Example 1
Provided Prompt:  What is the preferred term for concept ID 466891000000108 ?
Expected SQL:  SELECT term FROM TRUD_descriptiontable WHERE conceptid = '466891000000108' AND typeid = '900000000000003001'
Output SQL:  Was ist der Begriff für den Begriff ID 466891000000108?
Example 2
Provided Prompt:  What is the full name for concept ID 386721000000105?
Expected SQL:  SELECT term FROM TRUD_descriptiontable WHERE conceptid = '386721000000105' AND typeid = '900000000000003001'
Output SQL:  Was ist der Name voll für den Begriff ID 386721000000105?
Example 3
Provided Prompt:  Find the preferred name of concept ID 413681000000103.
Expected SQL:  SELECT term FROM TRUD_descriptiontable WHERE conceptid = '413681000000103' AND typeid = '900000000000013009'
Output SQL:  Find the preferred name of concept ID 413681000000103.
Example 4
Provided Prompt:  What is the English name of concept ID 466871000000109 ?
Expected SQL:  SELECT term FROM TRUD_descriptiontable WHERE conceptid = '466871000

In [None]:
## Evaluation