# Counting the occurrences of `snt_id` in the `FS_ZS_JOINED` file, as well as in `simpletext-task2-test-large`, `simpletext-task2-test-medium`, and `simpletext-task2-test-small`

In [1]:
# Import libraries

import pandas as pd
from prettytable import PrettyTable


In [3]:
# Function to count records

def show_record_count(df):
	"""Build a one-line summary of how many records an object holds.

	Args:
		df (pandas.DataFrame or pandas.Series): The object whose records are
			counted. The count is ``len(df)``, so any object supporting
			``len()`` works.

	Returns:
		str: Text of the form ``"Total records: <count>"``.
	"""

	record_total = len(df)
	return "Total records: {}".format(record_total)


In [25]:
# Constants
#
# Each constant is documented with a `#` comment rather than a bare string
# literal: a bare string is an expression statement, and when it is the last
# statement of a notebook cell it is echoed as the cell's output.

# Paths
DATA_PATH = "data/"  # str: Path for the data (the trailing slash is required by the f-string joins).

# File names (used both as the file stem and as the Excel sheet name)
LARGE_FILE = "simpletext-task2-test-large"  # str: Name of the large test file.
MEDIUM_FILE = "simpletext-task2-test-medium"  # str: Name of the medium test file.
SMALL_FILE = "simpletext-task2-test-small"  # str: Name of the small test file.
FS_ZS_JOINED_FILE = "FS_ZS_JOINED"  # str: Name of the Zero-Shot and Few-Shot file joined.
FIRST_POPULATION_FILE = "FIRST_POPULATION"  # str: Name of the first population file created.

# Fields
SNT_ID_FIELD = "snt_id"  # str: Name of the snt_id field.
TERM_FIELD = "term"  # str: Name of the term field.
DIFFICULTY_FIELD = "difficulty"  # str: Name of the difficulty field.
TERM_RANK_FIELD = "term_rank_snt"  # str: Name of the term_rank_snt field.
DEFINITION_FIELD = "definition"  # str: Name of the definition field.
RUN_ID_FIELD = "run_id"  # str: Name of the run_id field.
MANUAL_FIELD = "manual"  # str: Name of the manual field.
PROMPT_ID_FIELD = "promptID"  # str: Name of the promptID field.
QUERY_ID_FIELD = "query_id"  # str: Name of the query_id field.
QUERY_TEXT_FIELD = "query_text"  # str: Name of the query_text field.
DOC_ID_FIELD = "doc_id"  # str: Name of the doc_id field.
SOURCE_FIELD = "source_snt"  # str: Name of the source_snt field.


'str: Name of the source_snt field'

In [5]:
# Read Excel files
#
# Every workbook lives at DATA_PATH/<name>.xlsx and is read from the sheet
# named <name>, so the four reads share one expression. The files are read in
# the order listed, which matches the order of the target names.

fs_zs_joined, large, medium, small = (
	pd.read_excel(f"{DATA_PATH}{file_name}.xlsx", sheet_name=file_name)
	for file_name in (FS_ZS_JOINED_FILE, LARGE_FILE, MEDIUM_FILE, SMALL_FILE)
)


In [6]:
# Row count and first rows of FS_ZS_JOINED.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(fs_zs_joined))

fs_zs_joined.head()


Total records: 9017


Unnamed: 0,snt_id,term,difficulty,term_rank_snt,definition,run_id,manual,promptID
0,G11.1_2892036907_2,Autonomy,2,1,the ability of a device to think and act indep...,SINAI_task_2.1_PRM_ZS_TASK2_2_V1,0,PRM_ZS_TASK2_2_V1
1,G11.1_2892036907_2,Unmanned,1,2,"without a human operator, Example: unmanned ae...",SINAI_task_2.1_PRM_ZS_TASK2_2_V1,0,PRM_ZS_TASK2_2_V1
2,G11.1_2892036907_2,Civilian,0,3,relating to ordinary citizens as opposed to th...,SINAI_task_2.1_PRM_ZS_TASK2_2_V1,0,PRM_ZS_TASK2_2_V1
3,G11.1_2892036907_3,Operation ceiling,2,1,The maximum height at which a drone can be flo...,SINAI_task_2.1_PRM_ZS_TASK2_2_V1,0,PRM_ZS_TASK2_2_V1
4,G11.1_2892036907_3,Road-tracking,1,2,A method of navigation which uses roads as a r...,SINAI_task_2.1_PRM_ZS_TASK2_2_V1,0,PRM_ZS_TASK2_2_V1


In [7]:
# Row count and first rows of the large test file.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(large))

large.head()


Total records: 152072


Unnamed: 0,snt_id,source_snt,doc_id,query_id,query_text
0,G01.1_1564531496_1,In this short paper we describe the architectu...,1564531496,G01.1,Digital assistant
1,G01.1_1564531496_2,"A CDA is a mobile user device, similar to a Pe...",1564531496,G01.1,Digital assistant
2,G01.1_1564531496_3,It supports the citizen when dealing with publ...,1564531496,G01.1,Digital assistant
3,G01.1_1564531496_4,Requirements for secure and trusted interactio...,1564531496,G01.1,Digital assistant
4,G01.1_1564531496_5,The Citizen Digital Assistant eliminates these...,1564531496,G01.1,Digital assistant


In [8]:
# Row count and first rows of the medium test file.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(medium))

medium.head()


Total records: 4797


Unnamed: 0,query_id,query_text,doc_id,snt_id,source_snt
0,G11.1,drones,2892036907,G11.1_2892036907_1,"In the modern era of automation and robotics, ..."
1,G11.1,drones,2892036907,G11.1_2892036907_2,With the ever increasing number of unmanned ae...
2,G11.1,drones,2892036907,G11.1_2892036907_3,Due to guidelines set by the governments regar...
3,G11.1,drones,2892036907,G11.1_2892036907_4,In an attempt to achieve the above mentioned t...
4,G11.1,drones,2892036907,G11.1_2892036907_5,Derived from the classic image classification ...


In [9]:
# Row count and first rows of the small test file.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(small))

small.head()


Total records: 2234


Unnamed: 0,query_id,query_text,doc_id,snt_id,source_snt
0,G11.1,drones,2892036907,G11.1_2892036907_1,"In the modern era of automation and robotics, ..."
1,G11.1,drones,2892036907,G11.1_2892036907_2,With the ever increasing number of unmanned ae...
2,G11.1,drones,2892036907,G11.1_2892036907_3,Due to guidelines set by the governments regar...
3,G11.1,drones,2892036907,G11.1_2892036907_4,In an attempt to achieve the above mentioned t...
4,G11.1,drones,2892036907,G11.1_2892036907_5,Derived from the classic image classification ...


In [10]:
# Number of distinct snt_id values in every file

table = PrettyTable(["Category", f"{SNT_ID_FIELD} amount"])

# One row per dataset, in the same order as the files were loaded.
for category, frame in (
	("FS_ZS_JOINED", fs_zs_joined),
	("LARGE", large),
	("MEDIUM", medium),
	("SMALL", small),
):
	table.add_row([category, frame[SNT_ID_FIELD].nunique()])

print(table)


+--------------+---------------+
|   Category   | snt_id amount |
+--------------+---------------+
| FS_ZS_JOINED |      2234     |
|    LARGE     |     135540    |
|    MEDIUM    |      4797     |
|    SMALL     |      2234     |
+--------------+---------------+


In [11]:
# Remove columns for better data management
#
# Every column except snt_id is dropped from both frames so that the
# following cells can compare the two files on snt_id alone.

columns_to_delete_fs_zs_joined = [TERM_FIELD, DIFFICULTY_FIELD, TERM_RANK_FIELD, DEFINITION_FIELD, RUN_ID_FIELD, MANUAL_FIELD, PROMPT_ID_FIELD]
columns_to_delete_small = [QUERY_ID_FIELD, QUERY_TEXT_FIELD, DOC_ID_FIELD, SOURCE_FIELD]

# `columns=` is the explicit spelling of `axis=1`; drop() returns new frames,
# leaving fs_zs_joined and small untouched.
fs_zs_joined_with_columns_removed = fs_zs_joined.drop(columns=columns_to_delete_fs_zs_joined)
small_with_columns_removed = small.drop(columns=columns_to_delete_small)


In [12]:
# Row count and first rows of FS_ZS_JOINED after the column removal.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(fs_zs_joined_with_columns_removed))

fs_zs_joined_with_columns_removed.head()


Total records: 9017


Unnamed: 0,snt_id
0,G11.1_2892036907_2
1,G11.1_2892036907_2
2,G11.1_2892036907_2
3,G11.1_2892036907_3
4,G11.1_2892036907_3


In [13]:
# Row count and first rows of the small test file after the column removal.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(small_with_columns_removed))

small_with_columns_removed.head()


Total records: 2234


Unnamed: 0,snt_id
0,G11.1_2892036907_1
1,G11.1_2892036907_2
2,G11.1_2892036907_3
3,G11.1_2892036907_4
4,G11.1_2892036907_5


In [14]:
# Keep a single row per snt_id in each frame. The first occurrence is the one
# retained, and the original row labels are preserved.
fs_zs_joined_unique_snt = fs_zs_joined_with_columns_removed.drop_duplicates(subset=SNT_ID_FIELD, keep="first")
small_unique_snt = small_with_columns_removed.drop_duplicates(subset=SNT_ID_FIELD, keep="first")


In [15]:
# Row count and first rows of the de-duplicated FS_ZS_JOINED snt_id list.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(fs_zs_joined_unique_snt))

fs_zs_joined_unique_snt.head()


Total records: 2234


Unnamed: 0,snt_id
0,G11.1_2892036907_2
3,G11.1_2892036907_3
6,G11.1_2892036907_4
11,G11.1_2892036907_5
14,G11.1_2892036907_6


In [16]:
# Row count and first rows of the de-duplicated small-file snt_id list.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(small_unique_snt))

small_unique_snt.head()


Total records: 2234


Unnamed: 0,snt_id
0,G11.1_2892036907_1
1,G11.1_2892036907_2
2,G11.1_2892036907_3
3,G11.1_2892036907_4
4,G11.1_2892036907_5


In [17]:
# Get SIMPLETEXT-TASK2-TEST-SMALL data not repeated in FS_ZS_JOINED
#
# Anti-join on snt_id: keep the rows of the small file whose snt_id never
# appears in FS_ZS_JOINED. An empty result means every small-file sentence is
# present in FS_ZS_JOINED.

fs_zs_joined_index = pd.Index(fs_zs_joined[SNT_ID_FIELD])
small_index = pd.Index(small_unique_snt[SNT_ID_FIELD])

# Boolean array aligned positionally with the rows of small_unique_snt.
found_in_fs_zs_joined = small_index.isin(fs_zs_joined_index)
small_data_not_repeated_in_fs_zs_joined = small_unique_snt[~found_in_fs_zs_joined]

print(show_record_count(small_data_not_repeated_in_fs_zs_joined))

small_data_not_repeated_in_fs_zs_joined.head()


Total records: 0


Unnamed: 0,snt_id


# Comparing `snt_id` in `FS_ZS_JOINED` with `snt_id` in `simpletext-task2-test-small`

In [18]:
# Removing columns for better data management
#
# Only the columns that are not needed in the merged dataset are dropped here.
# NOTE: this rebinds fs_zs_joined_with_columns_removed, which an earlier cell
# bound to a frame holding snt_id only; re-running that earlier display cell
# after this one shows this frame instead.

columns_to_delete_fs_zs_joined = [TERM_RANK_FIELD, RUN_ID_FIELD, MANUAL_FIELD, PROMPT_ID_FIELD]
columns_to_delete_small = [QUERY_ID_FIELD, DOC_ID_FIELD]

# `columns=` is the explicit spelling of `axis=1`; drop() returns new frames.
fs_zs_joined_with_columns_removed = fs_zs_joined.drop(columns=columns_to_delete_fs_zs_joined)
small_with_columns_deleted = small.drop(columns=columns_to_delete_small)


In [19]:
# Row count and first rows of FS_ZS_JOINED with the merge-irrelevant columns removed.
# The frame's length equals the length of its snt_id column, so the frame is
# passed directly.
print(show_record_count(fs_zs_joined_with_columns_removed))

fs_zs_joined_with_columns_removed.head()


Total records: 9017


Unnamed: 0,snt_id,term,difficulty,definition
0,G11.1_2892036907_2,Autonomy,2,the ability of a device to think and act indep...
1,G11.1_2892036907_2,Unmanned,1,"without a human operator, Example: unmanned ae..."
2,G11.1_2892036907_2,Civilian,0,relating to ordinary citizens as opposed to th...
3,G11.1_2892036907_3,Operation ceiling,2,The maximum height at which a drone can be flo...
4,G11.1_2892036907_3,Road-tracking,1,A method of navigation which uses roads as a r...


In [20]:
# Row count and first rows of the small test file with the merge-irrelevant
# columns removed. The count is taken from the same frame that is previewed
# (small_with_columns_deleted), not from the snt_id-only frame built in the
# previous section.
print(show_record_count(small_with_columns_deleted[SNT_ID_FIELD]))

small_with_columns_deleted.head()


Total records: 2234


Unnamed: 0,query_text,snt_id,source_snt
0,drones,G11.1_2892036907_1,"In the modern era of automation and robotics, ..."
1,drones,G11.1_2892036907_2,With the ever increasing number of unmanned ae...
2,drones,G11.1_2892036907_3,Due to guidelines set by the governments regar...
3,drones,G11.1_2892036907_4,In an attempt to achieve the above mentioned t...
4,drones,G11.1_2892036907_5,Derived from the classic image classification ...


In [21]:
# Merge small data to fs_zs_joined
#
# Left join on snt_id: every row of the small file is kept and is repeated
# once per matching FS_ZS_JOINED row. The result is then narrowed to the
# columns below, in this order.

first_population_columns = [SNT_ID_FIELD, QUERY_TEXT_FIELD, SOURCE_FIELD, TERM_FIELD, DIFFICULTY_FIELD, DEFINITION_FIELD]

first_population = small_with_columns_deleted.merge(
	fs_zs_joined_with_columns_removed,
	on=SNT_ID_FIELD,
	how="left",
)[first_population_columns]

print(show_record_count(first_population))

first_population.head()


Total records: 9017


Unnamed: 0,snt_id,query_text,source_snt,term,difficulty,definition
0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Autonomous,1,able to act independently without requiring di...
1,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Robotics,1,"the science and technology of robots, their de..."
2,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Automation,1,the use of technology to automate tasks that w...
3,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Academic,0,of or relating to education or study at a scho...
4,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Industrial,0,relating to or involving the industry or manuf...


In [26]:
# Persist the merged dataset as DATA_PATH/FIRST_POPULATION.xlsx, in a sheet
# named after the file.
# NOTE(review): `index` is left at its default, so the frame's row index is
# written as the first column — confirm downstream readers expect that.
first_population_path = f"{DATA_PATH}{FIRST_POPULATION_FILE}.xlsx"
first_population.to_excel(first_population_path, sheet_name=FIRST_POPULATION_FILE)


The `FIRST_POPULATION` file is the first dataset to be considered the population.