# Copying sample to population

In [17]:
# Import libraries

import pandas as pd
import re


In [18]:
# Function to count records

def show_record_count(df):
	"""Display the total number of records in a DataFrame.

	Args:
		df (pandas.DataFrame): The DataFrame for which the number of records will be displayed.

	Returns:
		str: A string indicating the total number of records in the DataFrame.
	"""

	return f"Total records: {len(df)}"


In [19]:
# Constants

# Paths
DATA_PATH = "data/"
"""str: Path for the data."""

# File names
CORPUS_POPULATION_FILE = "CORPUS_POPULATION"
"""str: Name of the corpus population file created."""

POPULATION_SEPARATED_FILE = "POPULATION_SEPARATED"
"""str: Name of the population_separated file."""

SAMPLE_FILE = "SAMPLE"
"""str: Name of the sample file."""

CORPUS_SAMPLE_FILE = "CORPUS_SAMPLE"
"""str: Name of the corpus_sample file."""

# Fields
SNT_ID_FIELD = "snt_id"
"""str: Name of the snt_id field."""

GPT_COMPLEX_WORD_FIELD = "complex_word_gpt3"
"""str: Name of the complex_word_gpt3 field."""

QUERY_TEXT_FIELD = "query_text"
"""str: Name of the query_text field"""


'str: Name of the query_text field'

## Formatting sample

In [20]:
sample = pd.read_excel(f"{DATA_PATH}{SAMPLE_FILE}.xlsx", sheet_name=SAMPLE_FILE)

print(show_record_count(sample[SNT_ID_FIELD]))

sample.head()


Total records: 589


Unnamed: 0,snt_id,query_text,source_snt_comp,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
0,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",End-to-End Encryption,2.0,A method whereby data is encrypted before it i...,Sending a message via an encrypted messaging app,"To protect sensitive data, such as financial i..."
1,,,,Encryption Key,2.0,A key used to encrypt and decrypt data.,A key used to encrypt a message before it is sent,To protect data from.
2,,,,Information-Centric Networking (ICN),1.0,A type of network architecture in which the da...,A network where data is stored in caches and c...,Caching data to improve access speed and relia...
3,,,,Content Caching,0.0,A method whereby data is stored on an intermed...,Downloading a web page from a local server ins...,To improve access speed and reduce bandwidth u...
4,G17.2_2893955699_1,qbit,Concatenated Greenberger‚ÄìHorne‚ÄìZeilinger (...,qubit,1.0,"A qubit is a unit of quantum information, whic...",An electron can be used as a qubit.,Qubits are used in quantum computing to store ...


In [21]:
# Fill the snt_id column

for col in sample.columns[:3]:
  value = None

  for i, current_value in enumerate(sample[col]):
    if pd.notna(current_value):
      value = current_value
    else:
      sample[col][i] = value

print(show_record_count(sample[SNT_ID_FIELD]))

sample.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample[col][i] = value


Total records: 589


Unnamed: 0,snt_id,query_text,source_snt_comp,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
0,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",End-to-End Encryption,2.0,A method whereby data is encrypted before it i...,Sending a message via an encrypted messaging app,"To protect sensitive data, such as financial i..."
1,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Encryption Key,2.0,A key used to encrypt and decrypt data.,A key used to encrypt a message before it is sent,To protect data from.
2,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Information-Centric Networking (ICN),1.0,A type of network architecture in which the da...,A network where data is stored in caches and c...,Caching data to improve access speed and relia...
3,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Content Caching,0.0,A method whereby data is stored on an intermed...,Downloading a web page from a local server ins...,To improve access speed and reduce bandwidth u...
4,G17.2_2893955699_1,qbit,Concatenated Greenberger‚ÄìHorne‚ÄìZeilinger (...,qubit,1.0,"A qubit is a unit of quantum information, whic...",An electron can be used as a qubit.,Qubits are used in quantum computing to store ...


In [22]:
# Export as an Excel

sample.to_excel(f"{DATA_PATH}{CORPUS_SAMPLE_FILE}.xlsx", sheet_name=SAMPLE_FILE, index=False)


## Comparing Sample with Population and concatenating

In [23]:
# Reading Excel files

population = pd.read_excel(f"{DATA_PATH}{POPULATION_SEPARATED_FILE}.xlsx", sheet_name=POPULATION_SEPARATED_FILE)
sample = pd.read_excel(f"{DATA_PATH}{CORPUS_SAMPLE_FILE}.xlsx", sheet_name=SAMPLE_FILE)


In [24]:
print(show_record_count(population[SNT_ID_FIELD]))

population.head()


Total records: 9017


Unnamed: 0.1,Unnamed: 0,snt_id,query_text,source_snt,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
0,0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Autonomous,1,able to act independently without requiring di...,Autonomous vehicles can navigate roads withou...,Autonomous vehicles can be used to deliver go...
1,1,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Robotics,1,"the science and technology of robots, their de...",Robotics is used to automate tasks that would...,Robotics can be used in factories to assemble...
2,2,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Automation,1,the use of technology to automate tasks that w...,Automation can be used to control the speed a...,Automation can be used in production lines to...
3,3,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Academic,0,of or relating to education or study at a scho...,Academic research is conducted at universitie...,Academic research can provide valuable insigh...
4,4,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Industrial,0,relating to or involving the industry or manuf...,Industrial robots are used in manufacturing s...,


In [25]:
print(show_record_count(sample[SNT_ID_FIELD]))

sample.head()


Total records: 589


Unnamed: 0,snt_id,query_text,source_snt_comp,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
0,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",End-to-End Encryption,2.0,A method whereby data is encrypted before it i...,Sending a message via an encrypted messaging app,"To protect sensitive data, such as financial i..."
1,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Encryption Key,2.0,A key used to encrypt and decrypt data.,A key used to encrypt a message before it is sent,To protect data from.
2,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Information-Centric Networking (ICN),1.0,A type of network architecture in which the da...,A network where data is stored in caches and c...,Caching data to improve access speed and relia...
3,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Content Caching,0.0,A method whereby data is stored on an intermed...,Downloading a web page from a local server ins...,To improve access speed and reduce bandwidth u...
4,G17.2_2893955699_1,qbit,Concatenated Greenberger‚ÄìHorne‚ÄìZeilinger (...,qubit,1.0,"A qubit is a unit of quantum information, whic...",An electron can be used as a qubit.,Qubits are used in quantum computing to store ...


In [26]:
# Format data

population[GPT_COMPLEX_WORD_FIELD] = population[GPT_COMPLEX_WORD_FIELD].str.capitalize()
sample[GPT_COMPLEX_WORD_FIELD] = sample[GPT_COMPLEX_WORD_FIELD].str.capitalize()


In [27]:
print(show_record_count(population[SNT_ID_FIELD]))

population.head()


Total records: 9017


Unnamed: 0.1,Unnamed: 0,snt_id,query_text,source_snt,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
0,0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Autonomous,1,able to act independently without requiring di...,Autonomous vehicles can navigate roads withou...,Autonomous vehicles can be used to deliver go...
1,1,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Robotics,1,"the science and technology of robots, their de...",Robotics is used to automate tasks that would...,Robotics can be used in factories to assemble...
2,2,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Automation,1,the use of technology to automate tasks that w...,Automation can be used to control the speed a...,Automation can be used in production lines to...
3,3,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Academic,0,of or relating to education or study at a scho...,Academic research is conducted at universitie...,Academic research can provide valuable insigh...
4,4,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Industrial,0,relating to or involving the industry or manuf...,Industrial robots are used in manufacturing s...,


In [28]:
print(show_record_count(sample[SNT_ID_FIELD]))

sample.head()


Total records: 589


Unnamed: 0,snt_id,query_text,source_snt_comp,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
0,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",End-to-end encryption,2.0,A method whereby data is encrypted before it i...,Sending a message via an encrypted messaging app,"To protect sensitive data, such as financial i..."
1,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Encryption key,2.0,A key used to encrypt and decrypt data.,A key used to encrypt a message before it is sent,To protect data from.
2,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Information-centric networking (icn),1.0,A type of network architecture in which the da...,A network where data is stored in caches and c...,Caching data to improve access speed and relia...
3,G14.2_2884788726_2,end to end encryption,"However, in information-centric networking (IC...",Content caching,0.0,A method whereby data is stored on an intermed...,Downloading a web page from a local server ins...,To improve access speed and reduce bandwidth u...
4,G17.2_2893955699_1,qbit,Concatenated Greenberger‚ÄìHorne‚ÄìZeilinger (...,Qubit,1.0,"A qubit is a unit of quantum information, whic...",An electron can be used as a qubit.,Qubits are used in quantum computing to store ...


In [29]:
# Get SAMPLE data not repeated in POPULATION

sample_rows_not_in_population = sample[~sample.set_index([SNT_ID_FIELD, QUERY_TEXT_FIELD, GPT_COMPLEX_WORD_FIELD]).index.isin(population.set_index([SNT_ID_FIELD, QUERY_TEXT_FIELD, GPT_COMPLEX_WORD_FIELD]).index)]

def replace_none(value):
  if isinstance(value, str):
    if re.match(r'^\s*(none|none\.)\s*$', value, flags=re.IGNORECASE):
      return None

  return value

sample_rows_not_in_population = sample_rows_not_in_population.applymap(replace_none)

print(show_record_count(sample_rows_not_in_population[SNT_ID_FIELD]))

sample_rows_not_in_population.head()


Total records: 8


Unnamed: 0,snt_id,query_text,source_snt_comp,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3
55,G17.1_2042591812_2,quantum computing,Majority gate is a fundamental Boolean operato...,,,,,
110,G17.4_2524344971_1,quantum applications,Quantum entanglement can offer a quadratic enh...,,,,,
233,T16.2_1760039095_1,guessing attack,Three-party key exchange protocol is one of th...,Guessing attack,2.0,A guessing attack is a type of attack in which...,A hacker attempts to guess an email address an...,Banks use guessing attack prevention methods s...
242,G07.2_2953000449_2,conspiracy theories,global warming induced by chemtrails or the li...,,,,,
244,T13.3_2052300809_9,imbalanced data,Problems in the performance of naive Bayes on ...,,,,,


In [30]:
# Concat results data to ZS

population = pd.concat([population, sample_rows_not_in_population])

print(show_record_count(population[SNT_ID_FIELD]))

population.head()


Total records: 9025


Unnamed: 0.1,Unnamed: 0,snt_id,query_text,source_snt,complex_word_gpt3,difficulty_gpt3,definition_gpt3,example_gpt3,use_case_gpt3,source_snt_comp
0,0.0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Autonomous,1.0,able to act independently without requiring di...,Autonomous vehicles can navigate roads withou...,Autonomous vehicles can be used to deliver go...,
1,1.0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Robotics,1.0,"the science and technology of robots, their de...",Robotics is used to automate tasks that would...,Robotics can be used in factories to assemble...,
2,2.0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Automation,1.0,the use of technology to automate tasks that w...,Automation can be used to control the speed a...,Automation can be used in production lines to...,
3,3.0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Academic,0.0,of or relating to education or study at a scho...,Academic research is conducted at universitie...,Academic research can provide valuable insigh...,
4,4.0,G11.1_2892036907_1,drones,"In the modern era of automation and robotics, ...",Industrial,0.0,relating to or involving the industry or manuf...,Industrial robots are used in manufacturing s...,,


In [31]:
# Export as an Excel

population.to_excel(f"{DATA_PATH}{CORPUS_POPULATION_FILE}.xlsx", sheet_name="CORPUS", index=False)
