In [4]:
import pandas as pd

df = pd.read_csv('squad_questions_and_types_500.csv', index_col=None)
df.head()

Unnamed: 0,Question,Confirmation,Factoid,List,Causal,Hypothetical,Complex
0,For what are jeans used to make?,0,1,0,0,0,0
1,What type of industry produced a growing chain...,0,1,0,0,0,0
2,Where is a tower almost never found on Greek c...,0,1,0,0,0,0
3,In what year did Joseph Stalin come to power?,0,1,0,0,0,0
4,Along with cabaret striptease bands and drama ...,0,1,0,0,0,0


In [5]:
print(df['Confirmation'].sum(),df['Factoid'].sum(),df['List'].sum(),df['Causal'].sum(),df['Hypothetical'].sum(),df['Complex'].sum())

1 478 5 43 0 24


# we can see the annotation results of some SQuAD data. Most of them are of Factoid type, and the results are very uneven.

# However, we tried using GPT3.5 to generate results and found that the results were very simplistic and not conducive to the generalization of the model.However, the effect of using GPT4 will be very obvious, so we generate it for the categories that are obviously missing.

["Is Paris the capital of France?", 1, 0, 0, 0, 0, 0]  
["Is the Eiffel Tower located in France?", 1, 0, 0, 0, 0, 0]  
["Is French spoken in France?", 1, 0, 0, 0, 0, 0]  
["Is the Louvre Museum in France?", 1, 0, 0, 0, 0, 0]  
["Is the Arc de Triomphe located in France?", 1, 0, 0, 0, 0, 0]  
["Is French the official language of France?", 1, 0, 0, 0, 0, 0]  
["Is the Louvre Museum the most visited museum in France?", 1, 0, 0, 0, 0, 0]  
["Is Paris known as the City of Love?", 1, 0, 0, 0, 0, 0]  
["Is the Seine River in France?", 1, 0, 0, 0, 0, 0]  
["Is Versailles a famous palace in France?", 1, 0, 0, 0, 0, 0]  

# Then most Factoid questions start with "what," "which," "when," "who," or "how," so we divide the large data set into those that contain these keywords and those that do not. The words are divided into two parts, and then sampled separately to generate the result data set.

In [106]:
import pandas as pd


df = pd.read_csv('squad_questions_and_types_balanced_500.csv', index_col=None)
df = df.drop_duplicates(subset=["Question"])
df.head()

Unnamed: 0,Question,Confirmation,Factoid,List,Causal,Hypothetical,Complex
0,When did Tito create the Second Proletarian Br...,0,1,0,0,0,0
1,Who discovered non-radio uses for early LED de...,0,1,0,0,0,0
2,What dynasty ruled Germany starting in 1024?,0,1,0,0,0,0
3,Sufficient domestic renewable resources exist ...,0,0,0,1,0,0
4,What did astronomers during this area believe ...,0,1,0,0,0,0


In [107]:
print(df['Confirmation'].sum(),df['Factoid'].sum(),df['List'].sum(),df['Causal'].sum(),df['Hypothetical'].sum(),df['Complex'].sum())
print(len(df))

463 1437 127 1601 136 385
3332


In [108]:
row_sums = df[['Confirmation', 'Factoid', 'List', 'Causal', 'Hypothetical', 'Complex']].sum(axis=1)
count_greater_than_one = (row_sums > 1).sum()
print(count_greater_than_one)
print(count_greater_than_one/len(df))

749
0.22478991596638656


# It can be seen that the categories are not very balanced, so we further restrict it so that the size of the training set is around 1000, so some of the factoid and casual types will be filtered out.

In [109]:
df_filtered_1 = df[(df['Factoid'] == 1)]
count_filtered_rows = len(df_filtered_1)
print(count_filtered_rows)

1437


In [110]:
df_filtered_2 = df[(df['Factoid'] == 1) & (df['Confirmation']==0)& (df['List']==0)& (df['Causal']==0)& (df['Hypothetical']==0)& (df['Complex']==0)]
count_filtered_rows = len(df_filtered_2)
print(count_filtered_rows)

886


In [111]:
df_filtered_3 = df_filtered_1[~df_filtered_1.index.isin(df_filtered_2.index)]
count_filtered_rows = len(df_filtered_3)
print(count_filtered_rows)

551


In [112]:
df_sampled = df_filtered_2.sample(n=600, random_state=42)
df = df.drop(df_sampled.index)
df_sampled = df_filtered_3.sample(n=300, random_state=42)
df = df.drop(df_sampled.index)

In [113]:
df_filtered_3 = df[(df['Causal'] == 1)]
count_filtered_rows = len(df_filtered_1)
print(count_filtered_rows)

1437


In [114]:
df_filtered_4 = df[(df['Causal'] == 1) & (df['Confirmation']==0)& (df['List']==0)& (df['Factoid']==0)& (df['Hypothetical']==0)& (df['Complex']==0)]
count_filtered_rows = len(df_filtered_4)
print(count_filtered_rows)

1003


In [115]:
df_filtered_5 = df_filtered_3[~df_filtered_3.index.isin(df_filtered_4.index)]
count_filtered_rows = len(df_filtered_5)
print(count_filtered_rows)

333


In [116]:
df_sampled = df_filtered_4.sample(n=800, random_state=42)
df = df.drop(df_sampled.index)
df_sampled = df_filtered_5.sample(n=100, random_state=42)
df = df.drop(df_sampled.index)

In [117]:
print(df['Confirmation'].sum(),df['Factoid'].sum(),df['List'].sum(),df['Causal'].sum(),df['Hypothetical'].sum(),df['Complex'].sum())
print(len(df))

451 464 120 436 133 306
1532


In [118]:
row_sums = df[['Confirmation', 'Factoid', 'List', 'Causal', 'Hypothetical', 'Complex']].sum(axis=1)
count_greater_than_one = (row_sums > 1).sum()
print(count_greater_than_one)
print(count_greater_than_one/len(df))

349
0.22780678851174935


# this will be the question type check model dataset.

In [119]:
df.to_csv('result.csv',index=False)