# Error Detection Test Data Creation

First the ground truth table:

In [2]:
import pandas as pd

# Ground truth table: one row per city, laid out as
# [country, city, population, ISO 3166-1 alpha-3 code].
clean_data = [
    ["England", "Greenwich", 63_500, "GBR"],
    ["United States of America", "Miami", 436_000, "USA"],
    ["Spain", "Barcelona", 5_850_000, "ESP"],
    ["Japan", "Kyoto", 1_460_000, "JPN"],
    ["Philippines", "Manila", 13_500_000, "PHL"],
    ["Germany", "Hannover", 538_000, "DEU"],
    ["Netherlands", "Groningen", 235_000, "NLD"],
    ["Türkiye", "Istanbul", 15_800_000, "TUR"],
    ["South Africa", "Queenstown", 68_900, "ZAF"],
    ["United States of America", "Phoenix", 1_650_000, "USA"],
    ["China", "Shanghai", 29_200_000, "CHN"],
    ["Hungary", "Budapest", 1_760_000, "HUN"],
    ["Egypt", "New Cairo", 297_000, "EGY"],
    ["Mali", "Bamako", 2_930_000, "MLI"],
    ["Nigeria", "Lagos", 13_500_000, "NGA"],
    ["Mexico", "Guadalajara", 5_040_000, "MEX"],
    ["Thailand", "Hat Yai", 149_000, "THA"],
    ["United Arab Emirates", "Dubai", 3_140_000, "ARE"],
    ["Ireland", "Dublin", 1_345_402, "IRL"],
    ["Greece", "Athens", 3_150_000, "GRC"],
]

clean_dataframe = pd.DataFrame(
    clean_data,
    columns=["Country", "City", "Population", "ISO 3166-1 Alpha3 Code"],
)
clean_dataframe

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,England,Greenwich,63500,GBR
1,United States of America,Miami,436000,USA
2,Spain,Barcelona,5850000,ESP
3,Japan,Kyoto,1460000,JPN
4,Philippines,Manila,13500000,PHL
5,Germany,Hannover,538000,DEU
6,Netherlands,Groningen,235000,NLD
7,Türkiye,Istanbul,15800000,TUR
8,South Africa,Queenstown,68900,ZAF
9,United States of America,Phoenix,1650000,USA


We construct several datasets with errors in them. The errors have different structures and degrees of difficulty. The first degree of difficulty is finding typos.

In [3]:
# Typo variant of the ground truth: selected country / city names are
# misspelled, populations are left unchanged.
dirty_data_typos = [
    ["England", "Grenwich", 63_500],
    ["United States of America", "Miami", 436_000],
    ["Spain", "Barcellona", 5_850_000],
    ["Japan", "Kyoto", 1_460_000],
    ["Philippines", "Manila", 13_500_000],
    ["Germany", "Hanover", 538_000],
    ["Netherlands", "Groningen", 235_000],
    ["Türkiye", "Istanbul", 15_800_000],
    ["South Africa", "Queenstown", 68_900],
    ["United States of America", "Phoenix", 1_650_000],
    ["China", "Shanghai", 29_200_000],
    ["Hunagry", "Budapest", 1_760_000],
    ["Ägypt", "New Cairo", 297_000],
    ["Mali", "Bamako", 2_930_000],
    ["Nigerria", "Lagas", 13_500_000],
    ["Mexico", "Guadalajara", 5_040_000],
    ["Thailand", "Hatyai", 149_000],
    ["Unitedd Arab Emirates", "Dubai", 3_140_000],
    ["Ireland", "Dublin", 1_345_402],
    ["Greece", "Athens", 3_150_000],
]

# The ISO code column is taken over unchanged from the ground truth frame.
dirty_dataframe_typos = pd.DataFrame(
    dirty_data_typos,
    columns=["Country", "City", "Population"],
).assign(**{"ISO 3166-1 Alpha3 Code": clean_dataframe["ISO 3166-1 Alpha3 Code"]})

dirty_dataframe_typos

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,England,Grenwich,63500,GBR
1,United States of America,Miami,436000,USA
2,Spain,Barcellona,5850000,ESP
3,Japan,Kyoto,1460000,JPN
4,Philippines,Manila,13500000,PHL
5,Germany,Hanover,538000,DEU
6,Netherlands,Groningen,235000,NLD
7,Türkiye,Istanbul,15800000,TUR
8,South Africa,Queenstown,68900,ZAF
9,United States of America,Phoenix,1650000,USA


Let us print out all of the embedded errors.

In [4]:
def _is_missing(value) -> bool:
  """Return True when ``value`` is a scalar missing marker (NaN, None, NaT, pd.NA)."""
  # Guard with is_scalar: pd.isna on a list/array cell returns an array, not a bool.
  return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def differences(clean_dataframe: pd.DataFrame, dirty_dataframe: pd.DataFrame) -> str:
  """List every cell in which ``dirty_dataframe`` deviates from ``clean_dataframe``.

  Cells are compared by position (row number, column number); index and column
  labels are not consulted. Two cells that are both missing (NaN/None/NaT/pd.NA)
  count as equal, a missing cell against a present value counts as a difference.

  Args:
    clean_dataframe: The ground truth frame.
    dirty_dataframe: The frame containing the embedded errors.

  Returns:
    A string starting with "Differences:" followed by one line per deviating
    cell, naming the dirty value, the actual value and the full clean row.

  Raises:
    ValueError: If the two frames do not have the same shape.
  """
  if clean_dataframe.shape != dirty_dataframe.shape:
    raise ValueError("The two dataframes have to have the same shape to be comparable!")

  report_lines = ["Differences:"]
  row_count, column_count = clean_dataframe.shape

  for row_pos in range(row_count):
    # Materialise each row once instead of once per cell.
    clean_row = clean_dataframe.iloc[row_pos]
    dirty_row = dirty_dataframe.iloc[row_pos]

    for column_pos in range(column_count):
      actual = clean_row.iloc[column_pos]
      dirty = dirty_row.iloc[column_pos]

      actual_missing = _is_missing(actual)
      dirty_missing = _is_missing(dirty)

      # NaN != NaN is True in Python, so missing-vs-missing needs an explicit pass.
      if actual_missing and dirty_missing:
        continue

      # Short-circuit on missing values so `!=` is never evaluated against pd.NA.
      if actual_missing or dirty_missing or actual != dirty:
        report_lines.append(
          'Dirty value: "%s", actual value: "%s" of row "%s"' % (dirty, actual, str(clean_row.values))
        )

  return "\n".join(report_lines)

# Print every cell in which the typo dataset deviates from the ground truth.
print(differences(clean_dataframe, dirty_dataframe_typos))

Differences:
Dirty value: "Grenwich", actual value: "Greenwich" of row "['England' 'Greenwich' 63500 'GBR']"
Dirty value: "Barcellona", actual value: "Barcelona" of row "['Spain' 'Barcelona' 5850000 'ESP']"
Dirty value: "Hanover", actual value: "Hannover" of row "['Germany' 'Hannover' 538000 'DEU']"
Dirty value: "Hunagry", actual value: "Hungary" of row "['Hungary' 'Budapest' 1760000 'HUN']"
Dirty value: "Ägypt", actual value: "Egypt" of row "['Egypt' 'New Cairo' 297000 'EGY']"
Dirty value: "Nigerria", actual value: "Nigeria" of row "['Nigeria' 'Lagos' 13500000 'NGA']"
Dirty value: "Lagas", actual value: "Lagos" of row "['Nigeria' 'Lagos' 13500000 'NGA']"
Dirty value: "Hatyai", actual value: "Hat Yai" of row "['Thailand' 'Hat Yai' 149000 'THA']"
Dirty value: "Unitedd Arab Emirates", actual value: "United Arab Emirates" of row "['United Arab Emirates' 'Dubai' 3140000 'ARE']"


The next degree of difficulty is simple semantic mismatching: wrong city names that do not belong to the associated country.

In [5]:
# Wrong-city variant of the ground truth: in selected rows the city does not
# belong to the listed country; some of those rows also carry a different
# population than the ground truth.
dirty_data_wrong_cities = [
    ["England", "Berlin", 3_645_000],
    ["United States of America", "Miami", 436_000],
    ["Spain", "Paris", 5_850_000],
    ["Japan", "Florida", 21_780_000],
    ["Philippines", "Manila", 13_500_000],
    ["Germany", "Hannover", 538_000],
    ["Netherlands", "Groningen", 235_000],
    ["Türkiye", "Nairobi", 4_397_000],
    ["South Africa", "Queenstown", 68_900],
    ["United States of America", "Phoenix", 1_650_000],
    ["China", "Casablanca", 29_200_000],
    ["Hungary", "Budapest", 1_760_000],
    ["Egypt", "New Cairo", 297_000],
    ["Mali", "Casablanca", 2_930_000],
    ["Nigeria", "Lagos", 13_500_000],
    ["Mexico", "Guadalajara", 5_040_000],
    ["Thailand", "Hat Yai", 149_000],
    ["United Arab Emirates", "Athens", 650_000],
    ["Ireland", "Shanghai", 26_320_000],
    ["Greece", "Rome", 3_150_000],
]

# Only the ISO code column is taken over from the ground truth frame;
# country and population stay exactly as listed above.
dirty_dataframe_wrong_cities = pd.DataFrame(
    dirty_data_wrong_cities,
    columns=["Country", "City", "Population"],
).assign(**{"ISO 3166-1 Alpha3 Code": clean_dataframe["ISO 3166-1 Alpha3 Code"]})

dirty_dataframe_wrong_cities

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,England,Berlin,3645000,GBR
1,United States of America,Miami,436000,USA
2,Spain,Paris,5850000,ESP
3,Japan,Florida,21780000,JPN
4,Philippines,Manila,13500000,PHL
5,Germany,Hannover,538000,DEU
6,Netherlands,Groningen,235000,NLD
7,Türkiye,Nairobi,4397000,TUR
8,South Africa,Queenstown,68900,ZAF
9,United States of America,Phoenix,1650000,USA


In [6]:
# Print every cell in which the wrong-city dataset deviates from the ground truth.
print(differences(clean_dataframe, dirty_dataframe_wrong_cities))

Differences:
Dirty value: "Berlin", actual value: "Greenwich" of row "['England' 'Greenwich' 63500 'GBR']"
Dirty value: "3645000", actual value: "63500" of row "['England' 'Greenwich' 63500 'GBR']"
Dirty value: "Paris", actual value: "Barcelona" of row "['Spain' 'Barcelona' 5850000 'ESP']"
Dirty value: "Florida", actual value: "Kyoto" of row "['Japan' 'Kyoto' 1460000 'JPN']"
Dirty value: "21780000", actual value: "1460000" of row "['Japan' 'Kyoto' 1460000 'JPN']"
Dirty value: "Nairobi", actual value: "Istanbul" of row "['Türkiye' 'Istanbul' 15800000 'TUR']"
Dirty value: "4397000", actual value: "15800000" of row "['Türkiye' 'Istanbul' 15800000 'TUR']"
Dirty value: "Casablanca", actual value: "Shanghai" of row "['China' 'Shanghai' 29200000 'CHN']"
Dirty value: "Casablanca", actual value: "Bamako" of row "['Mali' 'Bamako' 2930000 'MLI']"
Dirty value: "Athens", actual value: "Dubai" of row "['United Arab Emirates' 'Dubai' 3140000 'ARE']"
Dirty value: "650000", actual value: "3140000" of r

The last degree of difficulty is more advanced semantic mismatching in the form of wrong population numbers for the cities. We believe that solving this task is more difficult than associating city names with countries. It requires deeper knowledge of the cities, which probably occurs less frequently in casual conversations, and chat completion models are trained to tackle casual chat.

In [7]:
# Wrong-population variant of the ground truth: country and city names are
# correct, selected population figures are altered.
dirty_data_wrong_population = [
    ["England", "Greenwich", 635_000],
    ["United States of America", "Miami", 436_000],
    ["Spain", "Barcelona", 7_850_000],
    ["Japan", "Kyoto", 1_460_000],
    ["Philippines", "Manila", 13_500_000],
    ["Germany", "Hannover", 53_800],
    ["Netherlands", "Groningen", 235_000],
    ["Türkiye", "Istanbul", 15_800_000],
    ["South Africa", "Queenstown", 9],
    ["United States of America", "Phoenix", 1_650_000_000],
    ["China", "Shanghai", 29_200_000],
    ["Hungary", "Budapest", 2_460_000],
    ["Egypt", "New Cairo", 297_000],
    ["Mali", "Bamako", 7_930_000],
    ["Nigeria", "Lagos", 13_500_000],
    ["Mexico", "Guadalajara", 5_040_000],
    ["Thailand", "Hat Yai", 149_000],
    ["United Arab Emirates", "Dubai", 6_140_000],
    ["Ireland", "Dublin", 1_345_402],
    ["Greece", "Athens", 31_500],
]

# The ISO code column is taken over unchanged from the ground truth frame.
dirty_dataframe_wrong_population = pd.DataFrame(
    dirty_data_wrong_population,
    columns=["Country", "City", "Population"],
).assign(**{"ISO 3166-1 Alpha3 Code": clean_dataframe["ISO 3166-1 Alpha3 Code"]})

dirty_dataframe_wrong_population

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,England,Greenwich,635000,GBR
1,United States of America,Miami,436000,USA
2,Spain,Barcelona,7850000,ESP
3,Japan,Kyoto,1460000,JPN
4,Philippines,Manila,13500000,PHL
5,Germany,Hannover,53800,DEU
6,Netherlands,Groningen,235000,NLD
7,Türkiye,Istanbul,15800000,TUR
8,South Africa,Queenstown,9,ZAF
9,United States of America,Phoenix,1650000000,USA


In [8]:
# Print every cell in which the wrong-population dataset deviates from the ground truth.
print(differences(clean_dataframe, dirty_dataframe_wrong_population))

Differences:
Dirty value: "635000", actual value: "63500" of row "['England' 'Greenwich' 63500 'GBR']"
Dirty value: "7850000", actual value: "5850000" of row "['Spain' 'Barcelona' 5850000 'ESP']"
Dirty value: "53800", actual value: "538000" of row "['Germany' 'Hannover' 538000 'DEU']"
Dirty value: "9", actual value: "68900" of row "['South Africa' 'Queenstown' 68900 'ZAF']"
Dirty value: "1650000000", actual value: "1650000" of row "['United States of America' 'Phoenix' 1650000 'USA']"
Dirty value: "2460000", actual value: "1760000" of row "['Hungary' 'Budapest' 1760000 'HUN']"
Dirty value: "7930000", actual value: "2930000" of row "['Mali' 'Bamako' 2930000 'MLI']"
Dirty value: "6140000", actual value: "3140000" of row "['United Arab Emirates' 'Dubai' 3140000 'ARE']"
Dirty value: "31500", actual value: "3150000" of row "['Greece' 'Athens' 3150000 'GRC']"


As a final challenge, let us combine all three types of errors in one dataframe.

In [9]:
# Combined variant of the ground truth: typos, mismatched country/city pairs
# and altered population figures all occur in the same table.
dirty_data_all_errors = [
    ["England", "Greenwich", 63_500],
    ["United States of America", "Miamia", 436_000],
    ["France", "Barcelona", 5_850_000],
    ["Japan", "Kyoto", 146_000],
    ["Autralia", "Manila", 13_500_000],
    ["Germany", "Hannover", 538_000],
    ["Netherlands", "Groningen", 235_000],
    ["Türkiye", "Istanbul", 15_800_000],
    ["South Africa", "Queenstown", 68_900],
    ["United States of America", "Phoenix", 1_650_000],
    ["Japan", "Shanghai", 29_200_000],
    ["Hungary", "Budapest", 1_760_000],
    ["Egypt", "New Kairo", 297_000],
    ["Mali", "Bamako", 2_930_000],
    ["Nigeria", "La gos", 13_500_000],
    ["Mexico", "Guadalajara", 5_040_000],
    ["Thailand", "Hat Yai", 149_000],
    ["United Arab Emirates", "Dubai", 31_400],
    ["Ireland", "Dublin", 1_345_402],
    ["Greece", "Athens", 315_000_000],
]

# Only the ISO code column is taken over from the ground truth frame; the
# country column stays as listed above because it carries some of the errors.
dirty_dataframe_all_errors = pd.DataFrame(
    dirty_data_all_errors,
    columns=["Country", "City", "Population"],
).assign(**{"ISO 3166-1 Alpha3 Code": clean_dataframe["ISO 3166-1 Alpha3 Code"]})

dirty_dataframe_all_errors

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,England,Greenwich,63500,GBR
1,United States of America,Miamia,436000,USA
2,France,Barcelona,5850000,ESP
3,Japan,Kyoto,146000,JPN
4,Autralia,Manila,13500000,PHL
5,Germany,Hannover,538000,DEU
6,Netherlands,Groningen,235000,NLD
7,Türkiye,Istanbul,15800000,TUR
8,South Africa,Queenstown,68900,ZAF
9,United States of America,Phoenix,1650000,USA


In [10]:
# Print every cell in which the combined-error dataset deviates from the ground truth.
print(differences(clean_dataframe, dirty_dataframe_all_errors))

Differences:
Dirty value: "Miamia", actual value: "Miami" of row "['United States of America' 'Miami' 436000 'USA']"
Dirty value: "France", actual value: "Spain" of row "['Spain' 'Barcelona' 5850000 'ESP']"
Dirty value: "146000", actual value: "1460000" of row "['Japan' 'Kyoto' 1460000 'JPN']"
Dirty value: "Autralia", actual value: "Philippines" of row "['Philippines' 'Manila' 13500000 'PHL']"
Dirty value: "Japan", actual value: "China" of row "['China' 'Shanghai' 29200000 'CHN']"
Dirty value: "New Kairo", actual value: "New Cairo" of row "['Egypt' 'New Cairo' 297000 'EGY']"
Dirty value: "La gos", actual value: "Lagos" of row "['Nigeria' 'Lagos' 13500000 'NGA']"
Dirty value: "31400", actual value: "3140000" of row "['United Arab Emirates' 'Dubai' 3140000 'ARE']"
Dirty value: "315000000", actual value: "3150000" of row "['Greece' 'Athens' 3150000 'GRC']"


# Example Datasets

We need custom examples for these custom dataframes to use for few-shot prompting.

In [11]:
# Clean few-shot examples: [country, city, population, ISO 3166-1 alpha-3 code].
examples_clean_data = [
    ["Vietnam", "Hanoi", 8_050_000, "VNM"],
    ["New Zealand", "Wellington", 441_000, "NZL"],
    ["Saint Lucia", "Castries", 20_000, "LCA"],
    ["Uruguay", "Montevideo", 1_950_000, "URY"],
    ["Australia", "Canberra", 396_000, "AUS"],
]
examples_clean_dataframe = pd.DataFrame(
    examples_clean_data,
    columns=["Country", "City", "Population", "ISO 3166-1 Alpha3 Code"],
)
examples_clean_dataframe

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,Vietnam,Hanoi,8050000,VNM
1,New Zealand,Wellington,441000,NZL
2,Saint Lucia,Castries,20000,LCA
3,Uruguay,Montevideo,1950000,URY
4,Australia,Canberra,396000,AUS


In [12]:
# Few-shot examples with typos: misspelled country / city names only.
examples_dirty_data_typos = [
    ["Vietnan", "Hanoi", 8_050_000, "VNM"],
    ["New Zealand", "Wellington", 441_000, "NZL"],
    ["Saint Lucia", "Castryes", 20_000, "LCA"],
    ["Uruguay", "Montevideo", 1_950_000, "URY"],
    ["Australia", "Canberra", 396_000, "AUS"],
]
examples_dataframe_typos = pd.DataFrame(
    examples_dirty_data_typos,
    columns=["Country", "City", "Population", "ISO 3166-1 Alpha3 Code"],
)
examples_dataframe_typos

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,Vietnan,Hanoi,8050000,VNM
1,New Zealand,Wellington,441000,NZL
2,Saint Lucia,Castryes,20000,LCA
3,Uruguay,Montevideo,1950000,URY
4,Australia,Canberra,396000,AUS


In [15]:
# Print every cell in which the typo examples deviate from the clean examples.
print(differences(examples_clean_dataframe, examples_dataframe_typos))

Differences:
Dirty value: "Vietnan", actual value: "Vietnam" of row "['Vietnam' 'Hanoi' 8050000 'VNM']"
Dirty value: "Castryes", actual value: "Castries" of row "['Saint Lucia' 'Castries' 20000 'LCA']"


In [14]:
# Few-shot examples with mismatched country / city pairs.
examples_dirty_data_wrong_cities = [
    ["Vietnam", "Paris", 8_050_000, "VNM"],
    ["New Zealand", "Wellington", 441_000, "NZL"],
    ["Saint Lucia", "Castries", 20_000, "LCA"],
    ["Estonia", "Montevideo", 1_950_000, "URY"],
    ["Australia", "Canberra", 396_000, "AUS"],
]
examples_dirty_dataframe_wrong_cities = pd.DataFrame(
    examples_dirty_data_wrong_cities,
    columns=["Country", "City", "Population", "ISO 3166-1 Alpha3 Code"],
)
examples_dirty_dataframe_wrong_cities

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,Vietnam,Paris,8050000,VNM
1,New Zealand,Wellington,441000,NZL
2,Saint Lucia,Castries,20000,LCA
3,Estonia,Montevideo,1950000,URY
4,Australia,Canberra,396000,AUS


In [31]:
# Few-shot examples with altered population figures.
examples_dirty_data_wrong_population = [
    ["Vietnam", "Hanoi", 8_050_000, "VNM"],
    ["New Zealand", "Wellington", 441_000, "NZL"],
    ["Saint Lucia", "Castries", 200_000, "LCA"],
    ["Uruguay", "Montevideo", 1, "URY"],
    ["Australia", "Canberra", 396_000, "AUS"],
]
examples_dirty_dataframe_wrong_population = pd.DataFrame(
    examples_dirty_data_wrong_population,
    columns=["Country", "City", "Population", "ISO 3166-1 Alpha3 Code"],
)
examples_dirty_dataframe_wrong_population

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,Vietnam,Hanoi,8050000,VNM
1,New Zealand,Wellington,441000,NZL
2,Saint Lucia,Castries,200000,LCA
3,Uruguay,Montevideo,1,URY
4,Australia,Canberra,396000,AUS


In [32]:
# Few-shot examples combining typos, mismatched pairs and altered populations.
examples_dirty_data_all_errors = [
    ["Vietnam", "Paris", 8_050_000, "VNM"],
    ["New Zealand", "Welington", 441_000, "NZL"],
    ["Saint Lucia", "Castries", 200_000, "LCA"],
    ["Estonia", "Montevideeo", 1, "URY"],
    ["Australia", "Canberra", 396_000, "AUS"],
]
examples_dirty_dataframe_all_errors = pd.DataFrame(
    examples_dirty_data_all_errors,
    columns=["Country", "City", "Population", "ISO 3166-1 Alpha3 Code"],
)
examples_dirty_dataframe_all_errors

Unnamed: 0,Country,City,Population,ISO 3166-1 Alpha3 Code
0,Vietnam,Paris,8050000,VNM
1,New Zealand,Welington,441000,NZL
2,Saint Lucia,Castries,200000,LCA
3,Estonia,Montevideeo,1,URY
4,Australia,Canberra,396000,AUS


# Download

In [35]:

# Write every dataset built above to a CSV file in the working directory.
# index=False keeps the positional row index out of the files.

# custom dataset
clean_dataframe.to_csv("clean_dataframe.csv", index=False)
dirty_dataframe_typos.to_csv("dirty_dataframe_typos.csv", index=False)
# NOTE(review): this file name lacks the "dataframe" infix the other files use;
# it is kept unchanged so existing readers of the file keep working.
dirty_dataframe_wrong_cities.to_csv("dirty_data_wrong_cities.csv", index=False)
# Previously missing: the wrong-population dataset was built but never exported.
dirty_dataframe_wrong_population.to_csv("dirty_dataframe_wrong_population.csv", index=False)
dirty_dataframe_all_errors.to_csv("dirty_dataframe_all_errors.csv", index=False)

# corresponding examples
examples_clean_dataframe.to_csv("examples_clean_dataframe.csv", index=False)
examples_dataframe_typos.to_csv("examples_dataframe_typos.csv", index=False)
examples_dirty_dataframe_wrong_cities.to_csv("examples_dirty_dataframe_wrong_cities.csv", index=False)
# Previously missing: the wrong-population examples were built but never exported.
examples_dirty_dataframe_wrong_population.to_csv("examples_dirty_dataframe_wrong_population.csv", index=False)
examples_dirty_dataframe_all_errors.to_csv("examples_dirty_dataframe_all_errors.csv", index=False)
