In [1]:
import pandas as pd
import json
import os
import re

In [None]:
DATA_DIR = os.path.join(
	os.getcwd(),
	"data/data/downloaded_e2e_dataset"
)

for filename in os.listdir(DATA_DIR):
	filepath = os.path.join(DATA_DIR, filename)
	if not filename.endswith(".json"):
		continue
	with open(filepath, "r") as f:
		json_file = json.load(f)
	if json_file.get("url") == "https://github.com/tuetschek/e2e-cleaning/raw/master/cleaned-data/train-fixed.no-ol.csv":
		data_filepath = filepath.replace(".json", "")

In [15]:
data_df = pd.read_csv(data_filepath)
data_df.head()

Unnamed: 0,mr,ref,fixed,orig_mr
0,"name[The Eagle], eatType[coffee shop], food[Ja...",The Eagle is a low rated coffee shop near Burg...,0,"name[The Eagle], eatType[coffee shop], food[Ja..."
1,"name[The Mill], eatType[coffee shop], food[Fre...",Located near The Sorrento is a French Theme ea...,1,"name[The Mill], eatType[coffee shop], food[Fre..."
2,"name[Loch Fyne], food[French], area[riverside]...","For luxurious French food, the Loch Fyne is lo...",1,"name[Loch Fyne], food[French], customer rating..."
3,"name[The Rice Boat], eatType[restaurant], food...",The Rice Boat is an adult French restaurant wi...,1,"name[The Rice Boat], food[French], customer ra..."
4,"name[The Wrestlers], eatType[coffee shop], foo...",The Wrestlers coffee shop serves Japanese food...,0,"name[The Wrestlers], eatType[coffee shop], foo..."


In [None]:
for i, row in data_df.iterrows():
	if row["mr"] != row["orig_mr"]:
		print(row["mr"])
		print(row["orig_mr"])
		print(row["ref"])
		break

name[The Mill], eatType[coffee shop], food[French], area[riverside], near[The Sorrento]
name[The Mill], eatType[coffee shop], food[French], priceRange[£20-25], area[riverside], near[The Sorrento]
Located near The Sorrento is a French Theme eatery and coffee shop called The Mill, with a price range at £20-£25 it is in the riverside area.


In [16]:
data_df.drop(["mr"], inplace=True, axis=1)
data_df.head()

Unnamed: 0,ref,fixed,orig_mr
0,The Eagle is a low rated coffee shop near Burg...,0,"name[The Eagle], eatType[coffee shop], food[Ja..."
1,Located near The Sorrento is a French Theme ea...,1,"name[The Mill], eatType[coffee shop], food[Fre..."
2,"For luxurious French food, the Loch Fyne is lo...",1,"name[Loch Fyne], food[French], customer rating..."
3,The Rice Boat is an adult French restaurant wi...,1,"name[The Rice Boat], food[French], customer ra..."
4,The Wrestlers coffee shop serves Japanese food...,0,"name[The Wrestlers], eatType[coffee shop], foo..."


In [None]:
for i, row in data_df.head(10).iterrows():
	print("***")
	print(f"ref: {row["ref"]}")
	print(f"mr: {row["orig_mr"]}")

***
ref: The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.
mr: name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]
***
ref: Located near The Sorrento is a French Theme eatery and coffee shop called The Mill, with a price range at £20-£25 it is in the riverside area.
mr: name[The Mill], eatType[coffee shop], food[French], priceRange[£20-25], area[riverside], near[The Sorrento]
***
ref: For luxurious French food, the Loch Fyne is located by the river next to The Rice Boat.
mr: name[Loch Fyne], food[French], customer rating[high], area[riverside], near[The Rice Boat]
***
ref: The Rice Boat is an adult French restaurant with high customer rating  located in the Riverside area.
mr: name[The Rice Boat], food[French], customer rating[average], area[riverside], familyFriendly[no]
***
ref: The Wrestlers 

In [None]:
for r, row in data_df.iterrows():
	if "name[Taste of Cambridge]" in row["orig_mr"] and "eatType[restaurant]" in row["orig_mr"]:
		print(row["ref"])

Taste of Cambridge is a restaurant with a customer rating of 3 out of 5 and and a price range of £20-£25
Taste of Cambridge is a restaurant ranging at less than 20 pounds. It has low ratings.
Moderately priced, Taste of Cambridge is a restaurant with an unfortunately low customer rating.
With prices ranging less than £20, Taste of Cambridge is a restaurant with average customer ratings.
Taste of Cambridge is an average rated and cheap restaurant option.
Taste of Cambridge is a low rated restaurant offering food for less than £20.
For a cheap restaurant with a customer rating of 5 out of 5, try Taste of Cambridge.
Taste of Cambridge is a restaurant that has a price range of £20-25, and a customer rating of 3 out of 5.
Taste of Cambridge is a cheap restaurant. It has a customer rating of 5 out of 5.
Taste of Cambridge is a low rated restaurant in the price range of less than £20.
Taste of Cambridge, a restaurant in the £30 or more price range, has a 5 out of 5 customer rating.
A restaura

In [None]:
def get_attributes(df: pd.DataFrame):
	attributes = set()
	area_count = 0
	food_count = 0
	near_count = 0
	price_range_count = 0
	customer_rating = 0
	family_friendly_count = 0
	eat_type_count = 0
	name_count = 0
	for i, row in df.iterrows():
		text = row["mr"]
		
		for element in text.split(","):
			attribute = element.split("[")[0].strip()
			if attribute == "food":
				food_count += 1
			elif attribute == "near":
				near_count += 1
			elif attribute == "priceRange":
				price_range_count += 1
			elif attribute == "area":
				area_count += 1
			elif attribute == "customer rating":
				customer_rating += 1
			elif attribute == "familyFriendly":
				family_friendly_count += 1
			elif attribute == "eatType":
				eat_type_count += 1
			elif attribute == "name":
				name_count += 1
			attributes.add(attribute)

	print(f"Did not find area attribute in {len(df) - area_count} rows")
	print(f"Did not find food attribute in {len(df) - food_count} rows")
	print(f"Did not find near attribute in {len(df) - near_count} rows")
	print(f"Did not find price_range attribute in {len(df) - price_range_count} rows")
	print(f"Did not find customer_rating attribute in {len(df) - customer_rating} rows")
	print(f"Did not find family_friendly attribute in {len(df) - family_friendly_count} rows")
	print(f"Did not find eat_type attribute in {len(df) - eat_type_count} rows")
	print(f"Did not find name attribute in {len(df) - name_count} rows")

	return attributes

In [20]:
print(len(data_df))

33525


In [17]:
get_attributes(data_df)

Did not find area attribute in 13520 rows
Did not find food attribute in 4551 rows
Did not find near attribute in 19240 rows
Did not find price_range attribute in 10004 rows
Did not find customer_rating attribute in 10253 rows
Did not find family_friendly attribute in 12762 rows
Did not find eat_type attribute in 17645 rows
Did not find name attribute in 0 rows


{'area',
 'customer rating',
 'eatType',
 'familyFriendly',
 'food',
 'name',
 'near',
 'priceRange'}

In [37]:
def get_unique_attribute_values(df: pd.DataFrame, attr: str):
	attrs = set()
	for _, row in df.iterrows():
		pattern = rf"{attr}\[(?P<attr_type>[^\]]+)\]"
		pattern_match = re.search(pattern, row["orig_mr"])
		if pattern_match:
			attr_type = pattern_match.group("attr_type").strip()
			attrs.add(attr_type)

	return attrs

In [None]:
name_attrs = get_unique_attribute_values(data_df, "name")

with open("data/unique_names.txt", "w") as f:
	f.write("\n".join(name_attrs))

In [15]:
data_df.to_csv("data/train.csv")