In [None]:
import geopandas as gpd
import os
import pandas as pd

# Folder containing the OSM shapefile layers (.shp files)
folder_path = './data/useful-poi'

# Layer name (filename without extension) -> attribute table as a plain DataFrame
dataframes = {}

# Read every .shp file in the folder; sorted() makes the load order reproducible
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".shp"):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        # Read the .shp file using geopandas
        gdf = gpd.read_file(file_path)
        # Keep only the attribute table: the geometry column is not needed here
        df = pd.DataFrame(gdf.drop(columns='geometry'))
        # Use the filename without extension as the dictionary key
        dataframes[os.path.splitext(filename)[0]] = df

# Fail early with a readable message instead of a bare KeyError further down
required_layers = [
    'gis_osm_landuse_a_free_1',
    'gis_osm_buildings_a_free_1',
    'gis_osm_places_free_1',
    'gis_osm_pois_free_1',
    'gis_osm_roads_free_1',
    'gis_osm_transport_free_1',
]
missing_layers = [layer for layer in required_layers if layer not in dataframes]
if missing_layers:
    raise FileNotFoundError(
        f"Missing shapefile layer(s) in {folder_path}: {', '.join(missing_layers)}"
    )

# Assign the dataframes to specific variables
landuse = dataframes['gis_osm_landuse_a_free_1']
buildings = dataframes['gis_osm_buildings_a_free_1']
places = dataframes['gis_osm_places_free_1']
pois = dataframes['gis_osm_pois_free_1']
roads = dataframes['gis_osm_roads_free_1']
transport = dataframes['gis_osm_transport_free_1']

# Extract the columns we need from every layer (same selection for all six)
columns_needed = ['fclass', 'osm_id', 'name']
frames = [layer[columns_needed] for layer in (landuse, buildings, places, pois, roads, transport)]
(landuse_subset, buildings_subset, places_subset,
 pois_subset, roads_subset, transport_subset) = frames

# Merge all DataFrames; ignore_index avoids repeated row labels, since each
# layer's own index restarts and would otherwise collide in the merged frame
combined_osm = pd.concat(frames, ignore_index=True)

# Drop rows with missing 'name' values
combined_osm = combined_osm.dropna(subset=['name'])

In [None]:
# Preview the first 10 rows of the merged OSM table (rows without a name were dropped).
combined_osm.head(10)

In [None]:
def clean_name(name):
    """Strip punctuation that may cause formatting issues from a place name.

    Each fragment below is removed in the listed order (order matters because
    the multi-character " - " is removed after the first four symbols), then
    leading and trailing whitespace is trimmed.

    Args:
        name: The raw place name (a string).

    Returns:
        The cleaned name.
    """
    unwanted = ('"', "#", "$", "%", " - ", ",", "`", "(", ")", "*", "+", ".", "?")
    cleaned = name
    for fragment in unwanted:
        # Splitting on the fragment and re-joining deletes every occurrence
        cleaned = "".join(cleaned.split(fragment))
    return cleaned.strip()

# Store the cleaned form of every name next to the original in 'location_name'
combined_osm['location_name'] = combined_osm['name'].map(clean_name)

In [None]:
# (rows, columns) of the merged table.
combined_osm.shape

In [None]:
# Preview the first 10 rows again, now including the cleaned 'location_name' column.
combined_osm.head(10)

In [None]:
# Drop everyday retail / food categories by 'fclass'.
# The shop class is spelled 'convenience'; the previous value 'convince'
# matched nothing, so convenience stores were never filtered out.
excluded_fclasses = ['convenience', 'clothes', 'supermarket', 'pharmacy', 'fast_food', 'cafe', 'bar']
filtered_osm = combined_osm[~combined_osm['fclass'].isin(excluded_fclasses)]

# Drop rows where 'fclass' is 'building' and the 'name' field contains only one word
is_single_word_building = (
    (filtered_osm['fclass'] == 'building')
    & (filtered_osm['name'].str.split().str.len() == 1)
)
filtered_osm = filtered_osm[~is_single_word_building]

# Drop rows where the 'name' field contains only digits
filtered_osm = filtered_osm[~filtered_osm['name'].str.match(r'^\d+$')]

# Set of cleaned names used for membership tests by the matcher
location_set = set(filtered_osm['location_name'])

import nltk
from nltk import word_tokenize

def match_longest_place_names(text, location_set):
    """Find known place names in ``text``, preferring the longest span.

    The text is tokenised with ``word_tokenize``. For each start token the
    candidate spans are tried from longest to shortest; the first span whose
    space-joined form is in ``location_set`` is recorded, unless that string
    is a substring of a name recorded earlier (which also suppresses repeats).

    Args:
        text: The text to scan.
        location_set: Collection of known place names supporting ``in``.

    Returns:
        List of matched names in order of first appearance.
    """
    words = word_tokenize(text)
    total = len(words)
    found = []
    for start in range(total):
        # Longest span first: end runs from `total` down to start + 1
        for end in reversed(range(start + 1, total + 1)):
            phrase = ' '.join(words[start:end])
            if phrase not in location_set:
                continue
            if any(phrase in earlier for earlier in found):
                # Already covered by an earlier match; try a shorter span
                continue
            found.append(phrase)
            break
    return found

In [None]:
# Smoke test: run the matcher on a hand-written neighbourhood description.
tt = '''
Finsbury Park is a friendly melting pot community composed of Turkish, French, Spanish, Middle Eastern,
Irish and English families. 
We have a wonderful variety of international restaurants directly under us on Stroud Green Road.
And there are many shops and large Tescos supermarket right next door. 
But you can also venture up to Crouch End and along Greens Lanes where there will endless choice 
of Turkish and Middle Eastern cuisines.
'''

# The cell output is the list of place names found in the sample text.
match_longest_place_names(tt, location_set)


In [None]:
# neighborhood_overview: build the text corpus from the listings' `neighborhood_overview` column

In [None]:
import re

def clean_text(text):
    """Normalise a free-text description before tokenisation.

    HTML tags and URLs are removed, every character that is not a word
    character, whitespace, a period or a comma is dropped, and runs of
    whitespace are collapsed to single spaces.

    Args:
        text: The raw description (a string).

    Returns:
        The cleaned, stripped string.

    Raises:
        TypeError: If ``text`` is not a string (e.g. a float NaN); raised by
            ``re.sub``.
    """
    # Replace HTML tags with a space rather than nothing, so that words on
    # either side of a tag ("families.<br />We") are not glued together.
    # The extra spaces are collapsed below.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove every character that is NOT a word character, whitespace,
    # a period or a comma (those four kinds are kept)
    text = re.sub(r'[^\w\s.,]', '', text)
    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove leading and trailing spaces
    return text.strip()

# Load the CSV file
train_data = pd.read_csv('listings_d.csv')

# Get the 'neighborhood_overview' column as a list
text_data = train_data['neighborhood_overview'].tolist()

# Clean the text data. Only string entries are cleaned; non-string values
# (such as the float NaN pandas uses for empty cells) are skipped explicitly
# instead of being swallowed by a bare `except`.
ok_data = [clean_text(raw_text) for raw_text in text_data if isinstance(raw_text, str)]

# Number of overviews that were cleaned
len(ok_data)


In [None]:
# Average number of space-separated words per overview, and the overview count.
# NOTE(review): this rebinds `ok_data` to the RAW strings from `text_data`,
# replacing the clean_text() output computed in the previous cell. Confirm
# that the matching below is meant to run on uncleaned text.
sum_len = 0
ok_data = []
for i in text_data:  # was `text_Data`, which raises NameError on a fresh kernel
    if not isinstance(i, str):
        # Skip non-string entries (e.g. NaN) explicitly rather than via a bare except
        continue
    sum_len += len(i.split(' '))
    ok_data.append(i)

# Guard the division so an empty corpus does not raise ZeroDivisionError
avg_len = sum_len / len(ok_data) if ok_data else 0.0
print(avg_len, len(ok_data))

In [None]:
from tqdm import tqdm
# Keep only the overviews in which at least one known place name is found,
# storing each text alongside its list of matched names.
texts, labels = [], []
for tt in tqdm(ok_data):
    locationsi = match_longest_place_names(tt, location_set)
    if not locationsi:
        continue
    texts.append(tt)
    labels.append(locationsi)

In [None]:
# Print every kept text with its matched place names for manual inspection
for text, label in zip(texts, labels):
    for caption, value in (("Text:", text), ("Labels:", label)):
        print(caption, value)
    print("\n")

In [None]:
# Number of kept texts and of label lists; both lists are appended to together, so the counts should be equal.
print(len(texts), len(labels))

In [None]:
import pickle

# Persist the matched texts and their place-name lists as separate pickle files
for pickle_path, payload in (('texts.pkl', texts), ('labels.pkl', labels)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

# Training Data Build

In [None]:
from transformers import BertTokenizer

# Module-level tokenizer used by the labelling code below.
# NOTE(review): '../bert-base-uncased' is presumably a local copy of the
# bert-base-uncased checkpoint; confirm the path exists relative to the notebook.
tokenizer = BertTokenizer.from_pretrained('../bert-base-uncased')

def tokenize_and_label(text, location_list, len_tokens):
    """Build a BIO tag sequence marking place names in a tokenised text.

    For every location, the first occurrence in ``text`` is located with
    ``str.find``. The character offset is converted to a token offset by
    tokenising the prefix of ``text`` before it with the module-level
    ``tokenizer``. The first token of the span is tagged 'B-LOC' and the
    following ones 'I-LOC'. A location is skipped when it is empty, not found
    in the text, starts beyond the label sequence, or starts on a token that
    is already tagged.

    Args:
        text: The original text.
        location_list: Place-name strings to tag.
        len_tokens: Length of the label sequence to produce (the number of
            tokens of ``text``).

    Returns:
        A list of ``len_tokens`` tags, each 'O', 'B-LOC' or 'I-LOC'.
    """
    labels = ['O'] * len_tokens
    for location in location_list:
        if not location:
            # str.find('') returns 0 and would wrongly tag the first token
            continue
        # Character offset of the first occurrence of the location
        start_idx = text.find(location)
        if start_idx == -1:
            continue
        # Convert the character offset to a token offset
        start_token_idx = len(tokenizer.tokenize(text[:start_idx]))
        if start_token_idx >= len_tokens:
            # The prefix produced more tokens than the label sequence holds
            # (for example a match that begins inside a word); indexing
            # would raise IndexError, so skip this location
            continue
        if labels[start_token_idx] != 'O':
            continue
        # Mark the start of the location with 'B-LOC'
        labels[start_token_idx] = 'B-LOC'
        # Mark the remaining tokens with 'I-LOC', clamped to the sequence end
        end_token_idx = min(start_token_idx + len(tokenizer.tokenize(location)), len_tokens)
        for i in range(start_token_idx + 1, end_token_idx):
            labels[i] = 'I-LOC'

    return labels

In [None]:
# Smoke test on a raw overview (HTML tags included): match place names, then build BIO labels for its tokens.
tt = 'Finsbury Park is a friendly melting pot community composed of Turkish, French, Spanish, Middle Eastern, Irish and English families. <br />We have a wonderful variety of international restaurants directly under us on Stroud Green Road. And there are many shops and large Tescos supermarket right next door. <br /><br />But you can also venture up to Crouch End and along Greens Lanes where there will endless choice of Turkish and Middle Eastern cuisines.'
locationsi = match_longest_place_names(tt, location_set)
tokens = tokenizer.tokenize(tt)
# The cell output is the label list, one tag per token
tokenize_and_label(tt, locationsi, len(tokens))

In [None]:
import tqdm

# One BIO label sequence per kept text, in the same order as `texts`
all_label_bio = []
# zip() has no length, so pass total explicitly; without it tqdm cannot
# show a percentage or an ETA
for text, location in tqdm.tqdm(zip(texts, labels), total=len(texts)):
    tokens = tokenizer.tokenize(text)
    labelsbio = tokenize_and_label(text, location, len(tokens))
    all_label_bio.append(labelsbio)

In [None]:
# One label sequence is appended per text, so these two counts should match.
print(len(texts), len(all_label_bio))

In [None]:
import pickle

# Persist the BIO label sequences next to the pickled texts and labels
with open('all_label_bio.pkl', mode='wb') as bio_file:
    pickle.dump(all_label_bio, bio_file)