In [7]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('../src/data/processed/stops.csv')

# Extract the first whitespace-delimited word of each street_name.
# Missing values are dropped first; the remaining values are coerced to str so
# a stray non-string entry cannot break the `.str` accessor. `.str[0]` yields
# NaN for blank / whitespace-only names (where `x.split()[0]` would raise
# IndexError), so the second dropna() discards those instead of crashing.
street_names = df['street_name'].dropna().astype(str)
first_words = street_names.str.split().str[0]
prefixes = first_words.dropna().unique()

# Sort alphabetically (in place on the array returned by unique())
prefixes.sort()

print("Unique street name prefixes:")
for prefix in prefixes:
    print(f"- {prefix}")

Unique street name prefixes:
- 31
- 32
- Bdr
- Bulatan
- Central
- Cke
- Dalam
- Damansara
- Dataran
- Drop-off
- Duke
- Jalan
- Jin
- Jln5/46
- Jsm
- LDP
- Leboh
- Lebuh
- Lebuhraya
- Lingkaran
- Lingkungan
- Linkungan
- Lksa
- Lorong
- MRR2
- Medan
- Persiaran
- Pinggiran
- Pintasan
- Pssaas
- Puchong
- SPRINT
- Sbe
- Silk
- Solok
- Spg
- Sri
- Ss2/2
- Susur
- Taman
- Tengkat


In [20]:
import pandas as pd
import re
from name_rules import NAME_RULES

def is_all_consonants(word):
    """Return True when `word` has at least two ASCII letters and none is a vowel.

    Every character other than a-z / A-Z (digits, punctuation, slashes, ...)
    is ignored before the check, so "3rd" is judged on "rd" alone. Only
    a/e/i/o/u (either case) count as vowels; "y" is treated as a consonant.
    """
    letters_only = re.sub(r'[^a-zA-Z]', '', word)

    # Too short to be considered an abbreviation
    if len(letters_only) < 2:
        return False

    has_vowel = re.search(r'[aeiouAEIOU]', letters_only) is not None
    return not has_vowel

# Read the CSV file
df = pd.read_csv('../src/data/processed/stops.csv')

# Known words from NAME_RULES, upper-cased so the membership test below is
# case-insensitive however the rules happen to be spelled.
# NOTE(review): assumes NAME_RULES['uppercase'] is an iterable of words and
# NAME_RULES['street_types'] is a dict keyed by words -- confirm in name_rules.
existing_words = {
    str(known).upper()
    for known in (*NAME_RULES['uppercase'], *NAME_RULES['street_types'].keys())
}

# Maps each flagged token (exactly as written in the data) to the list of
# stop contexts in which it appears
new_abbr_contexts = {}

# Walk the four needed columns in parallel instead of building a Series per
# row with iterrows()
rows = zip(df['stop_name'], df['street_name'], df['latitude'], df['longitude'])
for raw_stop, raw_street, lat, lon in rows:
    # str() keeps missing values printable (they show up as "nan")
    stop_name = str(raw_stop)
    street_name = str(raw_street)

    # Check words in stop name and street name
    for word in f"{stop_name} {street_name}".split():
        candidate = word.upper()
        if not is_all_consonants(candidate):
            continue

        # is_all_consonants ignores non-letters, so the lookup must too:
        # strip surrounding punctuation so that e.g. "(TBS)" is recognised
        # when "TBS" is already a known word.
        bare = re.sub(r'^[\W_]+|[\W_]+$', '', candidate)
        if candidate in existing_words or bare in existing_words:
            continue

        new_abbr_contexts.setdefault(word, []).append({
            'full_stop_name': stop_name,
            'street_name': street_name,
            'location': f"({lat}, {lon})"
        })

# Print results
print("New words containing only consonants (not in NAME_RULES) with context:")
for word, contexts in sorted(new_abbr_contexts.items()):
    print(f"\n{word}:")
    for ctx in contexts:
        print(f"  - Stop: {ctx['full_stop_name']}")
        print(f"    Street: {ctx['street_name']}")
        print(f"    Location: {ctx['location']}")

New words containing only consonants (not in NAME_RULES) with context:

(JKP:
  - Stop: Pusat Komuniti Rakyat (JKP Zon 24)
    Street: Jalan Sb Indah 2
    Location: (3.02966, 101.72828)

(Rk1):
  - Stop: Apartment Presint 17 (Rk1)
    Street: Jalan P17
    Location: (2.927556, 101.705905)

(TBS):
  - Stop: Terminal Bersepadu Selatan (TBS)
    Street: Jalan Terminal Selatan
    Location: (3.077336, 101.711494)

(tbc):
  - Stop: Taman Kota Perdana 3 (Opp)
    Street: Persiaran Kota Perdana (tbc)
    Location: (2.989965, 101.65967)

1/ks7:
  - Stop: Pangsapuri Sri Ayu
    Street: Lebuh Kebun Nenas 1/ks7
    Location: (2.994351, 101.492657)
  - Stop: Taman Camelia
    Street: Lebuh Kebun Nenas 1/ks7
    Location: (2.995259, 101.492097)

31/bb:
  - Stop: SMK Kota Kemuning (Utara)
    Street: Jalan Anggerik Doritis 31/bb
    Location: (2.9859331922853, 101.54094875077)

3rd:
  - Stop: Rivercity 3rd Mile
    Street: Jalan Sultan Azlan Shah
    Location: (3.180075, 101.683043)
  - Stop: 3rd M