In [15]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import torch
from transformers import BertModel, BertTokenizer
import numpy as np

In [16]:
# Expanded list of 100 categories
categories = [
    "Restaurant", "Swimming Pool", "Tourist Attraction", "Museum", "Hotel", 
    "Park", "Hospital", "School", "University", "Library", 
    "Shopping Mall", "Cinema", "Zoo", "Theater", "Stadium", 
    "Airport", "Train Station", "Bus Station", "Beach", "Amusement Park",
    "Bakery", "Bar", "Beauty Salon", "Bicycle Store", "Book Store", 
    "Bowling Alley", "Cafe", "Campground", "Car Dealer", "Car Rental", 
    "Car Repair", "Car Wash", "Casino", "Cemetery", "Church", 
    "City Hall", "Clothing Store", "Convenience Store", "Courthouse", "Dentist", 
    "Department Store", "Doctor", "Electronics Store", "Embassy", "Fire Station", 
    "Florist", "Funeral Home", "Furniture Store", "Gas Station", "Gym", 
    "Hair Care", "Hardware Store", "Home Goods Store", "Insurance Agency", "Jewelry Store", 
    "Laundry", "Lawyer", "Liquor Store", "Local Government Office", "Locksmith", 
    "Lodging", "Meal Delivery", "Meal Takeaway", "Mosque", "Movie Rental", 
    "Movie Theater", "Moving Company", "Night Club", "Painter", "Pet Store", 
    "Pharmacy", "Physiotherapist", "Plumber", "Police", "Post Office", 
    "Real Estate Agency", "Restaurant", "Roofing Contractor", "RV Park", "School", 
    "Shoe Store", "Shopping Mall", "Spa", "Stadium", "Storage", 
    "Store", "Subway Station", "Supermarket", "Synagogue", "Taxi Stand", 
    "Train Station", "Travel Agency", "University", "Veterinary Care", "Zoo"
]

In [17]:
# Function to drop unnecessary columns
def drop_columns(df):
    columns_to_drop = [
        'Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'city', 'verified', 'state', 'Friday_morning', 'Friday_afternoon', 'Friday_evening',
        'Saturday_morning', 'Saturday_afternoon', 'Saturday_evening', 'Sunday_morning',
        'Sunday_afternoon', 'Sunday_evening', 'Monday_morning', 'Monday_afternoon',
        'Monday_evening', 'Tuesday_morning', 'Tuesday_afternoon', 'Tuesday_evening',
        'Wednesday_morning', 'Wednesday_afternoon', 'Wednesday_evening', 'Thursday_morning',
        'Thursday_afternoon', 'Thursday_evening', 'geo_cluster', 'place_id', 'place_link'
    ]
    df = df.drop(columns=columns_to_drop)
    return df


# Function to preprocess the data
def preprocess_data(df):
    df['phone_number'] = df['phone_number'].apply(lambda x: 1 if pd.notnull(x) and x != '' else 0)
    df['website'] = df['website'].apply(lambda x: 1 if pd.notnull(x) and x != '' else 0)
    def clean_name(name):
        name = re.sub(r'[,/_]', ' ', name)
        name = re.sub(r'(?<!^)(?=[A-Z])', ' ', name)
        name = re.sub(r'\d+', '', name)
        name = name.lower()
        return name
    df['clean_name'] = df['name'].apply(clean_name)
    return df


In [18]:

# Function to get embeddings
def get_embeddings(texts, model, tokenizer, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

In [19]:
# Function to assign categories based on similarity
def assign_category(embedding, category_embeddings, threshold=0.4):
    similarities = cosine_similarity(embedding.reshape(1, -1), category_embeddings).flatten()
    max_sim_index = similarities.argmax()
    max_sim_value = similarities.max()
    if max_sim_value >= threshold:
        return categories[max_sim_index]
    else:
        return ""

In [20]:
# Load the dataset
file_path = './data.csv'
df = pd.read_csv(file_path)

# Drop unnecessary columns and preprocess the data
df = drop_columns(df)
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

In [21]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [22]:
category_embeddings = get_embeddings(categories, model, tokenizer)

In [23]:
# Create a DataFrame for the categories
categories_df = pd.DataFrame(categories, columns=['category'])
categories_df['clean_category'] = categories_df['category'].apply(lambda x: x.lower())

# Combine names and categories for TF-IDF vectorization
combined_texts = train_df['clean_name'].tolist() + categories_df['clean_category'].tolist()

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

# Split the TF-IDF matrix
names_tfidf = tfidf_matrix[:len(train_df)]
categories_tfidf = tfidf_matrix[len(train_df):]

# Calculate cosine similarity
similarity_matrix = cosine_similarity(names_tfidf, categories_tfidf)

In [24]:
# Assign categories based on TF-IDF similarity
def assign_category_tfidf(similarity_row, threshold=0.8):
    max_sim_index = similarity_row.argmax()
    max_sim_value = similarity_row.max()
    if max_sim_value >= threshold:
        return categories[max_sim_index]
    else:
        return None

In [25]:
train_df['category_tfidf'] = [assign_category_tfidf(row) for row in similarity_matrix]

In [26]:
# For names that don't meet the TF-IDF threshold, use BERT embeddings
def categorize_with_fallback(df):
    # Get indices of rows where category_tfidf is None
    fallback_indices = df[df['category_tfidf'].isnull()].index
    fallback_texts = df.loc[fallback_indices, 'clean_name'].tolist()

    # Get embeddings for fallback texts
    fallback_embeddings = get_embeddings(fallback_texts, model, tokenizer)
    
    # Assign categories based on BERT embeddings
    fallback_categories = [assign_category(emb, category_embeddings) for emb in fallback_embeddings]
    
    # Update the dataframe with the fallback categories
    df.loc[fallback_indices, 'category'] = fallback_categories
    
    # Fill remaining categories with the ones assigned by TF-IDF
    df['category'].fillna(df['category_tfidf'], inplace=True)
    
    return df


In [27]:

# Apply fallback categorization to both training and test sets
train_df = categorize_with_fallback(train_df)
#test_df = categorize_with_fallback(test_df)






In [32]:
train_df[['name', 'category']].sample(10)

Unnamed: 0,name,category
2748,Secure Parking Souk Madinat Jumeirah,Movie Theater
7337,Go Blue Tours,Restaurant
4734,Emarat- Ameen,Mosque
13933,SRE,Doctor
4557,City Hall,
13529,Players Cars,School
12114,Hor al anz 4,Mosque
11453,Goodhome Appliances,Moving Company
5802,BISH Super Market,Moving Company
1251,Kashaf Pharmacy LLC,Pharmacy
