In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("startup_cities.csv")

# Select relevant fields/attributes: the three score columns are the feature
# vector used to compare cities.
columns_to_use = ["quantity score", "quality score", "business score"]

# Data Cleaning: drop rows missing the city name or any score, because NaN
# values cannot be scaled or compared.
df = df.dropna(subset=["city"] + columns_to_use)

# Explicit copy so later column assignment never touches (or warns about) `df`.
df_selected = df[["city", "country"] + columns_to_use].copy()

# Scale the scores using StandardScaler (zero mean, unit variance per column).
scaler = StandardScaler()
# Plain column assignment replaces the columns outright. Writing through
# `.loc[:, cols]` sets values in place, which triggers pandas' incompatible-dtype
# warning when a score column was parsed as integer.
df_selected[columns_to_use] = scaler.fit_transform(df_selected[columns_to_use])

# Compute similarity matrix: pairwise cosine similarity between the scaled
# score vectors of every pair of rows.
similarity_matrix = cosine_similarity(df_selected[columns_to_use])

# Convert to DataFrame for better readability: rows and columns are both
# labelled by city name so a city's similarities can be looked up by label.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=df_selected["city"],
    columns=df_selected["city"],
)

# Define query cities
query_cities = ['San Francisco', 'Tokyo', 'Seoul']

# Build the city -> country lookup once instead of filtering the whole frame
# for every printed row. Keeping the first occurrence of each city matches a
# "first matching row" lookup.
country_by_city = (
    df_selected.drop_duplicates(subset="city").set_index("city")["country"]
)

# Find top 10 most similar cities for each query
for city in query_cities:
    # A query city absent from the data would otherwise raise KeyError and
    # abort the remaining queries.
    if city not in similarity_df.columns:
        print(f"\n{city} not found in dataset; skipping.")
        continue

    print(f"\nTop 10 similar cities to {city}:")

    # Remove the query city by label rather than slicing off position 0:
    # another city can tie at similarity 1.0 and sort ahead of it, in which
    # case a positional slice drops the wrong row and lists the query city
    # as its own neighbour.
    similar_cities = (
        similarity_df[city]
        .drop(labels=city)
        .sort_values(ascending=False)
        .head(10)
    )

    for similar_city_name, similarity_score in similar_cities.items():
        country = country_by_city[similar_city_name]
        print(f"{similar_city_name:<25} {country:<20} {similarity_score:.4f}")



Top 10 similar cities to San Francisco:
Beijing                    China               0.9873
New York                   United States       0.9870
Shanghai                   China               0.9691
Boston Area                United States       0.9218
Shenzhen                   China               0.8843
Los Angeles Area           United States       0.8801
Jakarta                    Indonesia           0.8782
Bangalore                  India               0.8550
Tel Aviv Area              Israel              0.8415
Sao Paulo                  Brazil              0.8373

Top 10 similar cities to Tokyo:
Seattle                    United States       0.9935
Austin                     United States       0.9795
San Diego                  United States       0.9790
Seoul                      South Korea         0.9777
Berlin                     Germany             0.9734
Washington DC Area         United States       0.9703
Shenzhen                   China               0.9692
Amsterda