In [2]:
import requests
from pprint import pprint
import pandas as pd
from timezonefinder import TimezoneFinder
from bs4 import BeautifulSoup
# Replace these with your own API keys
GEONAMES_USERNAME = "phuree"

def get_gdp_per_capita(country_code):
    """Fetch GDP per capita for a country from World Bank."""
    url = f"http://api.worldbank.org/v2/country/{country_code}/indicator/NY.GDP.PCAP.CD?format=json"
    response = requests.get(url)
    if response.ok and len(response.json()) > 1:
        data = response.json()[1]
        return data[0]['value'] if data else None
    return None

def get_time_zone(lat, lng):
    try:
        tf = TimezoneFinder()
        return tf.timezone_at(lng=lng, lat=lat)
    except Exception as e:
        print(f"Error fetching time zone data: {e}")

def get_population_and_country_code(city, country):
    """Fetch latitude and longitude of a city using GeoNames."""
    url = f"http://api.geonames.org/searchJSON?name={city}&maxRows=1&username={GEONAMES_USERNAME}"
    response = requests.get(url)
    if response.ok:
        data = response.json()
        # print(data)
        if data['geonames']:
            return data['geonames'][0]['population'],data['geonames'][0]['countryCode']
    return None,None

def get_safety_index(city):
    """Scrape safety index from Numbeo website."""
    city_query = city.replace(" ", "-")
    url = f"https://www.numbeo.com/crime/in/{city_query}"
    response = requests.get(url)

    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the safety index value in the page
        table = soup.find("table", {"class": "table_indices"})
        if table:
            safety_index = table.find_all("tr")[2].find_all("td")[1].text.strip()
            return safety_index
    return None

# df = pd.read_csv('../CSVs/city.csv')
# cnt = 0
# for index, row in df.iterrows():
#     if cnt == 6: break
#     cnt+=1
#     city = row['city']
#     country = row['country']
#     city_id = row['city_id']
#     lat = row['lat']
#     lng = row['lon']
#     population = get_population(city, country)
#     gdp_per_capita = get_gdp_per_capita(country)
#     time_zone = get_time_zone(lat, lng) if lat and lng else None
#     safety_index = get_safety_index(city)

#     print(f"City: {city}")
#     print(f"Country: {country}")
#     print(f"Population: {population}")
#     print(f"GDP per Capita: {gdp_per_capita}")
#     print(f"Latitude: {lat}")
#     print(f"Longitude: {lng}")
#     print(f"Time Zone: {time_zone}")
#     print(f"Safety Index: {safety_index}")
#     print()



In [8]:
# Dictionary to cache safety index for each country
country_cache = {}

def get_safety_index(country):
    """Scrape safety index from Numbeo website."""
    if country in country_cache:
        return country_cache[country]
    
    url = f"https://www.numbeo.com/crime/country_result.jsp?country={country}"
    response = requests.get(url)

    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the safety index value in the page
        table = soup.find("table", {"class": "table_indices"})
        if table:
            safety_index = table.find_all("tr")[2].find_all("td")[1].text.strip()
            country_cache[country] = safety_index
            return safety_index
    return None

In [9]:
df = pd.read_csv('../CSVs/last_city_with_primary_language.csv')


def add_new_cols(row):
    country = row['country']
    safety_index = get_safety_index(country)
    print(f"Country: {country} - Safety Index: {safety_index}")
    return pd.Series([safety_index])

df[['safety_index']] = df.apply(add_new_cols, axis=1)

df.to_csv('../CSVs/city_complete', index=False)


Country: Thailand - Safety Index: 62.54
Country: Thailand - Safety Index: 62.54
Country: Japan - Safety Index: 77.10
Country: United States - Safety Index: 50.74
Country: United States - Safety Index: 50.74
Country: United States - Safety Index: 50.74
Country: United States - Safety Index: 50.74
Country: Norway - Safety Index: 67.01
Country: Czech Republic - Safety Index: 73.61
Country: Thailand - Safety Index: 62.54
Country: Thailand - Safety Index: 62.54
Country: United Kingdom - Safety Index: 51.83
Country: China - Safety Index: 75.91
Country: Netherlands - Safety Index: 73.33
Country: Taiwan - Safety Index: 82.92
Country: Australia - Safety Index: 52.73
Country: Indonesia - Safety Index: 54.04
Country: New Zealand - Safety Index: 51.86
Country: China - Safety Index: 75.91
Country: Japan - Safety Index: 77.10
Country: Netherlands - Safety Index: 73.33
Country: United Kingdom - Safety Index: 51.83
Country: Indonesia - Safety Index: 54.04
Country: Taiwan - Safety Index: 82.92
Country:

In [None]:
# For recovering all data that we print before error occured, the format is like this
# print(f"City: {city} - Population: {population} - GDP per Capita: {gdp_per_capita} - Time Zone: {time_zone} - Safety Index: {safety_index}")
import csv

input_file = '../CSVs/x.csv'
output_file = '../CSVs/x_converted.csv'

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
  reader = infile.readlines()
  writer = csv.writer(outfile)
  
  # Write the header
  writer.writerow(['city','population', 'gdp_per_capita', 'time_zone', 'safety_index'])
  
  for line in reader:
    parts = line.strip().split(' - ')
    city = parts[0].split(': ')[1]
    population = parts[1].split(': ')[1]
    gdp_per_capita = parts[2].split(': ')[1]
    time_zone = parts[3].split(': ')[1]
    safety_index = parts[4].split(': ')[1]
    
    writer.writerow([city, population, gdp_per_capita, time_zone, safety_index])


In [None]:
# For merging the data that we recover before error occured
import pandas as pd

# Read the CSV files
city_df = pd.read_csv('../CSVs/city.csv')
x_converted_df = pd.read_csv('../CSVs/x_converted.csv')

# Merge the dataframes on the 'city' column, keeping only the necessary columns from x_converted_df
merged_df = city_df.merge(x_converted_df[['city', 'population', 'gdp_per_capita', 'time_zone', 'safety_index']], on='city', how='left')

# Save the merged dataframe to a new CSV file
merged_df.to_csv('../CSVs/city_converted_2.csv', index=False)


In [None]:
import requests
import pandas as pd
from timezonefinder import TimezoneFinder
from bs4 import BeautifulSoup
# Replace these with your own API keys
GEONAMES_USERNAME = "phuree"

def get_gdp_per_capita(country_code):
    """Fetch GDP per capita for a country from World Bank."""
    url = f"http://api.worldbank.org/v2/country/{country_code}/indicator/NY.GDP.PCAP.CD?format=json"
    response = requests.get(url)
    if response.ok and len(response.json()) > 1:
        data = response.json()[1]
        return data[0]['value'] if data else None
    return None

def get_time_zone(lat, lng):
    try:
        tf = TimezoneFinder()
        return tf.timezone_at(lng=lng, lat=lat)
    except Exception as e:
        print(f"Error fetching time zone data: {e}")

def get_population_and_country_code(city, country):
    """Fetch latitude and longitude of a city using GeoNames."""
    url = f"http://api.geonames.org/searchJSON?name={city}&maxRows=1&username={GEONAMES_USERNAME}"
    response = requests.get(url)
    if response.ok:
        data = response.json()
        # print(data)
        if data.get('geonames',None):
            return data['geonames'][0].get('population'), data['geonames'][0].get('countryCode', None)
    return None,None

# Dictionary to cache safety index for each country
country_cache = {}

def get_safety_index(country):
    """Scrape safety index from Numbeo website."""
    if country in country_cache:
        return country_cache[country]
    
    country_query = country.replace(" ", "-")
    url = f"https://www.numbeo.com/crime/in/{country_query}"
    response = requests.get(url)

    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the safety index value in the page
        table = soup.find("table", {"class": "table_indices"})
        if table:
            safety_index = table.find_all("tr")[2].find_all("td")[1].text.strip()
            country_cache[country] = safety_index
            return safety_index
    return None

df = pd.read_csv('../CSVs/city_converted.csv')

def add_new_cols(row):
    time_zone = row['time_zone']
    if time_zone=='nan':
        return pd.Series([row['population'], row['gdp_per_capita'], time_zone, row['safety_index']])
    city = row['city']
    country = row['country']
    city_id = row['city_id']
    lat = row['lat']
    lng = row['lon']
    population,country_code = get_population_and_country_code(city, country)
    gdp_per_capita = get_gdp_per_capita(country_code)
    time_zone = get_time_zone(lat, lng) if lat and lng else None
    safety_index = get_safety_index(country)
    #! Important for repair data so we can save the data that we already have before error occured 
    print(f"City: {city} - Population: {population} - GDP per Capita: {gdp_per_capita} - Time Zone: {time_zone} - Safety Index: {safety_index}")
    return pd.Series([population, gdp_per_capita, time_zone, safety_index])

df[['population', 'gdp_per_capita', 'time_zone', 'safety_index']] = df.apply(add_new_cols, axis=1)

df.to_csv('../CSVs/city_with_new_cols.csv', index=False)



In [16]:
df = pd.read_csv('../CSVs/last_city.csv')

# Dictionary to cache primary language for each country
language_cache = {}

def add_primary_language(row):
    country = row['country']
    if country in language_cache:
        primary_language = language_cache[country]
    else:
        url = f"https://restcountries.com/v3.1/name/{country}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if data:
                languages = data[0].get("languages", {})
                primary_language = list(languages.values())[0] if languages else None
                language_cache[country] = primary_language
                return primary_language
        primary_language = None

    print(f"{country}: Primary Language: {primary_language}")
    return primary_language

df['primary_language'] = df.apply(add_primary_language, axis=1)
df.to_csv('../CSVs/last_city_with_primary_language.csv', index=False)

Thailand: Primary Language: Thai
United States: Primary Language: English
United States: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
Thailand: Primary Language: Thai
China: Primary Language: Chinese
Japan: Primary Language: Japanese
Netherlands: Primary Language: Dutch
United Kingdom: Primary Language: English
Indonesia: Primary Language: Indonesian
Taiwan: Primary Language: Chinese
Australia: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
Thailand: Primary Language: Thai
Australia: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
United Kingdom: Primary Language: English
China: Primary Language: Chinese
United States: Primary Language: English
United States: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
China: Primary Language: Chinese
China: Primary Language: Chinese
China:

In [None]:
import requests
import pandas as pd
import pycountry
from bs4 import BeautifulSoup

# Dictionary to cache primary language for each country
country_code_cache = {}

def get_country_code(country_name):
    try:
        return pycountry.countries.lookup(country_name).alpha_2
    except LookupError:
        return None
    
def get_population(row):
    """Fetch latitude and longitude of a city using GeoNames."""
    city = row['city']
    country = row['country']
    country_code = None
    if country in country_code_cache:
        country_code = country_code_cache[country]
    else:
        country_code = get_country_code(country)
        country_code_cache[country] = country_code
    # print(country_code)
    if not country_code: return None
    url = f"http://api.geonames.org/searchJSON?name={city}&country={country_code}&maxRows=1&username=phuree"
    response = requests.get(url)
    if response.ok:
        data = response.json()
        if data.get('geonames',None):
            print(f"{city} - {country_code} - Population: {data['geonames'][0].get('population',None)}")
            return data['geonames'][0].get('population',None)
    return None

# Update only rows where population is 0 or NaN
# df.loc[df['population'].isna() | (df['population'] == 0), 'population'] = df.apply(
#     lambda row: get_population(row) if pd.isna(row['population']) or row['population'] == 0 else row['population'], axis=1
# )

# # df.to_csv('../archive/city_impute_pop.csv', index=False)
# df2 = pd.read_csv('../archive/city_impute_pop.csv')

# # Impute mean population for each country
# df2['population'] = df2.groupby('country')['population'].transform(lambda x: x.replace(0, x.mean()).fillna(x.mean(numeric_only=True)))

# # Check if there are still any rows with population 0
# df2.to_csv('../archive/city_impute_pop_and_mean.csv', index=False)

# df = pd.read_csv('../archive/city_impute_pop_real.csv')
# print(df[df['population'].isna() | (df['population'] == 0)].shape[0])

def get_population_from_web(city:str, country:str):
    """Scrape population from a website."""
    print(f"Fetching population for {city}, {country}")

    url = f"https://www.worldpopulationreview.com/cities/{country.lower()}"
    response = requests.get(url)
    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')

        table = soup.find("tbody")
        city_query = ' '.join([x.capitalize() for x in city.split(' ')])
        country_tag = table.find("th", string=city_query) 
        if not country_tag:
            return None
        population_tag = country_tag.next_sibling
        if population_tag:
            population = population_tag.text.strip().replace(",", "")
            print(f"City:{city} Population: {population}")
            return int(population)
    return None

# Update only rows where population is 0 or NaN using web scraping
# df.loc[df['population'].isna() | (df['population'] == 0), 'population'] = df.apply(
#     lambda row: get_population_from_web(row['city'], row['country']) if pd.isna(row['population']) or row['population'] == 0 else row['population'], axis=1
# )

# df.to_csv('../archive/city_impute_pop_real.csv', index=False)

339
