In [5]:
import requests
from pprint import pprint
import pandas as pd
from timezonefinder import TimezoneFinder
from bs4 import BeautifulSoup
# Replace these with your own API keys
GEONAMES_USERNAME = "phuree"

def get_gdp_per_capita(country_code):
    """Fetch GDP per capita for a country from World Bank."""
    url = f"http://api.worldbank.org/v2/country/{country_code}/indicator/NY.GDP.PCAP.CD?format=json"
    response = requests.get(url)
    if response.ok and len(response.json()) > 1:
        data = response.json()[1]
        return data[0]['value'] if data else None
    return None

def get_time_zone(lat, lng):
    try:
        tf = TimezoneFinder()
        return tf.timezone_at(lng=lng, lat=lat)
    except Exception as e:
        print(f"Error fetching time zone data: {e}")

def get_population_and_country_code(city, country):
    """Fetch latitude and longitude of a city using GeoNames."""
    url = f"http://api.geonames.org/searchJSON?name={city}&maxRows=1&username={GEONAMES_USERNAME}"
    response = requests.get(url)
    if response.ok:
        data = response.json()
        # print(data)
        if data['geonames']:
            return data['geonames'][0]['population'],data['geonames'][0]['countryCode']
    return None,None

def get_safety_index(city):
    """Scrape safety index from Numbeo website."""
    city_query = city.replace(" ", "-")
    url = f"https://www.numbeo.com/crime/in/{city_query}"
    response = requests.get(url)

    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the safety index value in the page
        table = soup.find("table", {"class": "table_indices"})
        if table:
            safety_index = table.find_all("tr")[2].find_all("td")[1].text.strip()
            return safety_index
    return None

# df = pd.read_csv('../CSVs/city.csv')
# cnt = 0
# for index, row in df.iterrows():
#     if cnt == 6: break
#     cnt+=1
#     city = row['city']
#     country = row['country']
#     city_id = row['city_id']
#     lat = row['lat']
#     lng = row['lon']
#     population = get_population(city, country)
#     gdp_per_capita = get_gdp_per_capita(country)
#     time_zone = get_time_zone(lat, lng) if lat and lng else None
#     safety_index = get_safety_index(city)

#     print(f"City: {city}")
#     print(f"Country: {country}")
#     print(f"Population: {population}")
#     print(f"GDP per Capita: {gdp_per_capita}")
#     print(f"Latitude: {lat}")
#     print(f"Longitude: {lng}")
#     print(f"Time Zone: {time_zone}")
#     print(f"Safety Index: {safety_index}")
#     print()



In [6]:
df = pd.read_csv('../CSVs/city.csv')

def add_new_cols(row):
    city = row['city']
    country = row['country']
    city_id = row['city_id']
    lat = row['lat']
    lng = row['lon']
    population,country_code = get_population_and_country_code(city, country)
    gdp_per_capita = get_gdp_per_capita(country_code)
    time_zone = get_time_zone(lat, lng) if lat and lng else None
    safety_index = get_safety_index(city)
    print(f"City: {city} - Population: {population} - GDP per Capita: {gdp_per_capita} - Time Zone: {time_zone} - Safety Index: {safety_index}")
    return pd.Series([population, gdp_per_capita, time_zone, safety_index])

df[['population', 'gdp_per_capita', 'time_zone', 'safety_index']] = df.apply(add_new_cols, axis=1)

df.to_csv('../CSVs/city_with_new_cols.csv', index=False)


City: Bangkok - Population: 5104476 - GDP per Capita: 7171.80809318978 - Time Zone: Asia/Bangkok - Safety Index: 60.46
City: Nakhon Pathom - Population: 117927 - GDP per Capita: 7171.80809318978 - Time Zone: Asia/Bangkok - Safety Index: None
City: Suita - Population: 351630 - GDP per Capita: 33834.3921056453 - Time Zone: Asia/Tokyo - Safety Index: None
City: Rochester - Population: 30038 - GDP per Capita: 81695.1870713305 - Time Zone: America/New_York - Safety Index: 35.62
City: Riverside - Population: 4115871 - GDP per Capita: 81695.1870713305 - Time Zone: America/Los_Angeles - Safety Index: 59.76
City: San Diego - Population: 1394928 - GDP per Capita: 81695.1870713305 - Time Zone: America/Los_Angeles - Safety Index: 60.27
City: University Park - Population: 26995 - GDP per Capita: 81695.1870713305 - Time Zone: America/Chicago - Safety Index: None
City: Trondheim - Population: 212660 - GDP per Capita: 87961.7806135072 - Time Zone: Europe/Oslo - Safety Index: 79.64
City: Prague - Popul

Exception ignored in: <function AbstractTimezoneFinder.__del__ at 0x000001E8B3BC7C40>
Traceback (most recent call last):
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python312\Lib\site-packages\timezonefinder\timezonefinder.py", line 108, in __del__
    getattr(self, attribute_name).close()
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'TimezoneFinder' object has no attribute 'poly_zone_ids'


KeyError: 'countryCode'

In [None]:
# For recovering all data that we print before error occured, the format is like this
# print(f"City: {city} - Population: {population} - GDP per Capita: {gdp_per_capita} - Time Zone: {time_zone} - Safety Index: {safety_index}")
import csv

input_file = '../CSVs/x.csv'
output_file = '../CSVs/x_converted.csv'

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
  reader = infile.readlines()
  writer = csv.writer(outfile)
  
  # Write the header
  writer.writerow(['city','population', 'gdp_per_capita', 'time_zone', 'safety_index'])
  
  for line in reader:
    parts = line.strip().split(' - ')
    city = parts[0].split(': ')[1]
    population = parts[1].split(': ')[1]
    gdp_per_capita = parts[2].split(': ')[1]
    time_zone = parts[3].split(': ')[1]
    safety_index = parts[4].split(': ')[1]
    
    writer.writerow([city, population, gdp_per_capita, time_zone, safety_index])


In [None]:
# For merging the data that we recover before error occured
import pandas as pd

# Read the CSV files
city_df = pd.read_csv('../CSVs/city.csv')
x_converted_df = pd.read_csv('../CSVs/x_converted.csv')

# Merge the dataframes on the 'city' column, keeping only the necessary columns from x_converted_df
merged_df = city_df.merge(x_converted_df[['city', 'population', 'gdp_per_capita', 'time_zone', 'safety_index']], on='city', how='left')

# Save the merged dataframe to a new CSV file
merged_df.to_csv('../CSVs/city_converted_2.csv', index=False)


In [None]:
import requests
import pandas as pd
from timezonefinder import TimezoneFinder
from bs4 import BeautifulSoup
# Replace these with your own API keys
GEONAMES_USERNAME = "phuree"

def get_gdp_per_capita(country_code):
    """Fetch GDP per capita for a country from World Bank."""
    url = f"http://api.worldbank.org/v2/country/{country_code}/indicator/NY.GDP.PCAP.CD?format=json"
    response = requests.get(url)
    if response.ok and len(response.json()) > 1:
        data = response.json()[1]
        return data[0]['value'] if data else None
    return None

def get_time_zone(lat, lng):
    try:
        tf = TimezoneFinder()
        return tf.timezone_at(lng=lng, lat=lat)
    except Exception as e:
        print(f"Error fetching time zone data: {e}")

def get_population_and_country_code(city, country):
    """Fetch latitude and longitude of a city using GeoNames."""
    url = f"http://api.geonames.org/searchJSON?name={city}&maxRows=1&username={GEONAMES_USERNAME}"
    response = requests.get(url)
    if response.ok:
        data = response.json()
        # print(data)
        if data.get('geonames',None):
            return data['geonames'][0].get('population'), data['geonames'][0].get('countryCode', None)
    return None,None

# Dictionary to cache safety index for each country
country_cache = {}

def get_safety_index(country):
    """Scrape safety index from Numbeo website."""
    if country in country_cache:
        return country_cache[country]
    
    country_query = country.replace(" ", "-")
    url = f"https://www.numbeo.com/crime/in/{country_query}"
    response = requests.get(url)

    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the safety index value in the page
        table = soup.find("table", {"class": "table_indices"})
        if table:
            safety_index = table.find_all("tr")[2].find_all("td")[1].text.strip()
            country_cache[country] = safety_index
            return safety_index
    return None

df = pd.read_csv('../CSVs/city_converted.csv')

def add_new_cols(row):
    time_zone = row['time_zone']
    if time_zone=='nan':
        return pd.Series([row['population'], row['gdp_per_capita'], time_zone, row['safety_index']])
    city = row['city']
    country = row['country']
    city_id = row['city_id']
    lat = row['lat']
    lng = row['lon']
    population,country_code = get_population_and_country_code(city, country)
    gdp_per_capita = get_gdp_per_capita(country_code)
    time_zone = get_time_zone(lat, lng) if lat and lng else None
    safety_index = get_safety_index(country)
    #! Important for repair data so we can save the data that we already have before error occured 
    print(f"City: {city} - Population: {population} - GDP per Capita: {gdp_per_capita} - Time Zone: {time_zone} - Safety Index: {safety_index}")
    return pd.Series([population, gdp_per_capita, time_zone, safety_index])

df[['population', 'gdp_per_capita', 'time_zone', 'safety_index']] = df.apply(add_new_cols, axis=1)

df.to_csv('../CSVs/city_with_new_cols.csv', index=False)



In [16]:
df = pd.read_csv('../CSVs/last_city.csv')

# Dictionary to cache primary language for each country
language_cache = {}

def add_primary_language(row):
    country = row['country']
    if country in language_cache:
        primary_language = language_cache[country]
    else:
        url = f"https://restcountries.com/v3.1/name/{country}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if data:
                languages = data[0].get("languages", {})
                primary_language = list(languages.values())[0] if languages else None
                language_cache[country] = primary_language
                return primary_language
        primary_language = None

    print(f"{country}: Primary Language: {primary_language}")
    return primary_language

df['primary_language'] = df.apply(add_primary_language, axis=1)
df.to_csv('../CSVs/last_city_with_primary_language.csv', index=False)

Thailand: Primary Language: Thai
United States: Primary Language: English
United States: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
Thailand: Primary Language: Thai
China: Primary Language: Chinese
Japan: Primary Language: Japanese
Netherlands: Primary Language: Dutch
United Kingdom: Primary Language: English
Indonesia: Primary Language: Indonesian
Taiwan: Primary Language: Chinese
Australia: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
Thailand: Primary Language: Thai
Australia: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
United Kingdom: Primary Language: English
China: Primary Language: Chinese
United States: Primary Language: English
United States: Primary Language: English
United States: Primary Language: English
Thailand: Primary Language: Thai
China: Primary Language: Chinese
China: Primary Language: Chinese
China:

In [1]:
import pandas as pd
df = pd.read_csv('../CSVs/last_city_with_primary_language.csv')
print(df[df['population']==0])

      city_id               city           country  citation_sum  p_count  \
22         23    Central Jakarta         Indonesia          3228       79   
77         78       College Park     United States         19135      586   
90         91            Tubarao            Brazil           234        8   
100       101         Chaing Mai          Thailand          6031      704   
108       109         Strasbourg            France         18172      546   
...       ...                ...               ...           ...      ...   
3262     3263            Sunspot     United States            16        1   
3263     3264     Mount Hamilton     United States            16        1   
3283     3284  Noordwijk aan Zee       Netherlands           121        1   
3284     3285         Thanh Xuan          Viet Nam            30        1   
3289     3290            Waigani  Papua New Guinea             0        1   

            lat         lon  population  gdp_per_capita             time_zo

In [None]:
import requests

def get_population(country_name):
    url = f"https://restcountries.com/v3.1/name/{country_name}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data:
            population = data[0].get("population", "Unknown")
            return population
    return None

country = "Thailand"
print(f"The population of {country} is {get_population(country)}.")
