In [1]:
import re

import pandas as pd

import requests
from bs4 import BeautifulSoup

from tqdm.auto import tqdm
from retrying import retry

### DATA SOURCE: https://www.ktb.gov.tr/EN-249300/monthly-bulletins.html

In [2]:
# Yearly bulletin index pages on ktb.gov.tr, one per year, in chronological order.
# Each page is addressed by a numeric page id plus the year it covers.
urls = [
    f"https://www.ktb.gov.tr/EN-{page_id}/{year}.html"
    for year, page_id in {
        2019: 249306,
        2020: 256541,
        2021: 283399,
        2022: 310546,
        2023: 338400,
    }.items()
]

In [3]:
def get_url_text_dict(elem):
    """Map a link's absolute URL to the label that follows "citizens" in its text.

    Parameters
    ----------
    elem : object
        Anchor-like element supporting ``elem["href"]`` and ``elem.text``
        (e.g. a BeautifulSoup ``<a>`` tag). The href is appended verbatim to
        the site root, so it is expected to start with "/".

    Returns
    -------
    dict
        Single-entry dict ``{absolute_url: label}`` where ``label`` is whatever
        follows "citizens" (plus optional commas/spaces) up to the end of the
        stripped link text, e.g. "January 2019".

    Raises
    ------
    ValueError
        If the link text does not contain "citizens" followed by a label.
        (Previously this surfaced as an opaque AttributeError on ``None``.)
    """
    url = "https://www.ktb.gov.tr" + elem["href"]
    link_text = elem.text.strip()

    # Case-insensitive because the page is not consistent about capitalisation.
    match = re.search(r"citizens,* *(.+)$", link_text, re.IGNORECASE)
    if match is None:
        raise ValueError(
            f"Cannot extract a period label from link text {link_text!r} ({url})"
        )

    return {url: match.group(1)}

In [4]:
def parse_filenames(url, timeout=30):
    """Collect the file links listed on one yearly bulletin index page.

    Parameters
    ----------
    url : str
        Address of the index page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    list of dict
        One ``{file_url: label}`` dict per link, in page order, as produced by
        ``get_url_text_dict``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page contains no ``div.col-md-11`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an error page instead of later with a confusing IndexError.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # The first div.col-md-11 is taken as the container of the file links.
    all_files = soup.select("div.col-md-11")[0].find_all("a")

    # Skip anchors without visible text (including whitespace-only ones, which
    # get_url_text_dict could never parse because it strips the text first).
    month_files = [
        get_url_text_dict(elem=file) for file in all_files if file.text.strip() != ""
    ]

    return month_files

In [5]:
# Lower-cased English month name -> month number; used to decode labels such as
# "January 2019". Defined once instead of being rebuilt on every call/retry.
_MONTH_NUMBERS = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7,
    'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
}


@retry(stop_max_attempt_number=3, wait_fixed=2000)
def get_data(key, val):
    """Read one monthly workbook and return its İstanbul row(s).

    Parameters
    ----------
    key : str
        Location of the Excel file, handed to ``pd.read_excel``.
    val : str
        Label of the form "<Month> <Year>" (e.g. "January 2019"); the month
        name is matched case-insensitively.

    Returns
    -------
    pandas.DataFrame or None
        Rows of the 'Provinces & Transport' sheet whose PROVINCES value is
        'İstanbul', with integer ``month`` and ``year`` columns added.
        ``None`` (after printing a notice) when the sheet has no such row.

    Notes
    -----
    Any exception makes the ``retry`` decorator call the function again
    (configured for 3 attempts, 2000 ms apart), so the progress line may be
    printed more than once for the same file.
    """
    print("Extract data from", val)

    raw_df = pd.read_excel(key, sheet_name='Provinces & Transport', skiprows=2)

    # Build the row mask once and reuse it for both the check and the filter.
    is_istanbul = raw_df["PROVINCES"].eq('İstanbul')
    if not is_istanbul.any():
        print("Issue with", val)
        return None

    label_parts = val.split()
    df = raw_df[is_istanbul].reset_index(drop=True)
    df["month"] = _MONTH_NUMBERS[label_parts[0].lower()]
    # Stored as int (not the raw string) so it matches `month` here and the
    # integer `year` column written by wrangle_data.
    df["year"] = int(label_parts[1])

    return df

In [6]:
def merge_files(filenames):
    """Load every file listed in ``filenames`` and stack the results.

    ``filenames`` is an iterable of ``{file_url: label}`` dicts; each pair is
    passed to ``get_data`` while a tqdm bar tracks progress over the dicts.
    The collected results are concatenated with a fresh 0..n-1 index.
    ``get_data`` may return None for a file; such entries are handed to
    ``pd.concat`` unchanged (pandas documents that None entries are dropped
    silently unless all of them are None).
    """
    frames = []
    for file in tqdm(filenames):
        for k, v in file.items():
            frames.append(get_data(key=k, val=v))

    return pd.concat(frames, ignore_index=True)

# Extracting Tourist Arrival Data

In [7]:
# One merged DataFrame per yearly index page, in the same order as `urls`.
tourist_data = []

for url in urls:
    # Scrape the page's file links, then download and stack the listed files.
    tourist_data.append(merge_files(filenames=parse_filenames(url=url)))


  0%|          | 0/12 [00:00<?, ?it/s]

Extract data from January 2019
Extract data from February 2019
Extract data from March 2019
Extract data from April 2019
Extract data from May 2019
Extract data from June 2019
Extract data from July 2019
Extract data from July 2019
Extract data from August 2019
Extract data from September 2019
Extract data from October 2019
Extract data from November 2019
Extract data from December 2019


  0%|          | 0/12 [00:00<?, ?it/s]

Extract data from January 2020
Extract data from February 2020
Extract data from March 2020
Extract data from April 2020
Extract data from May 2020
Extract data from June 2020
Extract data from july 2020
Extract data from August 2020
Extract data from September 2020
Extract data from October 2020
Extract data from November 2020
Extract data from December 2020


  0%|          | 0/12 [00:00<?, ?it/s]

Extract data from January 2021
Extract data from February 2021
Extract data from March 2021
Extract data from April 2021
Extract data from May 2021
Extract data from June 2021
Extract data from July 2021
Extract data from August 2021
Extract data from September 2021
Extract data from October 2021
Extract data from November 2021
Extract data from November 2021
Extract data from December 2021


  0%|          | 0/12 [00:00<?, ?it/s]

Extract data from January 2022
Extract data from February 2022
Extract data from March 2022
Extract data from April 2022
Extract data from May 2022
Extract data from June 2022
Extract data from July 2022
Extract data from August 2022
Extract data from August 2022
Extract data from September 2022
Extract data from October 2022
Extract data from November 2022
Extract data from December 2022


  0%|          | 0/4 [00:00<?, ?it/s]

Extract data from January 2023
Extract data from February 2023
Extract data from March 2023
Extract data from April 2023


In [10]:
# Stack the per-year frames under a fresh 0..n-1 index, then save the result
# as CSV without writing that index as a column.
df = pd.concat(objs=tourist_data, ignore_index=True)
df.to_csv(path_or_buf="tourist_monthly_arrivals.csv", index=False)

# Citizen Travel Data

In [18]:
@retry(stop_max_attempt_number=3, wait_fixed=2000)
def wrangle_data(key, val, sheet_name):
    """Reshape the İstanbul rows of one citizen-travel sheet into long format.

    Parameters
    ----------
    key : str
        Location of the Excel file, handed to ``pd.read_excel``.
    val : int or str
        Year the file refers to; stored as ``int(val)``.
    sheet_name : str
        Sheet to read; also recorded in the ``citizen_travel_type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``BORDER GATES``, ``month`` (lower-cased former column
        header), ``number_of_people``, ``year`` and ``citizen_travel_type``;
        rows whose border gate is "TOTAL" are excluded.

    Any exception triggers the ``retry`` decorator (3 attempts, 2000 ms apart).
    """
    # Load the sheet and give the first two header-less columns real names.
    sheet = pd.read_excel(key, sheet_name=sheet_name, skiprows=2).rename(
        columns={"Unnamed: 0" : "PROVINCES", "Unnamed: 1" : "BORDER GATES"}
    )

    # The province appears only on some rows; carry it down before filtering.
    sheet["PROVINCES"] = sheet["PROVINCES"].ffill()
    istanbul = sheet[sheet["PROVINCES"].eq("İstanbul")]

    # Two column layouts are handled: the trailing unnamed column is either
    # "Unnamed: 15" or "Unnamed: 7".
    # NOTE(review): presumably a 12-month sheet vs. a partial-year sheet — confirm.
    try:
        gates = istanbul.drop(["PROVINCES", "Unnamed: 2", "Unnamed: 15"], axis=1)
    except KeyError:
        gates = istanbul.drop(["PROVINCES", "Unnamed: 2", "Unnamed: 7"], axis=1)

    # Drop the per-province total row so only individual gates remain.
    gates = gates[gates["BORDER GATES"].ne("TOTAL")]

    # Wide (one column per month) -> long (one row per gate and month).
    long_df = gates.melt(
        id_vars="BORDER GATES",
        var_name="month",
        value_name="number_of_people",
    )
    long_df["month"] = long_df["month"].str.lower()
    long_df["year"] = int(val)
    long_df["citizen_travel_type"] = sheet_name

    return long_df

In [21]:
# One long-format DataFrame per (year, sheet) combination.
citizen_data = []

for url in tqdm(urls):
    filenames = parse_filenames(url=url)

    # Only the last file listed on each yearly page is read.
    # NOTE(review): presumably the latest bulletin, whose sheets hold every
    # month of that year so far — confirm against the workbooks.
    for k, v in filenames[-1].items():
        year = int(v.split()[1])  # label looks like "April 2023"
        for name in ["Returning", "Travelling Abroad"]:
            # Append directly rather than binding the result to `df`, which
            # would overwrite the tourist-arrivals DataFrame of the same name.
            citizen_data.append(wrangle_data(key=k, val=year, sheet_name=name))

  0%|          | 0/5 [00:00<?, ?it/s]

In [23]:
# Stack every (year, sheet) frame into one table with a fresh 0..n-1 index.
citizen_travel_df = pd.concat(objs=citizen_data, ignore_index=True)

In [25]:
# Save the combined citizen-travel table as CSV, leaving out the row index.
citizen_travel_df.to_csv(path_or_buf="citizen_travel_df.csv", index=False)