In [82]:
import requests, re, pandas as pd
from bs4 import BeautifulSoup

In [1]:
def get_statistical_themes():
    """
    Get Statistical Themes and IDs from TUIK
    
    This function retrieves the statistical themes and their corresponding IDs
    from the TUIK website. Each theme is associated with an ID, which can 
    be used to access specific datasets within that theme.
    
    :return: A list of tuples containing theme names and their corresponding IDs.
             Returns None if the request to the website fails.
    """
    url = "https://data.tuik.gov.tr/"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    themes = soup.find_all("div", class_="text-center")
    
    theme_names = []
    theme_ids = []
    
    for theme in themes:
        anchor = theme.find("a")
        if anchor:
            theme_name = anchor.text.strip()
            theme_id = anchor['href'].split('/')[-1]
            
            theme_names.append(theme_name)
            theme_ids.append(theme_id)

    return list(zip(theme_names, theme_ids))

# Test the function
themes = get_statistical_themes()
if themes:
    for theme_name, theme_id in themes:
        print(f"Theme Name: {theme_name}, Theme ID: {theme_id}")

Theme Name: Adalet ve Seçim, Theme ID: GetKategori?p=Adalet-ve-Secim-110
Theme Name: Adalet ve Seçim, Theme ID: GetKategori?p=Adalet-ve-Secim-110
Theme Name: Bilim, Teknoloji ve Bilgi Toplumu, Theme ID: GetKategori?p=Bilim,-Teknoloji-ve-Bilgi-Toplumu-102
Theme Name: Çevre ve Enerji, Theme ID: GetKategori?p=Cevre-ve-Enerji-103
Theme Name: Dış Ticaret, Theme ID: GetKategori?p=Dis-Ticaret-104
Theme Name: Eğitim, Kültür, Spor ve Turizm, Theme ID: GetKategori?p=Egitim,-Kultur,-Spor-ve-Turizm-105
Theme Name: Ekonomik Güven, Theme ID: GetKategori?p=Ekonomik-Guven-117
Theme Name: Enflasyon ve Fiyat, Theme ID: GetKategori?p=Enflasyon-ve-Fiyat-106
Theme Name: Gelir, Yaşam, Tüketim ve Yoksulluk, Theme ID: GetKategori?p=Gelir,-Yasam,-Tuketim-ve-Yoksulluk-107
Theme Name: İnşaat ve Konut, Theme ID: GetKategori?p=Insaat-ve-Konut-116
Theme Name: İstihdam, İşsizlik ve Ücret, Theme ID: GetKategori?p=Istihdam,-Issizlik-ve-Ucret-108
Theme Name: Nüfus ve Demografi, Theme ID: GetKategori?p=Nufus-ve-Demograf

1. We import the requests library to make a HTTP request to the TUIK website, and BeautifulSoup from the bs4 library to parse the HTML content.
2. We define a function get_statistical_themes() that retrieves and parses the HTML content from the TUIK website.
3. We find all div elements with the class text-center, which contain the statistical themes and their URLs.
4. We iterate through these elements, extract the theme names and IDs, and store them in separate lists.
5. We return a list of tuples containing the theme names and IDs.

In [85]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_statistical_tables(theme_id):
    """
    Get the list of all statistical tables for a given theme from TUIK
    
    :param theme_id: ID of the data theme
    :return: A pandas DataFrame containing table names, data dates, and data file URLs.
    """
    url = 'https://data.tuik.gov.tr/Kategori/GetIstatistikselTablolar'
    payload = f'UstId={theme_id}&DilId=2&Page=1&Count=10000&Arsiv=false'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}

    response = requests.post(url, data=payload, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    return soup
    
    # Extract table data
    tables = soup.find_all("table")
    if not tables:
        print("No tables found.")
        return None
    
    # Convert the first found table to a pandas DataFrame
    df = pd.read_html(str(tables[0]))[0]
    
    # Clean the DataFrame
    df = df.drop(columns=[2, 3]).dropna().reset_index(drop=True)
    df.columns = ['data_name', 'data_date']
    
    # Extract URLs
    anchors = soup.find_all("a", href=True)
    urls = [a['href'] for a in anchors if 'Bulten' not in a['href']]
    urls = [f"http://data.tuik.gov.tr{url}" for url in urls]
    
    # Add URLs to the DataFrame
    df['datafile_url'] = urls
    
    return df

In [100]:
# Test the function with a specific theme ID
soup = get_statistical_tables(108)
tables = soup.find_all("table")
if tables is not None:
    print(tables)

[<table aria-describedby="istatistikselTable_info" class="display table-hover dataTable no-footer" id="istatistikselTable" role="grid" style="width: 100%; font-size: 1.3rem;">
<thead class="d-none">
<tr>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</thead>
<tbody>
<tr class="dtrg-group dtrg-start dtrg-level-0">
<td>Monthly Results</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="dtrg-group dtrg-start dtrg-level-1">
<td>Labor Force Status of the Population</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="odd" role="row" style="font-size: 1.2rem">
<td width="80%">
<svg class="tuik-icon mr-3 tuik-if-green" style="width: 18px; height: 18px">
<title>İstatistiksel Tablolar</title>
<use xlink:href="/img/SVG/symbol-defs.svg#tuik-icon-download_table"></use>
</svg>
<font color="red">Yeni</font>
                                Seasonally adjusted main labour force indicators (15+ age)
                            </td>
<td width="14%">
                                10 Ekim 2023
         

Parse table into dataframe

In [98]:
# Extract the first table into a list of DataFrame
df_tables = pd.read_html(str(tables[0]))

In [114]:
if tables:
    # Select the first table
    df = df_tables[0]

    # reset column names to index numbers
    df.columns = range(df.shape[1])
    
    # Select only the first two columns and remove rows where the second column is empty
    df = df.iloc[:, [0, 1]].dropna(subset=[1]).reset_index(drop=True)
    
    # Clean the first column by removing specific patterns of strings
    df[0] = df[0].apply(lambda x: re.sub(r'İstatistiksel Tablolar(Yeni)?\s*', '', x))

    # Remove `Yeni` from the first column
    df[0] = df[0].apply(lambda x: re.sub(r'Yeni\s*', '', x))
    
    print(df)

                                                     0               1
0    Seasonally adjusted main labour force indicato...    10 Ekim 2023
1    Seasonally adjusted main labour force indicato...    10 Ekim 2023
2               Main labour force indicators (15+ age)    10 Ekim 2023
3             Main labour force indicators (15-24 age)    10 Ekim 2023
4    Seasonally adjusted supplementary indicators f...    10 Ekim 2023
..                                                 ...             ...
219  Monthly average social security payments and c...  29 Aralık 2009
220  Monthly average social security payments and c...  29 Aralık 2009
221  Actual weekly working hours and monthly averag...  29 Aralık 2009
222  Monthly average labour cost and components by ...  29 Aralık 2009
223  Monthly average social security paymants and c...  29 Aralık 2009

[224 rows x 2 columns]


In [113]:
# Using BeautifulSoup to parse the HTML and extract the needed data
rows = soup.find_all('tr')

# Set base_url for the datafile URLs
base_url = "http://data.tuik.gov.tr"

data = []
for row in rows:
    # Extracting the data name
    data_name = row.find('svg').find_next_sibling(string=True).strip() if row.find('svg') else None
    
    # Extracting the data date
    data_date = row.find('td', width="14%").text.strip() if row.find('td', width="14%") else None
    
    # Extracting the first URL (Excel file download link)
    excel_link = row.find('img', src="/img/SVG/excel.svg")
    datafile_url = base_url + excel_link.find_parent('a')['href'] if excel_link else None
    
    if data_name and data_date and datafile_url:  # Ensuring all extracted data is not None before appending
        data.append([data_name, data_date, datafile_url])

# Creating a DataFrame from the extracted data
df = pd.DataFrame(data, columns=['data_name', 'data_date', 'datafile_url'])

# Displaying the DataFrame
df.head()

Unnamed: 0,data_name,data_date,datafile_url
0,Labour Cost Indices (2015=100),22 Ağustos 2023,http://data.tuik.gov.tr/Bulten/DownloadIstatis...
1,Labour Input Indices (2015=100),22 Ağustos 2023,http://data.tuik.gov.tr/Bulten/DownloadIstatis...
2,Paid Employee Statistics (Trade-Services),15 Ağustos 2023,http://data.tuik.gov.tr/Bulten/DownloadIstatis...
3,Paid Employee Statistics,15 Ağustos 2023,http://data.tuik.gov.tr/Bulten/DownloadIstatis...
4,Paid Employee Statistics (Industry),15 Ağustos 2023,http://data.tuik.gov.tr/Bulten/DownloadIstatis...


In [111]:
line = 5 

for row in rows:
    line -= 1
    if line < 0:
        break
    print(row, '\n')

<tr>
<td></td>
<td></td>
<td></td>
<td></td>
</tr> 

<tr class="dtrg-group dtrg-start dtrg-level-0">
<td>Monthly Results</td>
<td></td>
<td></td>
<td></td>
</tr> 

<tr class="dtrg-group dtrg-start dtrg-level-1">
<td>Labor Force Status of the Population</td>
<td></td>
<td></td>
<td></td>
</tr> 

<tr class="odd" role="row" style="font-size: 1.2rem">
<td width="80%">
<svg class="tuik-icon mr-3 tuik-if-green" style="width: 18px; height: 18px">
<title>İstatistiksel Tablolar</title>
<use xlink:href="/img/SVG/symbol-defs.svg#tuik-icon-download_table"></use>
</svg>
<font color="red">Yeni</font>
                                Seasonally adjusted main labour force indicators (15+ age)
                            </td>
<td width="14%">
                                10 Ekim 2023
                            </td>
<td width="3%">
<a data-placement="top" data-toggle="tooltip" href="/Bulten/DownloadIstatistikselTablo?p=MPF4QRhuYRKw75WOIeh3AQn2CB/BHQzhxkHVEPClX/GEUiogyIPe6u1dbXOA1OfT" title="İstatis

In [109]:
row.find('svg').find_next_sibling(string=True).strip() if row.find('svg') else None

'Monthly average social security paymants and components by size class of enterprise'

In [103]:
# get first item
df.iloc[0]['datafile_url']

'http://data.tuik.gov.tr/Bulten/DownloadIstatistikselTablo?p=IEq9LM7MfrAwOb5YcoB7UjFDHQd1PUwIiSU50kW6smxf3paFfW587jMnbQxawoTT'

---

Now we've got urls for each statistics table in a given theme. We can use these urls to download the tables and parse them into dataframes.

In [95]:
# get first item
df.iloc[0]['datafile_url']

# download linked Excel file and read into a DataFrame
df_excel = pd.read_excel(df.iloc[0]['datafile_url'])

In [96]:
df_excel

Unnamed: 0,"Tüketim harcaması türlerinin harcamaya göre sıralı %20'lik gruplara göre dağılımı, Türkiye, 2002-2022",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Distribution of quintiles ordered by expenditu...,,,,,,,(%)
1,,,,,,,,
2,,,,Harcama grubu - Quintiles ordered by expenditure,,,,
3,Anket yılı Survey year,Harcama türleri\nExpenditure types,,,,,,
4,,,Toplam Total,1. %20 \nFirst quintile,2. %20 Second quintile,3. %20 Third quintile,4. %20 Fourth quintile,5. %20 Last quintile
...,...,...,...,...,...,...,...,...
268,COICOP sınıflaması 2022 yılında hem ECOICOP he...,,,,,,,
269,(1) Yeni nüfus projeksiyonları 2007 yılından ...,,,,,,,
270,(2) Örneklem tasarımında 2014 yılından itibare...,,,,,,,
271,(3) ECOICOP sınıflaması kullanılmıştır - ECOIC...,,,,,,,


In [29]:
# Select the first table
df = tables[0]
    
# Select only the first two columns and remove rows where the second column is empty
df = df.iloc[:, [0, 1]].dropna(subset=[1]).reset_index(drop=True)

# Clean the first column by removing specific patterns of strings
df[0] = df[0].apply(lambda x: re.sub(r'İstatistiksel Tablolar(Yeni)?\s*', '', x))

TypeError: 'NoneType' object is not subscriptable

In [54]:
# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')



KeyError: [1]

In [19]:
# Extract table data
tables = soup.find_all("table")

In [23]:
pd.read_html(str(tables[0]))

ValueError: No tables found matching pattern '.+'

In [32]:
theme_id = 107
url = 'https://data.tuik.gov.tr/Kategori/GetIstatistikselTablolar'
payload = f'UstId={theme_id}&DilId=2&Page=1&Count=10000&Arsiv=false'
headers = {'Content-Type': 'application/x-www-form-urlencoded'}

response = requests.post(url, data=payload, headers=headers)

In [35]:
response.text
# save response.text to file as html
with open('response.txt', 'w') as f:
    f.write(response.text)

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')
# save response.text to file as html


In [38]:
# Load the new HTML content from the provided .txt file
with open("response.txt", "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the table
tables = pd.read_html(html_content)

# Check if we have any tables and display the first one if available
if tables:
    df = tables[0]
    display(df.head())
else:
    print("No tables found in the HTML content.")

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,-,,,
1,Household Consumption Expenditure Statistics (...,,,
2,İstatistiksel Tablolar Distribution of Quinti...,09 Haziran 2023,,
3,İstatistiksel Tablolar Distribution of Househ...,09 Haziran 2023,,
4,İstatistiksel Tablolar Distribution of Househ...,09 Haziran 2023,,


In [39]:
tables

[                                           Unnamed: 0       Unnamed: 1  \
 0                                                   -              NaN   
 1   Household Consumption Expenditure Statistics (...              NaN   
 2   İstatistiksel Tablolar  Distribution of Quinti...  09 Haziran 2023   
 3   İstatistiksel Tablolar  Distribution of Househ...  09 Haziran 2023   
 4   İstatistiksel Tablolar  Distribution of Househ...  09 Haziran 2023   
 ..                                                ...              ...   
 95  İstatistiksel Tablolar  The distribution of sp...     12 Ocak 2016   
 96  İstatistiksel Tablolar  The distribution of in...     12 Ocak 2016   
 97  İstatistiksel Tablolar  The distribution of pe...     12 Ocak 2016   
 98  İstatistiksel Tablolar  Yaşlı bakımı ve sıklığ...     12 Ocak 2016   
 99  İstatistiksel Tablolar  Average activity time ...     12 Ocak 2016   
 
     Unnamed: 2  Unnamed: 3  
 0          NaN         NaN  
 1          NaN         NaN  
 2      

In [42]:
# Clean the DataFrame by selecting and renaming the relevant columns
cleaned_df = df.iloc[:, [0, 1]].copy()
cleaned_df.columns = ['data_name', 'data_date']

# Clean the data_name column to remove unwanted characters or strings
cleaned_df['data_name'] = cleaned_df['data_name'].str.replace('İstatistiksel Tablolar', '', regex=False).str.strip()

# Display the cleaned DataFrame
cleaned_df.head()

Unnamed: 0,data_name,data_date
0,-,
1,Household Consumption Expenditure Statistics (...,
2,Distribution of Quintiles Ordered by Expenditu...,09 Haziran 2023
3,Distribution of Household Consumption Expendit...,09 Haziran 2023
4,Distribution of Household Consumption Expendit...,09 Haziran 2023


In [43]:
# Filter out rows where the data_date column is NaN, as these rows likely represent subcategories
cleaned_df = cleaned_df.dropna(subset=['data_date']).reset_index(drop=True)

# Clean the data_name column by removing unwanted patterns
cleaned_df['data_name'] = cleaned_df['data_name'].str.replace(r'\s*İstatistiksel Tablolar(Yeni)?\s*', '', regex=True)

In [None]:
# Select only the first two columns and remove rows where the second column is empty
df = df.iloc[:, [0, 1]].dropna(subset=[1]).reset_index(drop=True)

# Clean the first column by removing specific patterns of strings
df[0] = df[0].apply(lambda x: re.sub(r'İstatistiksel Tablolar(Yeni)?\s*', '', x))

In [44]:
cleaned_df

Unnamed: 0,data_name,data_date
0,Distribution of Quintiles Ordered by Expenditu...,09 Haziran 2023
1,Distribution of Household Consumption Expendit...,09 Haziran 2023
2,Distribution of Household Consumption Expendit...,09 Haziran 2023
3,Distribution of Household Consumption Expendit...,09 Haziran 2023
4,Distribution of Household Consumption Expendit...,09 Haziran 2023
...,...,...
74,The distribution of sports activities done or ...,12 Ocak 2016
75,The distribution of intensity status of daily ...,12 Ocak 2016
76,The distribution of persons who do not have en...,12 Ocak 2016
77,Yaşlı bakımı ve sıklığının cinsiyete göre dağı...,12 Ocak 2016


In [46]:
# Clean the DataFrame by selecting and renaming the relevant columns
cleaned_df = df.iloc[:, [0, 1]].copy()
cleaned_df.columns = ['data_name', 'data_date']

# Clean the data_name column to remove unwanted characters or strings
cleaned_df['data_name'] = cleaned_df['data_name'].str.replace('İstatistiksel Tablolar', '', regex=False).str.strip()

# Display the cleaned DataFrame
display(cleaned_df.head())

# Now let's extract the URLs from the anchor tags in the HTML content
anchors = soup.find_all('a', href=True)
urls = [a['href'] for a in anchors]

# Display the first few URLs to inspect their structure
urls[:5]

Unnamed: 0,data_name,data_date
0,-,
1,Household Consumption Expenditure Statistics (...,
2,Distribution of Quintiles Ordered by Expenditu...,09 Haziran 2023
3,Distribution of Household Consumption Expendit...,09 Haziran 2023
4,Distribution of Household Consumption Expendit...,09 Haziran 2023


['/Bulten/DownloadIstatistikselTablo?p=ioIn886qTBBxZ04d6ADzoen12l8Jgf/nwNiesH5bOD9SEjfjmnV1Px0pu7U4F1jk',
 '/Bulten/DownloadIstatistikselTablo?p=IZarm6K/Mu88EoNXhQvRr76vq0y3fJQkvlrAo6FTUoooc3T79jyr8KRZcruUU/cd',
 '/Bulten/DownloadIstatistikselTablo?p=259m50DUpYS9GZbtWZO86fOJT4VR/jHbS7tW7w4V/TMJeY91SYAxPvxDfXevHD3/',
 '/Bulten/DownloadIstatistikselTablo?p=D3mYcO6Ez3eUFdjjsJhh/a12JDbZKIqUrf6bithoKW8Ca5RAJpzsGc4LbkBAJGFg',
 '/Bulten/DownloadIstatistikselTablo?p=FM9xB8s/0kXi7bPMZO4aPj/8noaXlpCCOUq3Tlvn8F0zGJ55JXjB/KfR04wday0r']

In [48]:
len(urls)

157

In [50]:
base_url = "http://data.tuik.gov.tr"

In [51]:
# Find the container element that holds the table rows
table_container = soup.find('tbody')

# Extract each row within the table container
table_rows = table_container.find_all('tr') if table_container else []

# Extract URLs associated with each row
row_urls = []
for row in table_rows:
    anchor = row.find('a', href=True)
    if anchor:
        url = base_url + anchor['href']
        row_urls.append(url)
    else:
        # Add None for rows without URLs to keep the same length
        row_urls.append(None)

# Filter out None values to get the final list of URLs
row_urls = list(filter(None, row_urls))

# Add the URLs to the cleaned DataFrame
if len(row_urls) == len(cleaned_df):
    cleaned_df['datafile_url'] = row_urls
else:
    print("The number of URLs does not match the number of rows in the DataFrame.")

# Display the DataFrame with URLs
cleaned_df.head()

The number of URLs does not match the number of rows in the DataFrame.


Unnamed: 0,data_name,data_date
0,-,
1,Household Consumption Expenditure Statistics (...,
2,Distribution of Quintiles Ordered by Expenditu...,09 Haziran 2023
3,Distribution of Household Consumption Expendit...,09 Haziran 2023
4,Distribution of Household Consumption Expendit...,09 Haziran 2023


In [None]:
https://data.tuik.gov.tr/Bulten/DownloadIstatistikselTablo?p=GTTIUMe2Lb1RLKEwfjRO7VmkLw0rDKNSu78yHiR69E3dZDh1VFoYqk1oX6DutFqj
https://data.tuik.gov.tr/Bulten/DownloadIstatistikselTablo?p=rBRd5tG9udHZ/fZljmbMwm2FUt1Z9SYIp2qGISNdlWohuyuPWrh5ZVxXNoHOFnsl

In [53]:
urls

['/Bulten/DownloadIstatistikselTablo?p=ioIn886qTBBxZ04d6ADzoen12l8Jgf/nwNiesH5bOD9SEjfjmnV1Px0pu7U4F1jk',
 '/Bulten/DownloadIstatistikselTablo?p=IZarm6K/Mu88EoNXhQvRr76vq0y3fJQkvlrAo6FTUoooc3T79jyr8KRZcruUU/cd',
 '/Bulten/DownloadIstatistikselTablo?p=259m50DUpYS9GZbtWZO86fOJT4VR/jHbS7tW7w4V/TMJeY91SYAxPvxDfXevHD3/',
 '/Bulten/DownloadIstatistikselTablo?p=D3mYcO6Ez3eUFdjjsJhh/a12JDbZKIqUrf6bithoKW8Ca5RAJpzsGc4LbkBAJGFg',
 '/Bulten/DownloadIstatistikselTablo?p=FM9xB8s/0kXi7bPMZO4aPj/8noaXlpCCOUq3Tlvn8F0zGJ55JXjB/KfR04wday0r',
 '/Bulten/DownloadIstatistikselTablo?p=ma2CEj/8/LjUNf1YUZ24YprUgiFYCwHJyZqxtK97gRYwaQFP07WgfX3JqsVIFrjn',
 '/Bulten/DownloadIstatistikselTablo?p=8JRe7mLB4LmjiXhcouBrSvKE6N1JM9jgnsNU5t4KaMiRTaFj9LK63FlaLlWQJD6s',
 '/Bulten/DownloadIstatistikselTablo?p=i3s/mneG9GZcBEK341QG1uHvoBka1QOYpwGczUnrvp0q5OZG6vyvLQQArJdfoX4A',
 '/Bulten/DownloadIstatistikselTablo?p=VIcN9noxXmip4cOEboEzIr6NQNA5fD8j1GfDAMKfoVjcDcULZazVlIOgAxERXJbj',
 '/Bulten/DownloadIstatistikselTablo?p=OFuqphQ