Notebook for automating the parsing of labor force indicators from the excel document.

In [1]:
import pandas as pd

In [9]:
file_path = '../raw_data/sa_main_labour_force_indicators_15_over.xls'
xls = pd.ExcelFile(file_path)

# Get the sheet names
sheet_names = xls.sheet_names
sheet_names

['Tablo1']

In [10]:
# Load the data from the "Tablo1" sheet
df = pd.read_excel(xls, 'Tablo1')

# Display the first 10 rows transposed for easier viewing of the columns
df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Mevsim etkisinden arındırılmış temel işgücü göstergeleri,Seasonally adjusted main labour force indicators,[15+ yaş - age],Yıllar - Years,Toplam - Total,2005,,,,,
Unnamed: 1,,,,,Ocak - January,Şubat - February,Mart - March,Nisan - April,Mayıs - May,Haziran - June
Unnamed: 2,,,15 ve daha yukarı yaştaki nüfus\nPopulation 15...,,47938,48015,48086,48163,48239,48317
Unnamed: 3,,,İşgücü\nLabour force,,21215,21120,21210,21270,21271,21329
Unnamed: 4,,,İstihdam\nEmployment,,19267,19180,19263,19279,19270,19326
Unnamed: 5,,,İşsiz\nUnemployment,,1949,1941,1947,1992,2001,2003
Unnamed: 6,,,İşgücüne dahil olmayanlar \nNot in lab...,,26723,26895,26876,26893,26968,26988
Unnamed: 7,,,İşgücüne katılma oranı\nLabour force participa...,,44.3,44,44.1,44.2,44.1,44.1
Unnamed: 8,,,İstihdam oranı \nEmployment rate ...,,40.2,39.9,40.1,40,39.9,40
Unnamed: 9,,(Bin kişi - Thousand person),İşsizlik oranı\nUnemployment rate \n(%),,9.2,9.2,9.2,9.4,9.4,9.4


It appears that the data we need starts from the 5th row, with the actual data values beginning from the 4th row. The data is bilingual, with English translations available. We'll use the English translations for consistency and readability.

Here are the steps we'll proceed with:

1. Set the 4th row as the header, using English translations of the column names.
2. Remove all rows above the new header.
3. Clean the column names to remove newline characters and extra spaces.
4. Inspect the cleaned data and identify any additional cleaning or transformation needed.

In [85]:
# Reload the data with the 3rd row as the header
df = pd.read_excel(xls, 'Tablo1', header=3)

# Display the first few rows of the updated DataFrame
df.head()

Unnamed: 0,Yıllar - Years,Unnamed: 1,15 ve daha yukarı yaştaki nüfus\nPopulation 15 years and over,İşgücü\nLabour force,İstihdam\nEmployment,İşsiz\nUnemployment,İşgücüne dahil olmayanlar \nNot in labour force,İşgücüne katılma oranı\nLabour force participation rate\n(%),İstihdam oranı \nEmployment rate \n(%),İşsizlik oranı\nUnemployment rate \n(%),...,Yıllar - Years.2,Unnamed: 23,15 ve daha yukarı yaştaki nüfus\nPopulation 15 years and over.2,İşgücü\nLabour force.2,İstihdam\nEmployment.2,İşsiz\nUnemployment.2,İşgücüne dahil olmayanlar \nNot in labour force.2,İşgücüne katılma oranı\nLabour force participation rate\n(%).2,İstihdam oranı \nEmployment rate \n(%).2,İşsizlik oranı\nUnemployment rate \n(%).2
0,Toplam - Total,,,,,,,,,,...,Kadın - Female,,,,,,,,,
1,2005,Ocak - January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2,...,2005,Ocak - January,24441.0,5191.0,4740.0,450.0,19250.0,21.2,19.4,8.7
2,,Şubat - February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2,...,,Şubat - February,24479.0,5135.0,4696.0,439.0,19344.0,21.0,19.2,8.5
3,,Mart - March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2,...,,Mart - March,24514.0,5152.0,4709.0,444.0,19361.0,21.0,19.2,8.6
4,,Nisan - April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4,...,,Nisan - April,24552.0,5191.0,4724.0,467.0,19361.0,21.1,19.2,9.0


In [86]:
# Clean the column names by keeping only the English parts and stripping extra spaces
df.columns = [col.split('\n')[1].strip() if '\n' in col else col for col in df.columns]

In [87]:
df.head()

Unnamed: 0,Yıllar - Years,Unnamed: 1,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate,...,Yıllar - Years.2,Unnamed: 23,Population 15 years and over.2,Labour force.2,Employment.2,Unemployment.2,Not in labour force.2,Labour force participation rate.1,Employment rate.1,Unemployment rate.1
0,Toplam - Total,,,,,,,,,,...,Kadın - Female,,,,,,,,,
1,2005,Ocak - January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2,...,2005,Ocak - January,24441.0,5191.0,4740.0,450.0,19250.0,21.2,19.4,8.7
2,,Şubat - February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2,...,,Şubat - February,24479.0,5135.0,4696.0,439.0,19344.0,21.0,19.2,8.5
3,,Mart - March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2,...,,Mart - March,24514.0,5152.0,4709.0,444.0,19361.0,21.0,19.2,8.6
4,,Nisan - April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4,...,,Nisan - April,24552.0,5191.0,4724.0,467.0,19361.0,21.1,19.2,9.0


In [88]:
# find the columns with no values in
df.columns[df.isnull().all()]

# Columns with no values separate the data columns between `Total`, `Male` and `Female`
# We only want `Total` columns, so we will drop all columns after the first column with no values

# select columns by name
df = df.loc[:, :df.columns[df.isnull().all()][0]]
df.drop(df.columns[df.isnull().all()][0], axis=1, inplace=True)

In [89]:
# Find the rows with no values in
bottom_index = df[df.isnull().all(axis=1)].index

# Row with no values separates bottom of data from data notes, so we can drop it and any rows below it
# drop rows at and after index `df[df.isnull().all(axis=1)].index`
# drop rows below `bottom_index`
df.drop(df.index[bottom_index[0]:], inplace=True)

In [90]:
df

Unnamed: 0,Yıllar - Years,Unnamed: 1,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate
0,Toplam - Total,,,,,,,,,
1,2005,Ocak - January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2
2,,Şubat - February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2
3,,Mart - March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2
4,,Nisan - April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4
...,...,...,...,...,...,...,...,...,...,...
219,,Mart - March (r),65257.0,34627.0,31162.0,3465.0,30630.0,53.1,47.8,10
220,,Nisan - April (r),65305.0,35156.0,31633.0,3523.0,30149.0,53.8,48.4,10
221,,Mayıs - May (r),65352.0,35034.0,31706.0,3328.0,30318.0,53.6,48.5,9.5
222,,Haziran - June (r),65400.0,34682.0,31353.0,3329.0,30718.0,53.0,47.9,9.6


In [91]:
# Drop the first row if there are a high percentage of null values
# If there are a high percentage of null values in the first row, it is likely that the first row is a title row
# and not a data row
if df.iloc[0].isnull().sum() / df.shape[1] > 0.5:
    df.drop(df.index[0], inplace=True)

In [92]:
df

Unnamed: 0,Yıllar - Years,Unnamed: 1,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate
1,2005,Ocak - January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2
2,,Şubat - February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2
3,,Mart - March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2
4,,Nisan - April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4
5,,Mayıs - May,48239.0,21271.0,19270.0,2001.0,26968.0,44.1,39.9,9.4
...,...,...,...,...,...,...,...,...,...,...
219,,Mart - March (r),65257.0,34627.0,31162.0,3465.0,30630.0,53.1,47.8,10
220,,Nisan - April (r),65305.0,35156.0,31633.0,3523.0,30149.0,53.8,48.4,10
221,,Mayıs - May (r),65352.0,35034.0,31706.0,3328.0,30318.0,53.6,48.5,9.5
222,,Haziran - June (r),65400.0,34682.0,31353.0,3329.0,30718.0,53.0,47.9,9.6


In [93]:
# rename the first & second column to `Year` and `Month`
df.rename(columns={df.columns[0]: 'Year', df.columns[1]: 'Month'}, inplace=True)

In [94]:
df.head()

Unnamed: 0,Year,Month,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate
1,2005.0,Ocak - January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2
2,,Şubat - February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2
3,,Mart - March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2
4,,Nisan - April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4
5,,Mayıs - May,48239.0,21271.0,19270.0,2001.0,26968.0,44.1,39.9,9.4


In [95]:
# The `Year` column only has values for every 12th row, so we will forward fill the values
df['Year'] = df['Year'].ffill()

In [96]:
# Let's clean the month column, remove any string before & including the `-` character
df['Month'] = df['Month'].str.split('-').str[1]

In [97]:
# Remove '(r)' from any strings in the `Month` column, and strip any extra spaces
df['Month'] = df['Month'].str.split('(').str[0].str.strip()

In [98]:
df

Unnamed: 0,Year,Month,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate
1,2005,January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2
2,2005,February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2
3,2005,March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2
4,2005,April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4
5,2005,May,48239.0,21271.0,19270.0,2001.0,26968.0,44.1,39.9,9.4
...,...,...,...,...,...,...,...,...,...,...
219,2023,March,65257.0,34627.0,31162.0,3465.0,30630.0,53.1,47.8,10
220,2023,April,65305.0,35156.0,31633.0,3523.0,30149.0,53.8,48.4,10
221,2023,May,65352.0,35034.0,31706.0,3328.0,30318.0,53.6,48.5,9.5
222,2023,June,65400.0,34682.0,31353.0,3329.0,30718.0,53.0,47.9,9.6


In [99]:
# Now, combine the `Year` and `Month` columns into a single `Date` column, and convert it to a datetime
df['Date'] = df['Year'].astype(str) + '-' + df['Month'].astype(str)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%B')

# reordering the columns so that `Date` is the first column
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

# Drop the `Year` and `Month` columns
df.drop(['Year', 'Month'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['Year', 'Month'], axis=1, inplace=True)


In [100]:
df

Unnamed: 0,Date,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate
1,2005-01-01,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2
2,2005-02-01,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2
3,2005-03-01,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2
4,2005-04-01,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4
5,2005-05-01,48239.0,21271.0,19270.0,2001.0,26968.0,44.1,39.9,9.4
...,...,...,...,...,...,...,...,...,...
219,2023-03-01,65257.0,34627.0,31162.0,3465.0,30630.0,53.1,47.8,10
220,2023-04-01,65305.0,35156.0,31633.0,3523.0,30149.0,53.8,48.4,10
221,2023-05-01,65352.0,35034.0,31706.0,3328.0,30318.0,53.6,48.5,9.5
222,2023-06-01,65400.0,34682.0,31353.0,3329.0,30718.0,53.0,47.9,9.6


---

##### Extracting Series

In [101]:
# Get column names, except `Date`
cols = df.columns.tolist()[1:]

In [102]:
cols

['Population 15 years and over',
 'Labour force',
 'Employment',
 'Unemployment',
 'Not in labour force',
 'Labour force participation rate',
 'Employment rate',
 'Unemployment rate']

In [None]:
# Function for extracts and cleaning `Population 15 years and over` column

In [44]:
df

Unnamed: 0,Year,Month,Population 15 years and over,Labour force,Employment,Unemployment,Not in labour force,Labour force participation rate,Employment rate,Unemployment rate
1,2005,Ocak - January,47938.0,21215.0,19267.0,1949.0,26723.0,44.3,40.2,9.2
2,,Şubat - February,48015.0,21120.0,19180.0,1941.0,26895.0,44.0,39.9,9.2
3,,Mart - March,48086.0,21210.0,19263.0,1947.0,26876.0,44.1,40.1,9.2
4,,Nisan - April,48163.0,21270.0,19279.0,1992.0,26893.0,44.2,40.0,9.4
5,,Mayıs - May,48239.0,21271.0,19270.0,2001.0,26968.0,44.1,39.9,9.4
...,...,...,...,...,...,...,...,...,...,...
219,,Mart - March (r),65257.0,34627.0,31162.0,3465.0,30630.0,53.1,47.8,10
220,,Nisan - April (r),65305.0,35156.0,31633.0,3523.0,30149.0,53.8,48.4,10
221,,Mayıs - May (r),65352.0,35034.0,31706.0,3328.0,30318.0,53.6,48.5,9.5
222,,Haziran - June (r),65400.0,34682.0,31353.0,3329.0,30718.0,53.0,47.9,9.6
