In [None]:
### Project imports ###
import calendar
import csv
import json
import os
import re
from calendar import monthrange
from getpass import getpass

import matplotlib.pyplot as plt
import pandas as pd
import psycopg2 as pg2

# Project overview
#### Objective: Create a historical daily labor demand index based on the total number of available jobs on a given day between 2006 and 2020
- In this project, only job ads from Arbetsförmedlingen (JobTechDev) will be used. In a future project, a current labor demand index will be constructed and supplemented with ads collected through web scraping.
    
- Statistics from the European Commission[<sup>1</sup>](#fn1) show that the percentage of job seekers using public employment services is among the highest in the EU in Sweden, above 70%. Thus, the index may be a good complement to official statistics in measuring labor demand.
    
- Performance of the index will be evaluated through comparison with Sweden's official statistics (survey index).

#### Project steps
- Step 1: Converting JSON files to CSV and extracting relevant data (+data cleaning).
- Step 2: Inserting data into a PostgreSQL database and setting up the relevant query.
- Step 3: Visualization and discussion of results.

<span id="fn1"> footnote 1</span>: European Commission (2017) [European Semester Thematic Factsheet](https://wayback.archive-it.org/12090/20201012083437/https://ec.europa.eu/info/sites/info/files/european-semester_thematic-factsheet_public-employment-services_en_0.pdf)

#### Step 1: Converting JSON files to CSV and extracting relevant data (+ data cleaning)
#### All data used in this project is from [JobTechDev](https://jobtechdev.se/docs/apis/historical/), an initiative by the Swedish public employment service.
- The complete dataset is about 30.8 GB and files are in the JSON-format
- The dataset contains 32 different columns/variables, many of which are beyond the scope of the project, so only a selection will be extracted.

- Variables to be extracted for this project are the following:
    - headline: The ad headline
    - number_of_vacancies: The number of jobs advertised in the ad
    - publication_date: Date the ad was published on the job ad platform
    - application_deadline: The last date to apply for the job
    - last_publication_date: The last date the ad was public, used as a substitute for application_deadline for 2017, where application_deadline is missing

In [None]:
def _select_ad_fields(ad, year):
    """Pick the columns used by the project from one job ad.

    Parameters
    ----------
    ad : dict
        One ad record as loaded from the yearly JSON file.
    year : int
        Year of the file the ad comes from. For 2017 application_deadline is
        null, so last_publication_date is used as a proxy (vars 99.8% equivalent).

    Returns
    -------
    list or None
        [headline, number_of_vacancies, publication_date, end_date] with both
        dates cut to their first 10 characters, or None when the ad has a
        missing/unparsable value or no vacancies.
    """
    # Removal of special characters, events of \n causes errors in csv file #
    head_line = re.sub('[!,*)@#%(&$_?.^\\\\\n/]', '', str(ad['headline']))
    end_date_key = 'last_publication_date' if year == 2017 else 'application_deadline'
    try:
        ad_select = [head_line, int(ad['number_of_vacancies']), ad['publication_date'][:10], ad[end_date_key][:10]]
    except (KeyError, TypeError, ValueError):
        # Missing key, null value (None cannot be sliced / converted) or non-numeric vacancies #
        return None

    # Jobs ads with no vacancies or missing values in date/vacancies removed, headlines of less importance #
    if ad_select[1] > 0 and all(ad_select[2:]):
        return ad_select
    return None


total_valid = 0; total_errors = 0
# One JSON file per year; the range matches the 2006-2020 CSV files read by the import cell #
for year in range(2006, 2021):
    filename = f'/Users/Kevin/Desktop/project_dta/json_pb2006_2020/{year}.json'
    with open(filename, encoding='utf-8') as f:
        ads = json.load(f)

    file_ads = []
    error_rows = 0
    for ad in ads:
        ad_select = _select_ad_fields(ad, year)
        if ad_select is None:
            error_rows += 1
        else:
            file_ads.append(ad_select)

    # We write the extracted data to a CSV-file for easy insertion into PostgreSQL #
    with open(f'/Users/Kevin/Desktop/project_dta/csv_pd2006_2020/{year}.csv', mode='w', newline="") as file_writer:
        csv.writer(file_writer).writerows(file_ads)
    print(f'vaild ads for {year}: ' + str(len(file_ads)))
    print(f'erroneous ads for {year}: ' + str(error_rows))
    total_valid += len(file_ads); total_errors += error_rows
print('\n')
print('total valid ads: ' + str(total_valid))
print('total erroneous ads: '+ str(total_errors))

* As the above output shows, we have a total of x ads between 2006-2020
* Between 2018-2020 data seems to have been cleaned before publication as few ads are erroneous

#### Step 2: Inserting data into a PostgreSQL database and setting up the relevant query
 - To work efficiently with the data, it will be imported into PostgreSQL for analysis.

In [None]:
# DB created in pgAdmin4 GUI, login and connect cursor #
# The password is taken from the PGPASSWORD environment variable, with an interactive
# prompt as fallback, so that no credential is stored in the notebook itself #
db_password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password for user postgres: ')
conn = pg2.connect(database='job_ads', user='postgres', password=db_password)
cur = conn.cursor()

In [None]:
# Table creation #
# Dropping first makes this cell re-runnable: a plain CREATE TABLE fails with
# DuplicateTable on the second run and leaves the transaction aborted.
# NOTE: this discards any rows already imported; re-run the import cell afterwards #
cur.execute("DROP TABLE IF EXISTS historic_final_f;")
cur.execute("""
CREATE TABLE historic_final_f(
    ad_id SERIAL PRIMARY KEY,
    headline VARCHAR,
    number_of_vacancies INTEGER NOT NULL,
    publication_date DATE NOT NULL,
    application_deadline DATE NOT NULL);""")
# Changes to a DB must be followed by a commit() statement, otherwise rollback() #
conn.commit()

In [None]:
# Importing csv files into PostgreSQL #
# The files are produced by csv.writer, so they are loaded with COPY in CSV format:
# the text format used by copy_from() does not understand CSV quoting (e.g. headlines
# containing double quotes or carriage returns).
# FORCE_NOT_NULL keeps an empty headline as '' instead of turning it into NULL #
copy_sql = """
COPY historic_final_f (headline, number_of_vacancies, publication_date, application_deadline)
FROM STDIN WITH (FORMAT csv, FORCE_NOT_NULL (headline))
"""
for file in range(2006, 2021):
    # newline='' hands the file content to COPY untouched, including quoted line breaks #
    with open(f'/Users/Kevin/Desktop/project_dta/csv_pd2006_2020/{file}.csv', 'r', newline='') as csv_file:
        try:
            cur.copy_expert(copy_sql, csv_file)
        except Exception:
            # A failed COPY aborts the transaction; roll back so the connection stays usable #
            conn.rollback()
            raise
    # One commit per year: years already loaded survive a failure in a later file #
    conn.commit()

##### In the cell below we check, for each day between 2006 and 2020, which ads were active on that day, and then sum the number of vacancies in those ads.

In [None]:
# Sum of vacancies over all ads that are active on a given day, i.e. published on or
# before that day and with a deadline on or after it.
# COALESCE turns the NULL that SUM() returns for a day without any active ad into 0 #
daily_vacancies_sql = """
SELECT COALESCE(SUM(number_of_vacancies), 0) FROM historic_final_f WHERE %(day)s >= publication_date AND %(day)s <= application_deadline;
"""

vacancies_per_day = []
for year in range(2006, 2021):
    for month in range(1, 13):
        days_in_month = calendar.monthrange(year, month)[1]
        # + 1 because range() excludes its upper bound; otherwise the last day of every month is skipped #
        for day in range(1, days_in_month + 1):
            date_label = f'{year}-{month}-{day}'
            # The date is passed as a query parameter instead of being formatted into the SQL text #
            cur.execute(daily_vacancies_sql, {'day': date_label})
            data = cur.fetchone()
            vacancies_per_day.append([date_label, int(data[0])])

# Each row of the result file is: date, total vacancies open on that date #
with open('/Users/Kevin/Desktop/project_dta/csv_pd2006_2020/results.csv', mode='w', newline="") as file_writer:
    csv.writer(file_writer).writerows(vacancies_per_day)

# One summary line instead of printing every day #
print(f'days computed: {len(vacancies_per_day)}')

### Step 3: Visualizing and discussion of results
#### As mentioned, the comparison data is from Statistics Sweden and can be collected 


In [None]:
%matplotlib inline
plt.style.use('seaborn')
%config InlineBackend.figure_format = 'svg'

fig, ax = plt.subplots(figsize=(5.5,3.5))

plt.plot(pd_result.iloc[:,0], pd_result.iloc[:,1])
plt.plot(pd_result.iloc[:,0], pd_result['mean'], alpha=0.4, color='r')
plt.xticks(date_index,rotation=90)

plt.title('Lediga jobb 01-31/05/2006', loc='left',fontweight='heavy')

# plt.savefig('/Users/Kevin/Desktop/Jobb.svg', bbox_inches='tight', facecolor=fig.get_facecolor(), edgecolor='none')
plt.show()