In [None]:
### Project imports ###
import getpass
import json
import os

import numpy as np
import pandas as pd
import psycopg2 as pg2
import matplotlib.pyplot as plt
import seaborn as sns

# Project overview
#### Objective: Create a historic daily labor demand index based on the total number of available jobs on a given day between 2006 and 2020  
    - In this project, only job ads from Arbetsförmedlingen (JobTechDev) will be used. In a future project, a current labor demand index will be constructed and supplemented with ads collected via web scraping.
    
    - Performance of the index will be evaluated through comparison with Sweden's official statistics (survey index)
#### Project steps
       - Step 1: Converting JSON files to CSV and extracting relevant data (+data cleaning)
       - Step 2: Inserting data into a PostgreSQL database and setting up relevant query
       - Step 3: Visualizing and discussing results

#### Step 1: Converting JSON files to CSV and extracting relevant data (+ data cleaning)
#### All data used in this project is from [JobTechDev](https://jobtechdev.se/docs/apis/historical/)  
    - The complete dataset is about 30.8GB and files are in the JSON-format
    - The dataset contains 32 different columns/variables, many of which are beyond the scope of the project; as such, only a selection will be extracted

In [None]:
# Convert the yearly JobTechDev JSON dumps into slim per-year CSV files.
#
# For every year in the range, four fields are kept per ad (headline, number of
# vacancies, publication date, application deadline) and written to one CSV.
#
# NOTE(review): the range below only covers 2006, so the 2017 and 2018-2020
# special cases further down are never exercised; widen it to
# range(2006, 2021) to process the whole dataset.
for file in range(2006, 2007):
    filename = f'/Users/Kevin/Desktop/project_dta/json_pb2006_2020/{file}.json'
    # Decode explicitly as UTF-8 so Swedish characters in the ads are read the
    # same way regardless of the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        ads = json.load(f)

    # For 2017 application_deadline is null, last_publication_date used as a proxy, vars 99.8% equivalent #
    deadline_key = 'last_publication_date' if file == 2017 else 'application_deadline'
    file_ads = [
        [ad['headline'], ad['number_of_vacancies'], ad['publication_date'], ad[deadline_key]]
        for ad in ads
    ]

    file_df = pd.DataFrame(
        file_ads,
        columns=['headline', 'number_of_vacancies', 'publication_date', 'application_deadline'],
    )

    # Date for 2018-2020 includes time in the date columns, we slice to only keep the format 'YYYY-MM-DD' #
    # 'YYYY-MM-DD' is exactly 10 characters; a slice of 11 would also keep the
    # separator that precedes the time part. The .str accessor leaves missing
    # values as missing instead of raising on None.
    if file in (2018, 2019, 2020):
        for date_col in ('publication_date', 'application_deadline'):
            file_df[date_col] = file_df[date_col].str[:10]

    # NOTE(review): the output file name has no '.csv' extension; kept as-is in
    # case a later import step relies on this exact path.
    file_df.to_csv(f'/Users/Kevin/Desktop/project_dta/csv_pd2006_2020/{file}', index=False)

In [None]:
# Missing-value map of the last converted year's frame: one row per ad, one
# column per field, with short tick labels for the four fields.
# Optional rendering tweaks, left disabled:
#plt.style.use('seaborn')
#%config InlineBackend.figure_format = 'svg'
sns.heatmap(
    data=file_df.isna(),
    xticklabels=['hl','vacant','post_d','dead_d'],
    cbar=False,
)

### Step 2: Inserting data into a PostgreSQL database and setting up relevant query
    - To work with the data efficiently, it will be imported into PostgreSQL for analysis; the query results will then be analyzed in this notebook

In [None]:
### DB created in pgAdmin4 GUI, table specifications below ###
# The database password is intentionally not stored in the notebook. It is read
# from the PGPASSWORD environment variable; if that is unset or empty, the user
# is prompted for it interactively.
conn = pg2.connect(
    database='job_ads',
    user='postgres',
    password=os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password for user postgres: '),
)
cur = conn.cursor()

In [None]:
### Table creation ###
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises once the
# table exists and leaves the connection's transaction in an aborted state.
cur.execute("""
CREATE TABLE IF NOT EXISTS historic_ads (
    ad_id SERIAL PRIMARY KEY,
    headline VARCHAR NOT NULL,
    number_of_vacancies INTEGER NOT NULL,
    publication_date DATE NOT NULL,
    application_deadline DATE NOT NULL);""")
# psycopg2 runs statements inside a transaction by default; commit so the new
# table is persisted rather than discarded when the connection closes.
conn.commit()

In [None]:
### fetch data from cursor, use tab to get all options (fetchone, fetchmany, fetchall)
# fetchall() raises if the last executed statement produced no result set (as is
# the case after a CREATE TABLE). cur.description is None in that situation, so
# fall back to an empty list instead of failing the cell.
data = cur.fetchall() if cur.description is not None else []

In [None]:
# Accumulator for rows collected from query results.
empt_list = list()

In [None]:
### With the cursor execute some command
# NOTE(review): the `payment` table is not created anywhere in this notebook;
# confirm it exists in the connected database.
# The six columns are listed explicitly so each row always matches the tuple
# unpacking below, independent of the table's physical column order.
cur.execute("""
SELECT payment_id, customer_id, staff_id, rental_id, amount, payment_date
FROM payment
WHERE payment_date BETWEEN '2005-01-01' AND '2007-05-01'
""")
data = cur.fetchall()

# Start from an empty list on every run so that re-executing this cell does not
# append the same rows a second time.
empt_list = []
### We use tuple unpacking to get our vars
for payment_id, customer_id, staff_id, rental_id, amount, payment_date in data:
    empt_list.append([payment_id, customer_id, staff_id, rental_id, amount, payment_date])


In [None]:
### Turn the collected row lists into a DataFrame (no column names are supplied)
x = pd.DataFrame(data=empt_list)