# Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mysql.connector
import json
from tqdm import tqdm

# Import data

In [None]:
# Load data/offers_canarias.json into a DataFrame and attach a
# 'salaryOriginal' column that is NaN for every row.
df = pd.read_json(
    'data/offers_canarias.json',
    encoding='utf-8',
).assign(salaryOriginal=np.nan)
# Helper used to pull the 'label' entry out of a nested dictionary value
def get_label(d):
    """Return ``d['label']`` if present, otherwise ``None``.

    Args:
        d: A dict-like object exposing ``.get``. Values without a ``.get``
            method (for example ``None`` or a float NaN standing in for a
            missing entry) are tolerated.

    Returns:
        The value stored under the ``'label'`` key, or ``None`` when the key
        is absent or ``d`` is not dict-like.
    """
    try:
        return d.get('label')
    except AttributeError:
        # Missing entries arrive as None/NaN rather than as a dict
        return None
# Replace each value in 'location' and 'category' with what get_label
# returns for it, then show the frame's dimensions and first rows.
for nested_column in ('location', 'category'):
    df[nested_column] = df[nested_column].apply(get_label)
print(df.shape)
df.head()

# Test connection to database

In [None]:
# Connection settings passed as keyword arguments to mysql.connector.connect
config = dict(
    user='root',
    password='root',
    host='localhost',
    database='test',
    port='3306',
)

# Open the connection and a cursor; both are used by the statements further down
cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()

# Smoke test: run a full SELECT on each table and print whatever comes back
for smoke_query in (
    'SELECT * FROM test.ofertas_jobmarket_canarias_21_23',
    'SELECT * FROM test.postings_jobmarket_canarias_21_23',
):
    cursor.execute(smoke_query)
    print(cursor.fetchall())

# Describe table

In [None]:
# Ask MySQL for the structure of the offers table
cursor.execute('DESCRIBE test.ofertas_jobmarket_canarias_21_23')
# Keep the raw DESCRIBE rows (nothing is printed here)
describe = cursor.fetchall()
# The first field of each DESCRIBE row is collected as the column name
columns = [field_row[0] for field_row in describe]

# Import data to database

In [None]:
# Insert every offer into ofertas_jobmarket_canarias_21_23 and collect its
# postings (tagged with the new offer id) for the later postings insert.
postings_data = []
# Offers rejected by MySQL with a DataError are counted rather than dropped silently
skipped_offers = 0
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    postings = row['postings']
    # Every column except "postings" goes into the offers table; NaN -> None
    row_data = {
        key: (None if pd.isna(value) else value)
        for key, value in row.drop('postings').to_dict().items()
    }
    # Column names come from the DataFrame; values are bound as parameters
    insert_offer_sql = 'INSERT INTO test.ofertas_jobmarket_canarias_21_23 ({}) VALUES ({})'.format(
        ', '.join(row_data),
        ', '.join(['%s'] * len(row_data)),
    )
    try:
        cursor.execute(insert_offer_sql, list(row_data.values()))
    except mysql.connector.errors.DataError:
        skipped_offers += 1
        continue
    # Id generated by the INSERT above; links each posting to its offer
    offer_id = cursor.lastrowid
    for source_posting in postings:
        # Work on a copy so the dicts held by df stay untouched and this
        # cell can be re-run without 'id_posting' being overwritten.
        posting = dict(source_posting)
        # The posting's own id becomes id_posting; 'id' now holds the offer id
        posting['id_posting'] = posting.pop('id')
        posting['id'] = offer_id
        # Replace NaN values with None before anything is sent to MySQL
        posting = {
            key: (None if pd.isna(value) else value)
            for key, value in posting.items()
        }
        # A posting carrying 'salaryOriginal' updates the offer just inserted
        if 'salaryOriginal' in posting:
            cursor.execute(
                'UPDATE test.ofertas_jobmarket_canarias_21_23 SET salaryOriginal = %s WHERE id = %s',
                (posting['salaryOriginal'], offer_id),
            )
        postings_data.append(posting)
    # Commit once per successfully inserted offer
    cnx.commit()

if skipped_offers:
    print(f"Skipped {skipped_offers} offers because of DataError")

In [None]:
# Insert the collected postings into postings_jobmarket_canarias_21_23.
# The column list is built from the flattened keys below, not from a Python
# list interpolated into the SQL text.
# NOTE(review): assumes the postings table columns are named exactly like
# these keys - confirm against the table definition.
posting_columns = (
    'date',
    'site_label',
    'site_key',
    'titleOriginal',
    'url',
    'id_posting',
    'id',
)
insert_posting_sql = 'INSERT INTO test.postings_jobmarket_canarias_21_23 ({}) VALUES ({})'.format(
    ', '.join(posting_columns),
    ', '.join(['%s'] * len(posting_columns)),
)

for posting in tqdm(postings_data, desc="Inserting postings"):
    # Flatten the nested 'site' dictionary into two scalar columns
    site = posting['site']
    flattened_posting = {
        'date': posting['date'],
        'site_label': site['label'],
        'site_key': site['key'],
        'titleOriginal': posting['titleOriginal'],
        'url': posting['url'],
        'id_posting': posting['id_posting'],
        'id': posting['id'],
    }
    # Order the values to match the column order in the SQL statement
    values = [flattened_posting[column] for column in posting_columns]
    try:
        cursor.execute(insert_posting_sql, values)
    except mysql.connector.errors.DataError:
        # Skip postings whose values MySQL rejects
        continue
    cnx.commit()


# Test MySQL table

In [None]:
import time

def timer(function=None, *args, **kwargs):
    """Call ``function(*args, **kwargs)``, print the elapsed time, return its result.

    Args:
        function: The callable to time.
        *args: Positional arguments forwarded to ``function``.
        **kwargs: Keyword arguments forwarded to ``function``.

    Returns:
        Whatever ``function`` returned.

    Raises:
        TypeError: If ``function`` is not callable (including the ``None``
            default).
    """
    if not callable(function):
        raise TypeError("timer() requires a callable as its first argument")
    # perf_counter is a monotonic, high-resolution clock meant for intervals
    start = time.perf_counter()
    data = function(*args, **kwargs)
    end = time.perf_counter()
    print(f"Time elapsed: {end - start}")
    return data

def get_offers():
    """Run a full SELECT on the offers table via the global cursor and return all rows."""
    offers_sql = 'SELECT * FROM test.ofertas_jobmarket_canarias_21_23'
    cursor.execute(offers_sql)
    return cursor.fetchall()

def get_postings():
    """Run a full SELECT on the postings table via the global cursor and return all rows."""
    postings_sql = 'SELECT * FROM test.postings_jobmarket_canarias_21_23'
    cursor.execute(postings_sql)
    return cursor.fetchall()

def get_offers_and_postings():
    """Return all rows of the offers table inner-joined to the postings table on ``id``."""
    cursor.execute("""
    SELECT * FROM test.ofertas_jobmarket_canarias_21_23 AS o
    INNER JOIN test.postings_jobmarket_canarias_21_23 AS p
    ON o.id = p.id
    """)
    return cursor.fetchall()

def get_offer_by_id(id):
    """Return the rows of the offers table whose ``id`` column equals ``id``.

    Args:
        id: The offer id to look up. The parameter name is kept for
            backward compatibility even though it shadows the builtin.

    Returns:
        The list of matching rows as returned by ``cursor.fetchall()``.
    """
    # Bind the value as a parameter instead of formatting it into the SQL text
    query = "SELECT * FROM test.ofertas_jobmarket_canarias_21_23 WHERE id = %s"
    cursor.execute(query, (id,))
    return cursor.fetchall()

# Time each query helper in turn; only the lookup by id takes an argument
for timed_function, call_args in (
    (get_offers, ()),
    (get_postings, ()),
    (get_offers_and_postings, ()),
    (get_offer_by_id, (1,)),
):
    timer(timed_function, *call_args)