In [2]:
import pandas as pd

# Load the CSV file
input_file = 'job_postings_by_sector_DE.csv'  # Replace with your file path
df = pd.read_csv(input_file)

# Convert the date column to datetime if it's not already in that format
df['date'] = pd.to_datetime(df['date'])

# Create a new column for the monthly data (only keep year and month)
df['Month'] = df['date'].dt.to_period('M')

# Group by display_name and Month, and calculate the average of indeed_job_postings_index
monthly_aggregated = df.groupby(['display_name', 'Month']).agg({
    'indeed_job_postings_index': 'mean'
}).reset_index()

# Convert the 'Month' column back to datetime, keeping the first day of each month
monthly_aggregated['date'] = monthly_aggregated['Month'].dt.to_timestamp()

# Add a constant column for jobcountry as 'DE'
monthly_aggregated['jobcountry'] = 'DE'

# Rearrange columns and rename them as required
output_df = monthly_aggregated[['date', 'jobcountry', 'indeed_job_postings_index', 'display_name']]

# Save the new DataFrame to a CSV file
output_file = 'job_postings_by_sector_DE_aggregated.csv'  # Output path
output_df.to_csv(output_file, index=False)

print(f"New CSV file created: {output_file}")


New CSV file created: job_postings_by_sector_DE_aggregated.csv


Data file: https://www.kaggle.com/datasets/kimminh21/job-postings/data 

indeed_job_postings_index: In the file you provided, the indeed_job_postings_index parameter seems to represent a normalized or indexed value indicating the number of job postings from Indeed for a specific sector. The value is indexed over time, with each row representing data for a given month and sector. An index value around 100 suggests a baseline or reference point, while values above or below this number suggest relative increases or decreases in job postings compared to that baseline.

For example: A value of 100.88 in February 2020 for "Accounting" suggests a level slightly above the baseline for job postings in that sector.

Step 1: Aggregating daily data to monthly data 
Step 2: job_posting.db created / not important as the CSV file will be added 
Step 3: 


In [3]:
import pandas as pd

# Load the dataset
file_path = 'job_postings_by_sector_DE_aggregated.csv'
df = pd.read_csv(file_path)

# Display basic information about the dataset
print(df.info())
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Show the first few rows of the dataset
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296 entries, 0 to 2295
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   date                       2296 non-null   object 
 1   jobcountry                 2296 non-null   object 
 2   indeed_job_postings_index  2296 non-null   float64
 3   display_name               2296 non-null   object 
dtypes: float64(1), object(3)
memory usage: 71.9+ KB
None
       indeed_job_postings_index
count                2296.000000
mean                  142.647285
std                    46.061219
min                    37.538710
25%                   104.185968
50%                   142.528226
75%                   172.611452
max                   335.344667
date                         0
jobcountry                   0
indeed_job_postings_index    0
display_name                 0
dtype: int64
         date jobcountry  indeed_job_postings_index display_name
0

In [4]:

# creating a DB File to be imported 
import sqlite3

# Connect to SQLite (or any other SQL engine you are using)
conn = sqlite3.connect('job_postings.db')

# Load the dataset into SQL (this will create a table in the database)
df.to_sql('job_postings', conn, if_exists='replace', index=False)

# Confirm the table has been created and data inserted
query = "SELECT * FROM job_postings LIMIT 5;"
print(pd.read_sql(query, conn))

         date jobcountry  indeed_job_postings_index display_name
0  2020-02-01         DE                 100.881379   Accounting
1  2020-03-01         DE                  97.552258   Accounting
2  2020-04-01         DE                  82.365000   Accounting
3  2020-05-01         DE                  78.710645   Accounting
4  2020-06-01         DE                  79.868667   Accounting


In [5]:
import pandas as pd

# Step 1: Load the data from the CSV file
data = pd.read_csv('job_postings_by_sector_DE_aggregated.csv')

# Step 2: Filter the data for IT-related sectors
it_sectors = ['IT Operations & Helpdesk', 'Information Design & Documentation', 'Software Development']
filtered_data = data[data['display_name'].isin(it_sectors)]

# Step 3: Convert the 'date' column to datetime format
filtered_data['date'] = pd.to_datetime(filtered_data['date'])

# Step 4: Sort the data by 'display_name' and 'date'
filtered_data = filtered_data.sort_values(by=['display_name', 'date'])

# Step 5: Calculate the month-over-month percentage change
filtered_data['mom_percentage_change'] = filtered_data.groupby('display_name')['indeed_job_postings_index'].pct_change() * 100

# Step 6: Pivot the table to display each month as a column
pivot_table = filtered_data.pivot(index='display_name', columns='date', values='mom_percentage_change')

# Step 7: Rename columns to 'YYYY-MM MoM' format
pivot_table.columns = [f"{col.strftime('%Y-%m')} MoM" for col in pivot_table.columns]

# Step 8: Save the result to a CSV file (optional) or display it
pivot_table.to_csv('it_sectors_mom_percentage_change.csv')

# To display the result instead of saving:
print(pivot_table)


                                    2020-02 MoM  2020-03 MoM  2020-04 MoM  \
display_name                                                                
IT Operations & Helpdesk                    NaN    -2.329190   -14.790635   
Information Design & Documentation          NaN    -3.132472   -17.249557   
Software Development                        NaN    -3.927253   -14.060045   

                                    2020-05 MoM  2020-06 MoM  2020-07 MoM  \
display_name                                                                
IT Operations & Helpdesk              -1.554613     1.366314     2.055617   
Information Design & Documentation    -7.670945     0.828042     2.456450   
Software Development                  -3.786816     2.412760     2.380499   

                                    2020-08 MoM  2020-09 MoM  2020-10 MoM  \
display_name                                                                
IT Operations & Helpdesk               3.997910    -2.036803     2.585348 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['date'] = pd.to_datetime(filtered_data['date'])
