In [3]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
warnings.filterwarnings("ignore")
# Location of the jobs dataset. Set the JOBS_DATA_CSV environment variable to
# point at the CSV on another machine; when it is unset, the original local
# path is used so existing behaviour is unchanged.
path = os.environ.get(
    "JOBS_DATA_CSV",
    "C:\\Users\\559816\\Documents\\Programming\\Hackahaton\\jobs_in_data.csv",
)
# Read the comma-separated file into the frame every later cell works from.
df = pd.read_csv(path, sep=',')

In [22]:
# Headline figures for the dataset, each rendered as its own Plotly
# "number" indicator card.
unique_job_titles_count = df['job_title'].nunique()
unique_job_categories_count = df['job_category'].nunique()

# Sum of salary_in_usd over the rows whose work_year equals 2023.
is_2023 = df['work_year'] == 2023
total_salaries_2023 = df.loc[is_2023, 'salary_in_usd'].sum()


def _number_card(value, title, **number_style):
    """Return an 800x300 indicator figure showing ``value`` under ``title``.

    The number is drawn in light blue; extra keyword arguments (for example
    ``prefix="$"``) are merged into the indicator's ``number`` settings.
    """
    number = {'font': {'color': 'lightblue'}, **number_style}
    card = go.Figure(go.Indicator(
        mode="number",
        value=value,
        title=title,
        number=number,
    ))
    card.update_layout(width=800, height=300)
    return card


fig1 = _number_card(unique_job_titles_count, "Job Titles")
fig2 = _number_card(unique_job_categories_count, "Job Categories")
fig3 = _number_card(total_salaries_2023, "Total Salaries in 2023", prefix="$")

# Display the cards in the same order they were built.
for card in (fig1, fig2, fig3):
    card.show()

In [23]:
# Number of rows (job positions) recorded for each work year, ordered by year.
work_year_counts = df['work_year'].value_counts().sort_index()
fig = px.line(
    x=work_year_counts.index,
    y=work_year_counts.values,
    markers=True,
    labels={'x': 'Year', 'y': 'Number of job positions'},
    title='Number of job positions by year'
)
fig.update_traces(line=dict(color='#c63256'))
fig.update_layout(template='plotly_dark')
# Place one tick on each year present in the data (rather than a hard-coded
# 2020-2023 list) so the axis stays correct when the dataset gains or loses years.
fig.update_xaxes(tickvals=work_year_counts.index.tolist())
fig.show()

In [52]:
# Total salary (sum of salary_in_usd) per job title and per job category,
# drawn as two horizontal bar charts side by side.

# 1. Bar plot: Total of salaries by top 10 job titles
df_job_title_USD = df.groupby('job_title')['salary_in_usd'].sum().sort_values(ascending=True).reset_index()
# The totals are sorted ascending, so the ten LARGEST are the last ten rows
# (head(10) would return the ten smallest). The index is reset so the frame
# and the `text` series below line up row for row.
top_10_job_titles = df_job_title_USD.tail(10).reset_index(drop=True)

bar_fig1 = px.bar(
    top_10_job_titles,
    x='salary_in_usd',
    y='job_title',
    orientation='h',
    labels={'salary_in_usd': 'Salary in USD', 'job_title': 'Job Title'},
    title='Total of salaries by top 10 job titles',
    template='plotly_dark',
    # Bar labels are expressed in thousands of dollars (see texttemplate below).
    text=top_10_job_titles['salary_in_usd'] / 1000,
    color='job_title'
)
bar_fig1.update_layout(
    xaxis_title='Total Salary in USD',
    yaxis_title='Job Title',
    showlegend=False
)
bar_fig1.update_traces(texttemplate='$%{text:.0f}K', textposition='inside')

# 2. Bar plot: Total of salaries by top 10 job categories
df_job_category_USD = df.groupby('job_category')['salary_in_usd'].sum().sort_values(ascending=True).reset_index()
# Same selection rule as above: last ten rows of the ascending totals.
top_10_job_categories = df_job_category_USD.tail(10).reset_index(drop=True)
bar_fig2 = px.bar(
    top_10_job_categories,
    x='salary_in_usd',
    y='job_category',
    orientation='h',
    labels={'salary_in_usd': 'Salary in USD', 'job_category': 'Job category'},
    title='Total of salaries by top 10 job categories',
    template='plotly_dark',
    text=top_10_job_categories['salary_in_usd'] / 1000,
    color='job_category'
)
bar_fig2.update_layout(
    xaxis_title='Total Salary in USD',
    yaxis_title='Job Category',
    showlegend=False
)
bar_fig2.update_traces(texttemplate='$%{text:.0f}K', textposition='inside')

fig = make_subplots(rows=1, cols=2, subplot_titles=("Total salaries by top 10 job titles", "Total salaries by top 10 job categories"))

# Column order must follow subplot_titles: job titles on the left (col 1),
# job categories on the right (col 2).
for trace in bar_fig1['data']:
    fig.add_trace(trace, row=1, col=1)

for trace in bar_fig2['data']:
    fig.add_trace(trace, row=1, col=2)

# Only the traces are copied into the combined figure, so the axis titles set
# on bar_fig1 / bar_fig2 are applied again here.
fig.update_xaxes(title_text='Total Salary in USD')
fig.update_yaxes(title_text='Job Title', row=1, col=1)
fig.update_yaxes(title_text='Job Category', row=1, col=2)

fig.update_layout(
    height=1000,
    width = 2000,
    template='plotly_dark',
    showlegend=False
)
fig.show()

In [33]:
# Salary by job category, two views side by side: mean salary split by
# experience level (grouped bars) and the salary distribution (box plot).

# 1. Bar plot: Average Salary per Job Category and Experience Level

# Mean salary for every (job_category, experience_level) pair, highest first,
# rounded to whole dollars.
avg_salary = df.groupby(['job_category', 'experience_level'])['salary_in_usd'].mean().sort_values(ascending=False).round().reset_index()

bar_fig = px.bar(
    avg_salary,
    x='job_category',
    y='salary_in_usd',
    color='experience_level',
    barmode='group',
    labels={'salary_in_usd': 'Average Salary in USD', 'job_category': 'Job Category', 'experience_level': 'Experience Level'},
    title='Average Salary per Job Category and Experience Level',
    template='plotly_dark',
    # Bar labels are the means divided by 1000, matching the "K" in texttemplate.
    text=avg_salary['salary_in_usd'] / 1000
)
bar_fig.update_layout(
    width=1200,
    height=800,
    yaxis_title='Average Salary in USD',
    xaxis_title='Job Category'
)
# The axis values are whole dollars, so only a "$" prefix and thousands
# separators are applied; a "K" suffix would turn 150,000 into "$150,000K".
bar_fig.update_yaxes(tickprefix='$', tickformat=',.0f')
bar_fig.update_traces(texttemplate='$%{text:.0f}K', textposition='inside')

# 2. Box plot: Salary distribution per job category

box_fig = px.box(
    df,
    x='job_category',
    y='salary_in_usd',
    labels={'salary_in_usd': 'Salary in USD', 'job_category': 'Job Category'},
    title='Salary distribution per job category',
    template='plotly_dark'
)
box_fig.update_layout(
    width=1000,
    height=600
)

fig = make_subplots(rows=1, cols=2, subplot_titles=("Average Salary per Job Category and Experience Level", "Salary distribution per job category"))

for trace in bar_fig['data']:
    fig.add_trace(trace, row=1, col=1)

for trace in box_fig['data']:
    fig.add_trace(trace, row=1, col=2)

# Only traces are copied into the combined figure; axis formatting made on
# bar_fig does not come along, so the dollar formatting is set here for both
# panels (both y axes are salaries in USD).
fig.update_yaxes(tickprefix='$', tickformat=',.0f')

fig.update_layout(
    height=800,
    width=1700,
    template='plotly_dark',
    # Bar colour is the only encoding of experience level, so the legend is
    # kept visible and given the same title used in the bar chart's labels.
    showlegend=True,
    legend_title_text='Experience Level'
)
fig.show()


In [27]:
# Mean salary (USD) for each company location, rounded to two decimals and
# ordered from the highest mean to the lowest.
avg_salary_per_country = (
    df.groupby('company_location')['salary_in_usd']
    .mean()
    .reset_index()
    .round(2)
    .sort_values(by='salary_in_usd', ascending=False)
)

# Bar labels show the mean in thousands of dollars.
salary_in_thousands = avg_salary_per_country['salary_in_usd'] / 1000

fig = px.bar(
    avg_salary_per_country,
    x='company_location',
    y='salary_in_usd',
    text=salary_in_thousands,
    title='Average Salary in Data per country',
    labels={'salary_in_usd': 'Average Salary in USD', 'company_location': 'Country'},
    template='plotly_dark',
)
# Large canvas with slanted country labels.
fig.update_layout(width=1500, height=1000, xaxis_tickangle=-45)
fig.update_traces(textposition='inside', texttemplate='$%{text:.0f}K')
fig.show()

In [28]:
# Two count-based views shown side by side: the ten most frequent job titles
# and the number of positions per company location.

# 1. Bar plot: Number of positions by top 10 Job titles
df_job_title_count = (
    df['job_title']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
# Name the two columns explicitly rather than relying on the defaults.
df_job_title_count.columns = ['job_title', 'count']
top_10_job_titles_count = df_job_title_count.head(10)

bar_fig1 = px.bar(
    top_10_job_titles_count,
    x='count',
    y='job_title',
    text='count',
    color='job_title',
    orientation='h',
    title='Number of positions by top 10 Job titles',
    labels={'count': 'Number of job positions', 'job_title': 'Job Title'},
    template='plotly_dark',
)
bar_fig1.update_layout(
    showlegend=False,
    xaxis_title='Number of Positions',
    yaxis_title='Job Title',
    width=1200,
    height=600,
)

# 2. Bar plot: Number of job positions per country
country_counts = df['company_location'].value_counts().reset_index()
country_counts.columns = ['company_location', 'count']
# Keep only locations with at least 10 positions.
has_enough_positions = country_counts['count'] >= 10
filtered_country_counts = country_counts[has_enough_positions]

bar_fig2 = px.bar(
    filtered_country_counts,
    x='company_location',
    y='count',
    text='count',
    title='Number of job positions per country',
    labels={'count': 'Number of Positions', 'company_location': 'Country'},
    template='plotly_dark',
)
bar_fig2.update_layout(width=1000, height=600)

# Combined figure: job titles in the left panel, countries in the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Number of positions by top 10 Job titles",
        "Number of job positions per country",
    ),
)

# Copy each source figure's traces into its panel (column 1, then column 2).
for column, source in enumerate((bar_fig1, bar_fig2), start=1):
    for trace in source['data']:
        fig.add_trace(trace, row=1, col=column)

fig.update_layout(
    template='plotly_dark',
    showlegend=False,
    width=1700,
    height=800,
)
fig.show()
