**Ask: What is the most optimal skill to learn for Data Analysts?**

*Methodology*

1. Continue from last notebook to find percent of postings with skill
2. Visualize median salary vs percent skill demand





In [12]:

# Importing Libraries
import ast
import pandas as pd
import seaborn as sns
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['job_skills'] = df['job_skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

In [2]:
#Overview of dataframe
df.describe()

Unnamed: 0,job_posted_date,salary_year_avg,salary_hour_avg
count,785741,22003.0,10662.0
mean,2023-06-25 16:02:11.860248576,123286.274072,47.016598
min,2023-01-01 00:00:04,15000.0,8.0
25%,2023-03-20 10:05:48,90000.0,27.5
50%,2023-06-29 06:11:38,115000.0,45.98
75%,2023-09-27 01:01:16,150000.0,61.159996
max,2023-12-31 23:59:58,960000.0,391.0
std,,48312.449482,21.890738


In [4]:
df['salary_hour_avg'].describe()

count    10662.000000
mean        47.016598
std         21.890738
min          8.000000
25%         27.500000
50%         45.980000
75%         61.159996
max        391.000000
Name: salary_hour_avg, dtype: float64

In [5]:
df['salary_year_avg'].mode()

0    90000.0
Name: salary_year_avg, dtype: float64

In [6]:
df['salary_year_avg'].idxmax()

554784

In [7]:
df.iloc[357614]

job_title_short                                          Data Analyst
job_title                Data Analyst | Financial Sector | Co. Dublin
job_location                                          Dublin, Ireland
job_via                                               via Recruit.net
job_schedule_type                                           Full-time
job_work_from_home                                              False
search_location                                               Ireland
job_posted_date                                   2023-01-17 23:30:25
job_no_degree_mention                                            True
job_health_insurance                                            False
job_country                                                   Ireland
salary_rate                                                      None
salary_year_avg                                                   NaN
salary_hour_avg                                                   NaN
company_name        

In [8]:
#How about the unique count of job titles?
df['job_title_short'].value_counts()

job_title_short
Data Analyst                 196075
Data Engineer                186241
Data Scientist               172286
Business Analyst              49063
Software Engineer             44929
Senior Data Engineer          44563
Senior Data Scientist         36957
Senior Data Analyst           29216
Machine Learning Engineer     14080
Cloud Engineer                12331
Name: count, dtype: int64

In [9]:
#Average yearly salary by job title
df.groupby('job_title_short')['salary_year_avg'].mean()

job_title_short
Business Analyst              91082.612833
Cloud Engineer               111268.453846
Data Analyst                  93841.907854
Data Engineer                130125.604250
Data Scientist               135988.837171
Machine Learning Engineer    126774.315972
Senior Data Analyst          113911.363665
Senior Data Engineer         145840.611624
Senior Data Scientist        154206.292996
Software Engineer            113393.760054
Name: salary_year_avg, dtype: float64

In [10]:
#Median of the salary by job title
df.groupby('job_title_short')['salary_year_avg'].median()

job_title_short
Business Analyst              85000.0
Cloud Engineer                90000.0
Data Analyst                  90000.0
Data Engineer                125000.0
Data Scientist               127500.0
Machine Learning Engineer    106415.0
Senior Data Analyst          111175.0
Senior Data Engineer         147500.0
Senior Data Scientist        155500.0
Software Engineer             99150.0
Name: salary_year_avg, dtype: float64

In [11]:
#Minimun and maximum values of the salary
df.groupby('job_title_short')['salary_year_avg'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
job_title_short,Unnamed: 1_level_1,Unnamed: 2_level_1
Business Analyst,16500.0,387460.0
Cloud Engineer,42000.0,280000.0
Data Analyst,25000.0,650000.0
Data Engineer,15000.0,525000.0
Data Scientist,27000.0,960000.0
Machine Learning Engineer,30000.0,325000.0
Senior Data Analyst,30000.0,425000.0
Senior Data Engineer,35000.0,425000.0
Senior Data Scientist,45000.0,890000.0
Software Engineer,28000.0,375000.0
