In [1]:
# Importing libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Fetch the 'lukebarousse/data_jobs' dataset through the `datasets` library
# and convert its 'train' split into a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Parse the posting date column with pd.to_datetime so it is stored as
# datetimes (this overwrites the column in place).
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

  from .autonotebook import tqdm as notebook_tqdm


### Calculate the projected average salary for next year

In [2]:
# Select the salary column, keeping only the rows where it is not NaN
df.loc[df['salary_year_avg'].notna(), 'salary_year_avg']

### Equivalent two-step form: filter the rows first, then pick the column
# df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [3]:
# Build a separate frame holding only the rows that have a salary, so the
# calculations below never see NaN.
# .copy() keeps df_salary independent of df, so adding columns to it later
# does not touch the original frame.
df_salary = df.dropna(subset=['salary_year_avg']).copy()
# Creating function for inflation raises
def projected_salary(salary, growth_factor=1.03):
    """Return ``salary`` scaled by a yearly raise.

    Args:
        salary: Current yearly salary (a number; also works element-wise
            when the function is passed to ``Series.apply``).
        growth_factor: Multiplier applied to the salary. Defaults to 1.03,
            i.e. a 3% raise.

    Returns:
        ``salary * growth_factor``.
    """
    return salary * growth_factor

# Call projected_salary on every value of salary_year_avg via Series.apply and
# store the results in a new salary_year_inflated column for comparison
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

# Show the original and adjusted salaries side by side
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [4]:
# Same calculation as the previous cell, but with an inline lambda instead of a
# named function; this overwrites salary_year_inflated with salary * 1.03
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

# Check output: the original and adjusted salaries side by side
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [5]:
### Because this is a simple case, the single vectorized line below gives exactly the same result with less code

# df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03

### Same thing to show result

# df_salary[['salary_year_avg', 'salary_year_inflated']]

In [6]:
# Preview the first rows of the job_skills column
df['job_skills'].head()

0                                                 None
1    ['r', 'python', 'sql', 'nosql', 'power bi', 't...
2    ['python', 'sql', 'c#', 'azure', 'airflow', 'd...
3    ['python', 'c++', 'java', 'matlab', 'aws', 'te...
4    ['bash', 'python', 'oracle', 'aws', 'ansible',...
Name: job_skills, dtype: object

In [7]:
### Parse a single job_skills string into a real Python list
import ast

ast.literal_eval(df.loc[1, 'job_skills'])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [8]:
def clean_list(skill_list):
    """Convert one ``job_skills`` cell value into a Python list.

    Args:
        skill_list: The cell value. Expected to be the string form of a
            list (e.g. ``"['python', 'sql']"``), a missing value
            (``None``/``NaN``), or a list that was already parsed.

    Returns:
        The parsed list for a string input, the same list for a list input,
        or ``None`` for a missing value.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            string is not a valid Python literal.
    """
    # A list means this value was parsed already (e.g. the cell was re-run);
    # return it untouched instead of failing in pd.notna / literal_eval.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        # literal_eval only evaluates Python literals, never arbitrary code.
        return ast.literal_eval(skill_list)
    return None



# Run clean_list on every job_skills value and overwrite the column in place
df['job_skills'] = df['job_skills'].apply(clean_list)

### Lambda version of function
### (unlike clean_list, its else branch returns the missing value itself rather than None)
# df['job_skills'] = df['job_skills'].apply(lambda skill_list: ast.literal_eval(skill_list) if pd.notna(skill_list) else skill_list)

In [9]:
# Confirm the value at index label 1 is now a list rather than a string
type(df.loc[1, 'job_skills'])

list

In [None]:
### Creating function where "Senior" positions receive 5% yearly raise and base positions receive 3% yearly raise

def projected_salary(row, senior_factor=1.05, base_factor=1.03):
    """Return a row's yearly salary after next year's raise.

    Intended for ``DataFrame.apply(..., axis=1)``; ``row`` only needs to
    support lookup by the two keys below.

    Args:
        row: Mapping-like row providing ``'job_title_short'`` and
            ``'salary_year_avg'``.
        senior_factor: Multiplier used when the title contains "Senior".
            Defaults to 1.05 (5% raise).
        base_factor: Multiplier used for every other title. Defaults to
            1.03 (3% raise).

    Returns:
        ``row['salary_year_avg']`` multiplied by the matching factor.
    """
    title = row['job_title_short']
    # A missing title (None/NaN) is not a string; testing `"Senior" in` on it
    # would raise TypeError, so such rows get the base raise.
    is_senior = isinstance(title, str) and "Senior" in title
    return row['salary_year_avg'] * (senior_factor if is_senior else base_factor)


# axis=1 passes each row to projected_salary, since the raise depends on both
# the job title and the salary
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

# Show the title next to the original and adjusted salaries
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00
