In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# pandas decodes CSV files as UTF-8 by default. If this call raises a
# UnicodeDecodeError, pass an explicit encoding to read_csv,
# e.g. encoding='latin1', 'ISO-8859-1' or 'cp1252'.
df = pd.read_csv('../data/raw/ai_job_dataset.csv')

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print pandas' summary of the frame (column dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Show the Python type of `df`, the object returned by pd.read_csv in the first cell.
type(df)

In [None]:
# Show the row index of the frame.
df.index

In [None]:
# List the column labels of the frame.
print(df.columns)

In [None]:
# Only the last expression of a cell is rendered, so the duplicate count is
# printed explicitly; as a bare expression its result is silently discarded.
print(f"Duplicate rows: {df.duplicated().sum()}")

# Number of missing values in each column.
df.isna().sum()

In [None]:
def clean_all_columns(df):
    """Return a copy of ``df`` with every column normalized to upper-case text.

    For each column the values are converted to strings, stripped of
    surrounding whitespace and upper-cased. A cell is set to NaN when it

    * was already missing in the input (NaN, None, NaT, pd.NA),
    * equals the column's own label (stripped and upper-cased), i.e. a header
      row repeated inside the data, or
    * is the literal text 'NAN' after normalization.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns; every column holds
        strings or NaN (numeric columns are converted to text as well).
    """
    # Local import keeps the function self-contained within its cell.
    import numpy as np

    # Work on a copy so the caller's frame stays intact and the cell can be re-run.
    cleaned = df.copy()

    for column in cleaned.columns:
        values = cleaned[column]

        # Remember real missing values before astype(str) turns them into
        # text such as 'None', 'NaT' or '<NA>'.
        was_missing = values.isna()

        text = values.astype(str).str.strip().str.upper()

        # str() lets non-string labels (e.g. integer column names) through;
        # strip() mirrors the normalization applied to the values.
        header_text = str(column).strip().upper()

        placeholder = was_missing | text.isin([header_text, 'NAN'])
        cleaned[column] = text.mask(placeholder, np.nan)

    return cleaned

# Normalize every column of the loaded frame; `df` is rebound to the frame
# returned by clean_all_columns, so later cells see the cleaned values.
df = clean_all_columns(df)

# The next cell gives an example of checking the unique values in a specific column.


In [None]:
# Alphabetical list of the distinct, non-missing job titles.
distinct_titles = df['job_title'].dropna().unique()
print(sorted(distinct_titles))

In [None]:
# Normalize the column labels: trim, lower-case, and collapse every run of
# whitespace or '/' into a single underscore.
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(r'[\s/]+', '_', regex=True)
)

In [None]:
# First rows of the frame, as plain text.
print(df.head())

# The five columns with the largest fraction of missing values
# (fractions rounded to two decimals before sorting).
missing_fraction = df.isna().mean().round(2)
print(missing_fraction.sort_values(ascending=False).head(5))

In [None]:
# Number of rows per job title, as a two-column frame (job_title, postings).
postings_per_title = df.groupby('job_title').size().reset_index(name='postings')

# Keep at most the 100 titles with the most postings, most frequent first.
top_roles = postings_per_title.sort_values(by='postings', ascending=False).head(100)
print(top_roles)

In [None]:
# The 10 most common job titles with their posting counts.
# rename_axis names the index before reset_index, so the resulting column is
# called 'job_title' on every pandas version (older releases would otherwise
# produce a column named 'index').
top_roles = (
    df['job_title']
    .value_counts()
    .head(10)
    .rename_axis('job_title')
    .reset_index(name='postings')
)

# Horizontal bar chart, drawn on an explicit Axes (sns / plt come from the
# import cell at the top of the notebook).
fig, ax = plt.subplots()
sns.barplot(data=top_roles, y='job_title', x='postings', palette='viridis', ax=ax)
ax.set_title('Top 10 Advertised AI Roles (Oct 2024 – Jul 2025)')
ax.set_xlabel('Number of Job Postings')
ax.set_ylabel('Job Title')
fig.tight_layout()
plt.show()



In [None]:
# The (up to) 10 most common experience levels with their posting counts.
# rename_axis names the index before reset_index, so the resulting column is
# called 'experience_level' on every pandas version (older releases would
# otherwise produce a column named 'index').
top_levels = (
    df['experience_level']
    .value_counts()
    .head(10)
    .rename_axis('experience_level')
    .reset_index(name='postings')
)

# Horizontal bar chart, drawn on an explicit Axes (sns / plt come from the
# import cell at the top of the notebook).
fig, ax = plt.subplots()
sns.barplot(data=top_levels, y='experience_level', x='postings', palette='viridis', ax=ax)
ax.set_title('Top 10 Experience Levels (Oct 2024 – Jul 2025)')
ax.set_xlabel('Number of Job Postings')
ax.set_ylabel('Experience Level')
fig.tight_layout()
plt.show()



In [None]:
# The 10 company locations with the most postings (most frequent first),
# and the rows that belong to them.
top_locations = df['company_location'].value_counts().head(10).index
filtered_df = df[df['company_location'].isin(top_locations)]

# Postings per location, split by job title. `order` lists the locations in
# the same frequency order that was used to select them.
fig, ax = plt.subplots()
sns.countplot(
    data=filtered_df,
    y='company_location',
    hue='job_title',
    order=list(top_locations),
    palette='Set2',
    ax=ax,
)
ax.set_title('Job Title Distribution by Top 10 Locations')
ax.set_xlabel('Number of Job Postings')
ax.set_ylabel('Company Location')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Job Title', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:


# Coerce salary_usd to numbers: drop every character that is not a digit or a
# dot, then parse; anything that still cannot be parsed becomes NaN.
salary_text = df['salary_usd'].astype(str).str.replace(r'[^\d.]', '', regex=True)
df['salary_usd'] = pd.to_numeric(salary_text, errors='coerce')

# Median salary per company location, highest median first.
country_salary_pivot = df.pivot_table(
    index='company_location',
    values='salary_usd',
    aggfunc='median',
).sort_values('salary_usd', ascending=False)

# Bar chart of the 10 locations with the highest median salary.
top_paying = country_salary_pivot.head(10).reset_index()
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_paying, y='company_location', x='salary_usd', palette='viridis', ax=ax)
ax.set_title("Top 10 Countries by Median AI Salary (USD)")
ax.set_xlabel("Median Salary (USD)")
ax.set_ylabel("Company Location")
fig.tight_layout()
plt.show()


In [None]:
# Median salary for every (company location, experience level) pair,
# rounded to whole dollars.
pivot_exp_country = df.pivot_table(
    index='company_location',
    columns='experience_level',
    values='salary_usd',
    aggfunc='median',
).round(0)

# Annotated heatmap: one row per location, one column per experience level.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    pivot_exp_country,
    annot=True,
    fmt=".0f",
    cmap="YlOrBr",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Median Salary (USD) by Country and Experience Level")
ax.set_xlabel("Experience Level")
ax.set_ylabel("Company Location")
fig.tight_layout()
plt.show()


In [None]:

# ─────────────────────────────────────────────────────────────
#  Restrict to the most frequent job titles and countries
# ─────────────────────────────────────────────────────────────
# NOTE(review): `subset` was not defined in any earlier cell, so this cell
# raised NameError on a fresh kernel. It is rebuilt here from the chart title
# ("Top Job Titles × Top Countries"); the cut-off of 10 is an assumption — confirm.
TOP_N = 10
top_titles = df['job_title'].value_counts().head(TOP_N).index
top_countries = df['company_location'].value_counts().head(TOP_N).index
subset = df[
    df['job_title'].isin(top_titles)
    & df['company_location'].isin(top_countries)
]

# ─────────────────────────────────────────────────────────────
#  Build pivot: rows = Job Title, cols = Country, values = median salary
# ─────────────────────────────────────────────────────────────
pivot = pd.pivot_table(
    subset,
    index='job_title',
    columns='company_location',
    values='salary_usd',
    aggfunc='median'
).round(0)              # whole-dollar medians

# ─────────────────────────────────────────────────────────────
#  Heat-map
# ─────────────────────────────────────────────────────────────
plt.figure(figsize=(14, 6))
sns.heatmap(
    pivot,
    annot=True,
    fmt='.0f',
    cmap='YlGnBu',
    linewidths=.5
)
plt.title('Median Salary (USD) - Top Job Titles × Top Countries')
plt.xlabel('Company Location')
plt.ylabel('Job Title')
plt.tight_layout()
plt.show()


In [None]:
# Median salary for every (industry, experience level) pair, rounded to whole
# dollars. Stored under its own name so the country pivot built in an earlier
# cell is not overwritten.
pivot_exp_industry = pd.pivot_table(
    df,
    index='industry',
    columns='experience_level',
    values='salary_usd',
    aggfunc='median'
).round(0)

# Heatmap: one row per industry, one column per experience level.
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_exp_industry,
            annot=True,
            fmt=".0f",
            cmap="YlOrBr",
            linewidths=0.5)
plt.title("Median Salary (USD) by Industry and Experience Level")
plt.xlabel("Experience Level")
plt.ylabel("Industry")
plt.tight_layout()
plt.show()