## Bar Graph of Count of Selected Skills in Job Postings for Top 3 Job Titles
This graph shows the count of selected skills (SQL, Excel, Python, Tableau, R, Power BI) in job postings for the job titles 'Data Analyst', 'Data Engineer', and 'Data Scientist'.

## Bar Graph of Top 3 Job Titles
This graph visualizes the three most popular job titles in the job listings, based on their frequency.

## Bar Graph of Top 5 Skills for Top 3 Job Titles
This graph displays the top 5 skills for each of the top 3 job titles based on their frequency in the job listings.

In [None]:
\list

In [None]:
-- Preview the raw keywords_all column for up to 100 job listings.
SELECT
    keywords_all
FROM
    public_job_listings.data_nerd_jobs
LIMIT 100;

## Extracting Keywords from JSON
Extracting the entire `keywords_all` column and processing it in Python due to SQL extraction challenges.

In [None]:
-- Pull the keywords_all column as-is (up to 100 rows) so it can be
-- inspected or processed outside SQL.
SELECT
    keywords_all
FROM
    public_job_listings.data_nerd_jobs
LIMIT
    100;

In [None]:
-- Flatten keywords_all.list so that every list element becomes its own row,
-- and show the first 5 unpacked values.
SELECT
    list.element AS unpacked_keywords
FROM
    public_job_listings.data_nerd_jobs
CROSS JOIN
    UNNEST(keywords_all.list) AS list
LIMIT 5;

## Bar Graph of Top 10 Skills
This graph visualizes the frequency of the top 10 skills found in the job listings.

In [None]:
-- Count how many times each keyword appears across all listings
-- and keep the 10 most frequent ones.
SELECT
    list.element AS skill,
    COUNT(*)     AS frequency
FROM
    public_job_listings.data_nerd_jobs
CROSS JOIN
    UNNEST(keywords_all.list) AS list
GROUP BY
    list.element
ORDER BY
    frequency DESC
LIMIT 10;

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Result of the previous SQL query; the plot below reads its
# 'skill' and 'frequency' columns.
skill_data = sql_df_jtsj

# Horizontal bar chart: one bar per skill, bar length = frequency.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=skill_data, x='frequency', y='skill', palette='Blues_r')
ax.set_title('Top 10 Skills in Job Listings')
ax.set_xlabel('Frequency')
ax.set_ylabel('Skill')
plt.show()

In [None]:
-- Number of listings per job title, limited to the 3 most common titles.
SELECT
    job_title_final,
    COUNT(*) AS job_count
FROM
    public_job_listings.data_nerd_jobs
GROUP BY
    job_title_final
ORDER BY
    job_count DESC
LIMIT 3;

In [None]:
-- For the three job titles with the most listings, count how often each
-- keyword appears in their postings.
WITH top_titles AS (
    -- The three most frequent job titles.
    SELECT
        job_title_final
    FROM
        public_job_listings.data_nerd_jobs
    GROUP BY
        job_title_final
    ORDER BY
        COUNT(*) DESC
    LIMIT 3
)
SELECT
    tt.job_title_final,
    list.element AS skill,
    COUNT(*)     AS frequency
FROM
    public_job_listings.data_nerd_jobs
INNER JOIN
    top_titles AS tt
    ON data_nerd_jobs.job_title_final = tt.job_title_final
CROSS JOIN
    UNNEST(keywords_all.list) AS list
GROUP BY
    tt.job_title_final,
    list.element
ORDER BY
    tt.job_title_final,
    frequency DESC;

In [None]:
import pandas as pd

# Keep the 5 most frequent skills for each job title.
# sql_df_wjqm is read with columns 'job_title_final', 'skill' and 'frequency'
# (presumably the result of the per-title skill-frequency SQL query — verify).
#
# Sorting and then taking the head of each group gives the same rows as
# groupby(...).apply(nlargest) but does not rely on the grouping column being
# passed into apply(), which pandas deprecated in 2.2.
transformed_data = (
    sql_df_wjqm
    .sort_values(['job_title_final', 'frequency'], ascending=[True, False])
    .groupby('job_title_final')
    .head(5)
    .reset_index(drop=True)
)

# Grouped horizontal bars: one bar per (skill, job title) pair.
plt.figure(figsize=(12, 8))
sns.barplot(x='frequency', y='skill', hue='job_title_final', data=transformed_data, palette='Blues_r')
plt.title('Top 5 Skills for Top 3 Job Titles')
plt.xlabel('Frequency')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
# Express each skill's frequency as a percentage of the total frequency of the
# rows kept for the same job title.
#
# transform('sum') returns a Series aligned with transformed_data's own index.
# The previous groupby(...).apply(...) form prepends the group key to the
# result index on pandas >= 2.0, which makes the column assignment fail.
title_totals = transformed_data.groupby('job_title_final')['frequency'].transform('sum')
transformed_data['percent'] = transformed_data['frequency'] / title_totals * 100

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=transformed_data, palette='Blues_r')
plt.title('Top 5 Skills for Top 3 Job Titles (Percentage)')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
import pandas as pd

# Keep the 5 most frequent skills for each job title. Sort + groupby.head is
# used instead of groupby.apply(nlargest), which depends on the grouping column
# being passed into apply() (deprecated since pandas 2.2).
transformed_data = (
    sql_df_wjqm
    .sort_values(['job_title_final', 'frequency'], ascending=[True, False])
    .groupby('job_title_final')
    .head(5)
    .reset_index(drop=True)
)

# Pad job titles that have fewer than 5 skills with zero-frequency placeholder
# rows. The rows are collected first and concatenated once, because
# DataFrame.append was removed in pandas 2.0.
job_titles = transformed_data['job_title_final'].unique()
rows_per_title = transformed_data['job_title_final'].value_counts()
padding_rows = [
    {'job_title_final': title, 'skill': 'No additional skill', 'frequency': 0}
    for title in job_titles
    for _ in range(5 - int(rows_per_title[title]))
]
if padding_rows:
    transformed_data = pd.concat(
        [transformed_data, pd.DataFrame(padding_rows)],
        ignore_index=True,
    )

# Percentage of each skill within its job title. transform('sum') keeps the
# frame's own index, so the assignment also works on pandas >= 2.0.
title_totals = transformed_data.groupby('job_title_final')['frequency'].transform('sum')
transformed_data['percent'] = transformed_data['frequency'] / title_totals * 100

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=transformed_data, palette='Blues_r')
plt.title('Top 5 Skills for Top 3 Job Titles (Percentage)')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
# Restrict the per-title skill rows to a fixed list of skills of interest.
selected_skills = ['sql', 'excel', 'python', 'r', 'tableau', 'power bi']
# .copy() makes filtered_data an independent frame, so adding a column below
# does not trigger SettingWithCopyWarning.
filtered_data = transformed_data[transformed_data['skill'].isin(selected_skills)].copy()

# Percentage of each selected skill within its job title, computed over the
# selected skills only. transform('sum') stays aligned with filtered_data's
# index; assigning the result of groupby(...).apply(...) fails on pandas >= 2.0.
title_totals = filtered_data.groupby('job_title_final')['frequency'].transform('sum')
filtered_data['percent'] = filtered_data['frequency'] / title_totals * 100

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=filtered_data, palette='Blues_r')
plt.title('Selected Skills for Top 3 Job Titles (Percentage)')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
import pandas as pd

# Make sure every (job title, selected skill) pair has a row, adding
# zero-frequency rows for the missing pairs. job_titles, selected_skills and
# filtered_data are read from earlier cells.
# Missing rows are collected and concatenated once, because DataFrame.append
# was removed in pandas 2.0.
existing_pairs = set(zip(filtered_data['job_title_final'], filtered_data['skill']))
missing_rows = [
    {'job_title_final': title, 'skill': skill, 'frequency': 0}
    for title in job_titles
    for skill in selected_skills
    if (title, skill) not in existing_pairs
]
if missing_rows:
    filtered_data = pd.concat(
        [filtered_data, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )

# Percentage of each skill within its job title. transform('sum') keeps the
# frame's index (groupby.apply assignment breaks on pandas >= 2.0), and
# assign() returns a new frame, so no slice of another frame is written to.
title_totals = filtered_data.groupby('job_title_final')['frequency'].transform('sum')
filtered_data = filtered_data.assign(percent=filtered_data['frequency'] / title_totals * 100)

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=filtered_data, palette='Blues_r')
plt.title('Selected Skills for Top 3 Job Titles (Percentage)')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
-- The 3 job titles with the most listings, with their listing counts.
SELECT
    job_title_final,
    COUNT(*) AS count
FROM
    public_job_listings.data_nerd_jobs
GROUP BY
    job_title_final
ORDER BY
    count DESC
LIMIT
    3;

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Result of the previous SQL query; the plot below reads its
# 'job_title_final' and 'count' columns.
job_title_data = sql_df_mdrq

# Horizontal bar chart: one bar per job title, bar length = listing count.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=job_title_data, x='count', y='job_title_final', palette='Blues_r')
ax.set_title('Top 3 Job Titles in Job Listings')
ax.set_xlabel('Count')
ax.set_ylabel('Job Title')
plt.show()

In [None]:
-- Keyword frequency per job title, restricted to the three job titles
-- that have the most listings.
WITH top_titles AS (
    -- Rank titles by listing count and keep the top three.
    SELECT job_title_final
    FROM public_job_listings.data_nerd_jobs
    GROUP BY job_title_final
    ORDER BY COUNT(*) DESC
    LIMIT 3
)
SELECT
    tt.job_title_final,
    list.element AS skill,
    COUNT(*) AS frequency
FROM public_job_listings.data_nerd_jobs
INNER JOIN top_titles AS tt
    ON data_nerd_jobs.job_title_final = tt.job_title_final
CROSS JOIN UNNEST(keywords_all.list) AS list
GROUP BY
    tt.job_title_final,
    list.element
ORDER BY
    tt.job_title_final,
    frequency DESC;

In [None]:
# Percentage of each skill within its job title, over all skills returned by
# the SQL query.
# NOTE: skill_data is the same object as sql_df_xley (no copy is made), so the
# 'percent' column is added to sql_df_xley as well.
skill_data = sql_df_xley
# transform('sum') returns totals aligned with skill_data's index. Assigning
# the result of groupby(...).apply(...) fails on pandas >= 2.0 because the
# group key is prepended to the result index.
title_totals = skill_data.groupby('job_title_final')['frequency'].transform('sum')
skill_data['percent'] = skill_data['frequency'] / title_totals * 100

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=skill_data, palette='Blues_r')
plt.title('Most Demanded Skills in Percentage for Top 3 Job Titles')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
# Keep the 6 skills with the highest percentage for each job title.
# Sort + groupby.head selects the same rows as groupby.apply(nlargest) without
# depending on the grouping column being passed into apply(), which pandas
# deprecated in 2.2.
top_skills_data = (
    skill_data
    .sort_values(['job_title_final', 'percent'], ascending=[True, False])
    .groupby('job_title_final')
    .head(6)
    .reset_index(drop=True)
)

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=top_skills_data, palette='Blues_r')
plt.title('Top 6 Most Demanded Skills in Percentage for Top 3 Job Titles')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
# Skills of interest; only rows of top_skills_data whose skill is in this list
# are plotted.
selected_skills = ['sql', 'excel', 'python', 'r', 'tableau', 'power bi']
is_selected = top_skills_data['skill'].isin(selected_skills)
filtered_skills_data = top_skills_data[is_selected]

# Grouped horizontal bars of the percentage already stored in 'percent'
# (it is not recomputed for the filtered subset here).
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    data=filtered_skills_data,
    x='percent',
    y='skill',
    hue='job_title_final',
    palette='Blues_r',
)
ax.set_title('Selected Skills in Percentage for Top 3 Job Titles')
ax.set_xlabel('Percentage')
ax.set_ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
import pandas as pd

# Make sure every (job title, selected skill) pair has a row, adding
# zero-frequency rows for the missing pairs. job_titles is read from an
# earlier cell; selected_skills and filtered_skills_data come from the
# preceding filter step.
# Missing rows are collected and concatenated once, because DataFrame.append
# was removed in pandas 2.0.
existing_pairs = set(
    zip(filtered_skills_data['job_title_final'], filtered_skills_data['skill'])
)
missing_rows = [
    {'job_title_final': title, 'skill': skill, 'frequency': 0}
    for title in job_titles
    for skill in selected_skills
    if (title, skill) not in existing_pairs
]
if missing_rows:
    filtered_skills_data = pd.concat(
        [filtered_skills_data, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )

# Recompute the percentage of each skill within its job title over the rows
# present here. transform('sum') keeps the frame's index (groupby.apply
# assignment breaks on pandas >= 2.0), and assign() returns a new frame, so no
# slice of top_skills_data is written to.
title_totals = filtered_skills_data.groupby('job_title_final')['frequency'].transform('sum')
filtered_skills_data = filtered_skills_data.assign(
    percent=filtered_skills_data['frequency'] / title_totals * 100
)

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='percent', y='skill', hue='job_title_final', data=filtered_skills_data, palette='Blues_r')
plt.title('Selected Skills in Percentage for Top 3 Job Titles (Adjusted)')
plt.xlabel('Percentage')
plt.ylabel('Skill')
plt.legend(title='Job Title')
plt.show()

In [None]:
-- Count postings per (job title, skill) for three fixed job titles and six
-- selected skills.
-- CROSS JOIN UNNEST is used, as in the other queries over this table, instead
-- of a bare JOIN with no join condition.
SELECT
    job_title_final,
    list.element AS skill,
    COUNT(*) AS count
FROM public_job_listings.data_nerd_jobs
CROSS JOIN UNNEST(keywords_all.list) AS list
WHERE job_title_final IN ('Data Analyst', 'Data Engineer', 'Data Scientist')
  AND list.element IN ('sql', 'excel', 'python', 'tableau', 'r', 'power bi')
GROUP BY job_title_final, list.element
ORDER BY job_title_final, list.element;

## Bar Graph of Most Demanded Skills in Percentage for Top 3 Job Titles
This graph shows the distribution of skills in percentage for the top three job titles, providing insights into the most demanded skills for each title.

In [None]:
# NOTE(review): skill_count_data is not defined in the visible cells. This
# cell reads its 'skill', 'count', 'job_title_final' and 'percent' columns —
# confirm that an earlier cell creates it with a 'percent' column.

# Total count of each skill across all job titles.
total_skill_counts = (
    skill_count_data
    .groupby('skill')['count']
    .sum()
    .reset_index()
)

# Attach the per-skill total as 'count_total' and order the rows so that the
# skills with the highest total come first.
merged_data = (
    skill_count_data
    .merge(total_skill_counts, on='skill', suffixes=('', '_total'))
    .sort_values(by='count_total', ascending=False)
)

# Grouped horizontal bars of 'percent' per skill and job title.
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    data=merged_data,
    x='percent',
    y='skill',
    hue='job_title_final',
    palette='Blues_r',
)
ax.set_title('Percentage of Selected Skills in Job Postings (Ordered by Total Demand)')
ax.set_xlabel('Percentage')
ax.set_ylabel('Skill')
plt.legend(title='Job Title')
plt.show()