## Import Libraries

In [2]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

## Setting the Default Renderer

In [3]:
# Make 'iframe' the default Plotly renderer; every fig.show() call below is
# made without an explicit renderer and therefore uses this default.
pio.renderers.default = 'iframe' 

In [4]:
# Read the cleaned CSV (path is relative to the working directory) into the
# DataFrame that all of the plots below are drawn from.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/DataScientist_cleaned.csv',
)

# Distribution of Salaries

In [9]:
# Histogram of the values found in the 'Salary Estimate' column
fig = px.histogram(
    data_frame=df,
    x='Salary Estimate',
    title='Distribution of Salaries',
)
fig.show()

# Top 10 Most Frequently Used Words in the Job Descriptions

In [10]:
# Top 10 most frequently used words across all job descriptions
# (no stop-word filtering, so common function words are included).
from collections import Counter
import re

# Combine all job descriptions into one string. Missing values are dropped and
# the remaining entries coerced to str first: str.join raises TypeError on the
# float NaN that pandas uses for empty cells.
all_descriptions = ' '.join(df['Job Description'].dropna().astype(str))

# Tokenize on word boundaries (case-insensitively) and keep the ten most
# frequent tokens as (word, count) pairs.
words = re.findall(r'\b\w+\b', all_descriptions.lower())
common_words = Counter(words).most_common(10)

# Convert to DataFrame for plotting
common_words_df = pd.DataFrame(common_words, columns=['word', 'count'])

fig = px.bar(common_words_df, x='word', y='count', title='Top 10 Most Frequently Used Words')
fig.show()



## Top 10 Most Frequently Used Words in Job Descriptions (Excluding Common Words)

In [12]:
# Words to leave out of the frequency count (articles, prepositions, pronouns
# and other common function words).
stop_words = {
    'and', 'the', 'to', 'in', 'a', 'with', 'an', 'for', 'of', 'on', 'at',
    'by', 'from', 'about', 'as', 'into', 'like', 'through', 'after', 'over',
    'between', 'out', 'against', 'during', 'without', 'before', 'under',
    'other', 'this', 'around', 'are', 'be', 'that', 'or', 'is', 'you', 'we',
    'will', 'our', 'among',
}

# Combine all job descriptions into a single lowercase string. Missing values
# are dropped and the rest coerced to str, because str.join raises TypeError
# on the float NaN that pandas uses for empty cells.
all_descriptions = ' '.join(df['Job Description'].dropna().astype(str)).lower()

# Delete apostrophes so contractions stay a single token ("you'll" -> "youll"),
# then turn every other non-letter into a space. Replacing with a space rather
# than deleting keeps words that are separated only by punctuation apart
# ("python/sql" -> "python sql" instead of "pythonsql").
all_descriptions = re.sub(r"['’]", '', all_descriptions)
all_descriptions = re.sub(r'[^a-z\s]', ' ', all_descriptions)

# Split on whitespace and drop the stop words
words = [word for word in all_descriptions.split() if word not in stop_words]

# Count words and keep the ten most frequent as (word, count) pairs
word_counts = Counter(words)
common_words = word_counts.most_common(10)

# Convert to DataFrame for plotting
common_words_df = pd.DataFrame(common_words, columns=['word', 'count'])

fig = px.bar(common_words_df, x='word', y='count', title='Top 10 Most Frequently Used Words in Job Descriptions')
fig.show()

# Correlation between Ratings and Salaries

In [8]:
# Scatter plot of the 'Rating' column (x) against 'Salary Estimate' (y)
fig = px.scatter(
    data_frame=df,
    x='Rating',
    y='Salary Estimate',
    title='Correlation between Ratings and Salaries',
)
fig.show()

# Correlation between Employee Number and Salaries

In [9]:
# Scatter plot of the 'Size' column (x) against 'Salary Estimate' (y)
fig = px.scatter(
    data_frame=df,
    x='Size',
    y='Salary Estimate',
    title='Correlation between Employee Number and Salaries',
)
fig.show()