# Tech Job Market and Salaries Analysis 

For our final project, we selected the Stack Overflow Developer Survey dataset, 
which contains detailed responses from developers worldwide about their job roles, 
skills, technologies used, and salaries. The dataset is particularly relevant to the 
tech industry, which is a major focus of our group, and it offers insight into the 
current tech job market. It covers topics such as job roles, salary, coding 
activities, education, technology usage, and job satisfaction.<br>

Team Eyy<br>
Members:  
- Julianne Kristine D. Aban 
- Derich Andre G. Arcilla 
- Jennifer Bendoy 
- Richelle Ann C. Candidato 
- Marc Francis B. Gomolon 
- Phoebe Kae A. Plasus

##### Data Preparation

In [None]:
# place code here

##### Exploratory Data Analysis (EDA)

In [None]:
# place code here

##### Data Analysis Techniques

# K-means Clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
 
# K-Means clustering of respondents on education level and coding experience.
# Steps: load -> select features -> coerce/impute -> encode -> scale ->
# elbow plot -> final clustering -> pair plot -> export.

# File path to the cleaned survey data
file_path = 'cleaned_survey_results.csv'
data = pd.read_csv(file_path)

# Columns to drop based on irrelevance or redundancy
# (errors='ignore' keeps the cell runnable when a column is already absent)
columns_to_drop = ['ResponseId', 'Unnamed: 17', 'Currency']  # Excluded JobSat and Salary from this list
data_cleaned = data.drop(columns=columns_to_drop, errors='ignore')

# Selecting relevant columns for clustering
selected_columns = ['EdLevel', 'YearsCode', 'YearsCodePro']
categorical_columns = ['EdLevel']
numeric_columns = ['YearsCode', 'YearsCodePro']
data_numeric = data_cleaned[selected_columns].copy()

# The experience columns are quantities, so they must stay numeric: label-encoding
# them would replace the real values with rank codes (and sort text such as
# '10' before '2'), which distorts the distances K-Means relies on.
# NOTE(review): the two text buckets below are the ones used by raw Stack Overflow
# survey exports; the mapping is a no-op if the cleaned file is already numeric.
years_text_to_number = {'Less than 1 year': 0.5, 'More than 50 years': 51}
for column in numeric_columns:
    mapped = data_numeric[column].map(lambda value: years_text_to_number.get(value, value))
    # Anything that still is not a number becomes NaN and is imputed below.
    data_numeric[column] = pd.to_numeric(mapped, errors='coerce')

# Handling missing values: most frequent value for the categorical column,
# median for the numeric experience columns (robust to the long right tail).
imputer = SimpleImputer(strategy='most_frequent')
numeric_imputer = SimpleImputer(strategy='median')
data_imputed = data_numeric.reset_index(drop=True)
data_imputed[categorical_columns] = imputer.fit_transform(data_imputed[categorical_columns])
data_imputed[numeric_columns] = numeric_imputer.fit_transform(data_imputed[numeric_columns])

# Converting the categorical column to integer codes using Label Encoding.
# NOTE(review): LabelEncoder assigns codes in sorted (alphabetical) order, which is
# not an ordinal ranking of education levels - consider an explicit ordinal mapping.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data_imputed[column] = label_encoders[column].fit_transform(data_imputed[column])

# Scaling the data so every feature contributes equally to Euclidean distance
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_imputed)

# Finding the optimal number of clusters using the Elbow Method.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across environments.
inertia = []
range_k = range(1, 11)
for k in range_k:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    inertia.append(kmeans.inertia_)

# Plotting the Elbow Curve
plt.figure(figsize=(8, 5))
plt.plot(range_k, inertia, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Applying K-Means with the chosen number of clusters
optimal_k = 4  # Adjust based on elbow curve results
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(data_scaled)

# Adding cluster labels to the dataset
data_imputed['Cluster'] = clusters

# Visualizing the clusters using a pair plot
data_imputed['Cluster'] = data_imputed['Cluster'].astype(str)  # Convert cluster labels to strings for visualization
sns.pairplot(data_imputed, hue='Cluster', diag_kind='kde', corner=True)
plt.show()

# Saving the clustered dataset (encoded features + cluster label) to a new CSV file
data_imputed.to_csv('clustered_survey_data.csv', index=False)

# Linear Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression of Salary on experience, education, job satisfaction and role.
# Numeric features are passed through unchanged; categorical features are
# one-hot encoded inside a Pipeline so the encoding is fitted on training data only.

df = pd.read_csv('cleaned_file.csv')

# Normalise header whitespace BEFORE any column is referenced by name; otherwise a
# padded header (e.g. ' Salary') raises KeyError long before the check below runs.
df.columns = df.columns.str.strip()

numeric_features = ['YearsCodePro', 'YearsCode', 'JobSat']
categorical_features = ['EdLevel', 'DevType']
feature_columns = ['YearsCodePro', 'YearsCode', 'EdLevel', 'JobSat', 'DevType']
target_column = 'Salary'
required_columns = feature_columns + [target_column]

print("\nColumns in the DataFrame:")
print(df.columns.tolist())

missing_columns = [name for name in required_columns if name not in df.columns]

if 'Salary' not in df.columns:
    print("The column 'Salary' is not available in the DataFrame. Please check the column names.")
elif missing_columns:
    print(f"Required feature columns are missing from the DataFrame: {missing_columns}")
else:
    print("Missing values in each column:")
    print(df[required_columns].isnull().sum())

    # LinearRegression cannot handle NaN, so keep only fully observed rows.
    df = df.dropna(subset=required_columns)

    X = df[feature_columns]
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_features),
            # handle_unknown='ignore': a rare EdLevel/DevType value that only lands in
            # the test split is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', LinearRegression())])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    # Hold-out metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'\nMean Squared Error: {mse}')
    print(f'R-squared: {r2}')

    # Pair each fitted coefficient with the name of its (possibly one-hot) feature.
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    coefficients = pipeline.named_steps['model'].coef_

    feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    print("\nFeature Importance (Coefficients):")
    print(feature_importance.sort_values(by='Coefficient', ascending=False))

    # Actual vs predicted; the dashed red diagonal marks a perfect prediction.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred)
    plt.xlabel('Actual Salaries')
    plt.ylabel('Predicted Salaries')
    plt.title('Actual vs Predicted Salaries')
    plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')
    plt.show()


# Apriori Algorithm

In [None]:
import pandas as pd #erase if being called already
from mlxtend.frequent_patterns import apriori, association_rules

# Load the cleaned dataset
df_cleaned = pd.read_csv('cleaned_survey_results.csv')


# Apriori Algorithm 1
# Mines frequent combinations of technologies and developer roles, treating each
# survey response as one transaction.

# Define the columns to process (update based on your data)
columns_to_encode = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'DevType'
]

# Create a binary matrix: split each semi-colon-separated column into 0/1 indicator
# columns, then join all parts in a single concat (columns missing from the file
# are skipped).
# NOTE(review): indicator columns are not prefixed with their source column, so an
# item name that occurs in two source columns would yield duplicate labels - verify.
encoded_parts = [
    df_cleaned[col].str.get_dummies(sep=';')
    for col in columns_to_encode
    if col in df_cleaned.columns
]
binary_df = pd.concat(encoded_parts, axis=1) if encoded_parts else pd.DataFrame()

# Convert the binary matrix to bool type (the dtype apriori expects)
binary_df_bool = binary_df.astype(bool)

# Apply the Apriori algorithm using the bool DataFrame
frequent_itemsets = apriori(binary_df_bool, min_support=0.05, use_colnames=True)

# association_rules' `num_itemsets` is the number of transactions (rows) in the
# original input data, not the number of frequent itemsets that were found.
num_itemsets = len(binary_df_bool)

# Generate association rules, keeping those with lift >= 1.0
rules = association_rules(frequent_itemsets, num_itemsets=num_itemsets, metric="lift", min_threshold=1.0)

# Sort and display the top rules
rules = rules.sort_values(by='lift', ascending=False)
print("Top 10 association rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


# This code identifies patterns in how developers use technologies like programming languages, databases, and frameworks. 
# By applying the Apriori algorithm, it reveals frequent combinations (e.g., Python with SQL) and strong associations between tools, 
# helping understand how technologies are commonly grouped in real-world usage.

In [None]:
# Apriori Algorithm 2
# Mines association rules linking employment attributes with technologies used.
# Relies on `df_cleaned`, `apriori` and `association_rules` from the previous cell.

# Columns to analyze
employment_columns = ['Employment', 'RemoteWork', 'OrgSize']
tech_columns = [
    'LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
    'WebframeHaveWorkedWith', 'ToolsTechHaveWorkedWith'
]

# One-hot encode the single-valued employment columns; the prefix keeps the source
# column visible in the resulting item names.
binary_employment = pd.get_dummies(df_cleaned[employment_columns], prefix=employment_columns).astype(bool)

# Split each semi-colon-separated technology column into boolean indicator columns
# and join them in a single concat (columns missing from the file are skipped).
tech_parts = [
    df_cleaned[col].str.get_dummies(sep=';').astype(bool)
    for col in tech_columns
    if col in df_cleaned.columns
]
binary_tech = pd.concat(tech_parts, axis=1) if tech_parts else pd.DataFrame()

# Combine employment and tech binary data
binary_data = pd.concat([binary_employment, binary_tech], axis=1)

# Apply Apriori algorithm
frequent_itemsets = apriori(binary_data, min_support=0.05, use_colnames=True)

# `num_itemsets` must describe THIS cell's input: it is the number of transactions
# (rows) in binary_data, not a value carried over from an earlier analysis.
num_transactions = len(binary_data)

# Generate association rules, keeping those with lift >= 1.0
rules = association_rules(frequent_itemsets, num_itemsets=num_transactions, metric="lift", min_threshold=1.0)

# Sort the rules by lift and show the strongest ones
rules = rules.sort_values(by='lift', ascending=False)
print("Top 10 association rules for Employment and Technology:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


# This code explores relationships between employment factors (e.g., employment type, remote work, organization size) and the technologies developers report using. 
# It surfaces co-occurrence patterns, such as whether remote workers are more likely to report tools like Docker, 
# offering insights into technology trends across work environments. Note that association rules indicate correlation, not causation.