## KDD Processing on Software Engineering Jobs in the US data
This data analysis goes through the KDD process of data analysis on the US software engineering jobs provided by kaggle

## Data Selection

In [1]:
# Importing necessary Python packages for Data Selection
import pandas as pd

In [2]:
# Reload the dataset and perform initial examination
file_path = '/content/us-software-engineer-jobs-zenrows.csv'
jobs_df = pd.read_csv(file_path)

In [3]:
# Initial examination of the dataset
jobs_df.info()
jobs_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58433 entries, 0 to 58432
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  58433 non-null  object 
 1   company                58430 non-null  object 
 2   salary                 18103 non-null  object 
 3   rating                 58433 non-null  float64
 4   review_count           58433 non-null  int64  
 5   types                  42577 non-null  object 
 6   location               58433 non-null  object 
 7   relative_time          58433 non-null  object 
 8   hires_needed           11857 non-null  object 
 9   hires_needed_exact     18439 non-null  object 
 10  urgently_hiring        58433 non-null  bool   
 11  remote_work_model      22804 non-null  object 
 12  snippet                58425 non-null  object 
 13  dradis_job             58433 non-null  bool   
 14  link                   58433 non-null  object 
 15  ne

Unnamed: 0,title,company,salary,rating,review_count,types,location,relative_time,hires_needed,hires_needed_exact,...,indeed_applyable,ad_id,remote_location,source_id,hiring_event_job,indeed_apply_enabled,job_location_postal,company_overview_link,activity_date,location_extras
0,Android Developer,Shockoe,,0.0,0,Full-time,"Richmond, VA",30+ days ago,,,...,False,,False,11706594,False,False,,,,
1,Web Developer,"Denios, Inc.","$45,000 - $55,000 a year",0.0,0,Full-time,"Louisville, KY",30+ days ago,ONE,1.0,...,True,370154371.0,False,11468052,False,True,40219.0,,Active 2 days ago,
2,Sr. Android Developer,The Cervantes Group,,4.1,22,Full-time,Remote,6 days ago,TWO_FOUR,2.0,...,True,,False,9570478,False,True,,/cmp/The-Cervantes-Group,Active 3 days ago,
3,Junior Software Engineer,Medical Knowledge Group,,0.0,0,Full-time,Remote,8 days ago,ONE,1.0,...,True,368864426.0,False,501562,False,True,,,Active 3 days ago,
4,"Cloud Engineer (Software Engineer Advanced, Ex...",Federal Reserve Bank of New York,,4.1,548,Full-time,"Kansas City, MO",2 days ago,,,...,False,,False,1439,False,False,,/cmp/Federal-Reserve-Bank-of-New-York,,


## Data Cleaning

In [7]:
# Drop columns that are irrelevant or have too many missing values for our analysis
columns_to_drop = ['ad_id', 'activity_date', 'location_extras', 'company_overview_link', 'job_location_postal', 'job_link', 'link', 'source_id']
jobs_df_cleaned = jobs_df.drop(columns=columns_to_drop)

In [8]:
jobs_df_cleaned['salary'].fillna('Unknown', inplace=True)
jobs_df_cleaned['types'].fillna('Not Specified', inplace=True)
jobs_df_cleaned['company'].fillna('Unknown', inplace=True)
jobs_df_cleaned['snippet'].fillna('No description available', inplace=True)

In [9]:
# Check for any more missing values
missing_values_summary = jobs_df_cleaned.isnull().sum()

missing_values_summary

title                       0
company                     0
salary                      0
rating                      0
review_count                0
types                       0
location                    0
relative_time               0
hires_needed            46576
hires_needed_exact      39994
urgently_hiring             0
remote_work_model       35629
snippet                     0
dradis_job                  0
new_job                     0
sponsored                   0
featured_employer           0
indeed_applyable            0
remote_location             0
hiring_event_job            0
indeed_apply_enabled        0
dtype: int64

## Data Preprocessing

In [4]:
# Importing necessary Python packages for Data Preprocessing
from sklearn.preprocessing import LabelEncoder

In [10]:
# Converting categorical features into numerical form using Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['title', 'company', 'types', 'location', 'relative_time', 'remote_work_model', 'snippet']
for column in categorical_columns:
    jobs_df_cleaned[column] = label_encoder.fit_transform(jobs_df_cleaned[column].astype(str))


In [11]:
# Feature Engineering: Creating new informative features
# Here, we create a feature indicating if the job allows remote work
jobs_df_cleaned['allows_remote'] = jobs_df_cleaned['remote_work_model'].apply(lambda x: 1 if x != 0 else 0)


In [12]:
# Data Scaling: Since the features are of different scales, we'll normalize the 'rating' and 'review_count'
jobs_df_cleaned['normalized_rating'] = (jobs_df_cleaned['rating'] - jobs_df_cleaned['rating'].min()) / (jobs_df_cleaned['rating'].max() - jobs_df_cleaned['rating'].min())
jobs_df_cleaned['normalized_review_count'] = (jobs_df_cleaned['review_count'] - jobs_df_cleaned['review_count'].min()) / (jobs_df_cleaned['review_count'].max() - jobs_df_cleaned['review_count'].min())


In [13]:
# Display some of the preprocessed data
jobs_df_cleaned.head()

Unnamed: 0,title,company,salary,rating,review_count,types,location,relative_time,hires_needed,hires_needed_exact,...,new_job,sponsored,featured_employer,indeed_applyable,remote_location,hiring_event_job,indeed_apply_enabled,allows_remote,normalized_rating,normalized_review_count
0,501,8405,Unknown,0.0,0,1,1687,24,,,...,False,False,False,False,False,False,False,1,0.0,0.0
1,17320,2840,"$45,000 - $55,000 a year",0.0,0,1,1128,24,ONE,1.0,...,False,True,False,True,False,False,True,1,0.0,0.0
2,15203,9404,Unknown,4.1,22,1,1667,27,TWO_FOUR,2.0,...,True,False,False,True,False,False,True,0,0.82,9.9e-05
3,5022,5928,Unknown,0.0,0,1,1667,29,ONE,1.0,...,False,True,True,True,False,False,True,0,0.0,0.0
4,1727,3583,Unknown,4.1,548,1,974,11,,,...,True,False,False,False,False,False,False,1,0.82,0.002454


## Data Transformation and Data Splitting

In [14]:
# Importing necessary Python packages for Data Transformation and Data Splitting
from sklearn.model_selection import train_test_split

In [15]:
feature_columns = ['title', 'company', 'types', 'location', 'relative_time', 'allows_remote', 'normalized_review_count']
X = jobs_df_cleaned[feature_columns]
y = jobs_df_cleaned['normalized_rating']

In [16]:
# Data Splitting: Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Display the shape of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((46746, 7), (11687, 7), (46746,), (11687,))

## Data Mining and Model Evaluation

In [18]:
# Importing necessary Python packages for Data Mining and Model Evaluation
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [19]:
# Initialize models
linear_model = LinearRegression()
ridge_model = Ridge(alpha=0.1)
lasso_model = Lasso(alpha=0.1)

In [20]:
# Fit models on training data
linear_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)


In [21]:
# Make predictions on testing data
linear_pred = linear_model.predict(X_test)
ridge_pred = ridge_model.predict(X_test)
lasso_pred = lasso_model.predict(X_test)

In [22]:
# Evaluation Metrics
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, mae, r2

In [23]:
# Evaluate models
linear_metrics = evaluate_model(y_test, linear_pred)
ridge_metrics = evaluate_model(y_test, ridge_pred)
lasso_metrics = evaluate_model(y_test, lasso_pred)

In [24]:
# Create a DataFrame to summarize metrics
metrics_df = pd.DataFrame({
    'Metric': ['Mean Squared Error', 'Mean Absolute Error', 'R2 Score'],
    'Linear Regression': linear_metrics,
    'Ridge Regression': ridge_metrics,
    'Lasso Regression': lasso_metrics
})

metrics_df

Unnamed: 0,Metric,Linear Regression,Ridge Regression,Lasso Regression
0,Mean Squared Error,0.126738,0.126723,0.129397
1,Mean Absolute Error,0.316702,0.316704,0.324146
2,R2 Score,0.066428,0.066536,0.04684
