In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [11]:
print("="*60)
print("SALARY PREDICTION MODEL - DETAILED ANALYSIS")
print("="*60)
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

SALARY PREDICTION MODEL - DETAILED ANALYSIS
Analysis started at: 2025-07-21 04:47:33



In [12]:
# Step 1: Load dataset
print("STEP 1: LOADING DATASET")
print("-" * 30)
df = pd.read_csv("Data/Salary Data.csv")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head(5))
print(f"\nDataset info:")
print(df.info())
print()

STEP 1: LOADING DATASET
------------------------------
Dataset shape: (375, 6)
Columns: ['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience', 'Salary']

First 5 rows:
    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gende

In [13]:
# Step 2: Data Quality Check
print("STEP 2: DATA QUALITY ANALYSIS")
print("-" * 35)
print("Missing values per column:")
missing_values = df.isnull().sum()
print(missing_values)
print(f"\nTotal missing values: {missing_values.sum()}")

print("\nData types:")
print(df.dtypes)
print()

STEP 2: DATA QUALITY ANALYSIS
-----------------------------------
Missing values per column:
Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

Total missing values: 12

Data types:
Age                    float64
Gender                  object
Education Level         object
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object



In [14]:
# Step 3: Basic Statistics
print("STEP 3: DESCRIPTIVE STATISTICS")
print("-" * 35)
print("Numerical columns statistics:")
print(df.describe())
print()

print("Categorical columns value counts:")
categorical_columns = ["Gender", "Education Level", "Job Title"]
for col in categorical_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(df[col].value_counts())
print()

STEP 3: DESCRIPTIVE STATISTICS
-----------------------------------
Numerical columns statistics:
              Age  Years of Experience         Salary
count  373.000000           373.000000     373.000000
mean    37.431635            10.030831  100577.345845
std      7.069073             6.557007   48240.013482
min     23.000000             0.000000     350.000000
25%     31.000000             4.000000   55000.000000
50%     36.000000             9.000000   95000.000000
75%     44.000000            15.000000  140000.000000
max     53.000000            25.000000  250000.000000

Categorical columns value counts:

Gender:
Gender
Male      194
Female    179
Name: count, dtype: int64

Education Level:
Education Level
Bachelor's    224
Master's       98
PhD            51
Name: count, dtype: int64

Job Title:
Job Title
Director of Marketing              12
Director of Operations             11
Senior Business Analyst            10
Senior Marketing Analyst            9
Senior Marketing Manager

In [15]:
# Step 4: Salary Analysis
print("STEP 4: SALARY DISTRIBUTION ANALYSIS")
print("-" * 40)
if 'Salary' in df.columns:
    print(f"Salary statistics:")
    print(f"Mean: ${df['Salary'].mean():,.2f}")
    print(f"Median: ${df['Salary'].median():,.2f}")
    print(f"Standard Deviation: ${df['Salary'].std():,.2f}")
    print(f"Min: ${df['Salary'].min():,.2f}")
    print(f"Max: ${df['Salary'].max():,.2f}")
    print(f"25th percentile: ${df['Salary'].quantile(0.25):,.2f}")
    print(f"75th percentile: ${df['Salary'].quantile(0.75):,.2f}")
    print()

STEP 4: SALARY DISTRIBUTION ANALYSIS
----------------------------------------
Salary statistics:
Mean: $100,577.35
Median: $95,000.00
Standard Deviation: $48,240.01
Min: $350.00
Max: $250,000.00
25th percentile: $55,000.00
75th percentile: $140,000.00



In [16]:
# Step 5: Data Cleaning
print("STEP 5: DATA CLEANING")
print("-" * 25)
print(f"Rows before cleaning: {len(df)}")
print(f"Rows with missing salary: {df['Salary'].isnull().sum()}")

# Dropping rows where Salary is missing
df.dropna(subset=["Salary"], inplace=True)
print(f"Rows after cleaning: {len(df)}")
print()

STEP 5: DATA CLEANING
-------------------------
Rows before cleaning: 375
Rows with missing salary: 2
Rows after cleaning: 373



In [17]:
# Step 6: Feature Engineering
print("STEP 6: FEATURE PREPARATION")
print("-" * 32)
# Features and target
X = df.drop("Salary", axis=1)
y = df["Salary"]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Features: {list(X.columns)}")

# Column identification
categorical_cols = ["Gender", "Education Level", "Job Title"]
numerical_cols = ["Age", "Years of Experience"]

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")
print()

STEP 6: FEATURE PREPARATION
--------------------------------
Features shape: (373, 5)
Target shape: (373,)
Features: ['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']
Categorical columns: ['Gender', 'Education Level', 'Job Title']
Numerical columns: ['Age', 'Years of Experience']



In [18]:
# Step 7: Preprocessing Pipeline Setup
print("STEP 7: PREPROCESSING PIPELINE")
print("-" * 35)

categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy="median"))
])

preprocessor = ColumnTransformer([
    ('num', numerical_preprocessor, numerical_cols),
    ('cat', categorical_preprocessor, categorical_cols)
])

print("Preprocessing pipeline created:")
print("- Numerical: Median imputation")
print("- Categorical: Most frequent imputation + One-hot encoding")
print()

STEP 7: PREPROCESSING PIPELINE
-----------------------------------
Preprocessing pipeline created:
- Numerical: Median imputation
- Categorical: Most frequent imputation + One-hot encoding



In [19]:
# Step 8: Model Pipeline
print("STEP 8: MODEL PIPELINE CREATION")
print("-" * 35)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2
    ))
])
print("Random Forest Regressor pipeline created with parameters:")
print("- n_estimators: 100")
print("- max_depth: 10")
print("- min_samples_split: 5")
print("- min_samples_leaf: 2")
print("- random_state: 42")
print()

STEP 8: MODEL PIPELINE CREATION
-----------------------------------
Random Forest Regressor pipeline created with parameters:
- n_estimators: 100
- max_depth: 10
- min_samples_split: 5
- min_samples_leaf: 2
- random_state: 42



In [20]:
# Step 9: Train-Test Split
print("STEP 9: TRAIN-TEST SPLIT")
print("-" * 30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=None
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Train ratio: {X_train.shape[0]/len(df)*100:.1f}%")
print(f"Test ratio: {X_test.shape[0]/len(df)*100:.1f}%")
print()

STEP 9: TRAIN-TEST SPLIT
------------------------------
Training set size: 298 samples
Test set size: 75 samples
Train ratio: 79.9%
Test ratio: 20.1%



In [None]:
# Step 10: Model Training
print("STEP 10: MODEL TRAINING")
print("-" * 25)
print("Training the model...")
model.fit(X_train, y_train)
print("Model training completed!")
print()

STEP 10: MODEL TRAINING
-------------------------
Training the model...
Model training completed!



In [22]:
# Step 11: Model Evaluation
print("STEP 11: MODEL EVALUATION")
print("-" * 30)

# Training predictions
train_preds = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
train_r2 = r2_score(y_train, train_preds)
train_mae = mean_absolute_error(y_train, train_preds)

# Test predictions
test_preds = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
test_r2 = r2_score(y_test, test_preds)
test_mae = mean_absolute_error(y_test, test_preds)

print("TRAINING SET PERFORMANCE:")
print(f"RMSE: ${train_rmse:,.2f}")
print(f"MAE: ${train_mae:,.2f}")
print(f"R² Score: {train_r2:.4f}")

print("\nTEST SET PERFORMANCE:")
print(f"RMSE: ${test_rmse:,.2f}")
print(f"MAE: ${test_mae:,.2f}")
print(f"R² Score: {test_r2:.4f}")

print(f"\nOverfitting Check:")
print(f"RMSE difference (train vs test): ${abs(train_rmse - test_rmse):,.2f}")
print(f"R² difference (train vs test): {abs(train_r2 - test_r2):.4f}")
print()

STEP 11: MODEL EVALUATION
------------------------------
TRAINING SET PERFORMANCE:
RMSE: $9,942.02
MAE: $6,566.33
R² Score: 0.9570

TEST SET PERFORMANCE:
RMSE: $17,080.61
MAE: $10,436.18
R² Score: 0.8783

Overfitting Check:
RMSE difference (train vs test): $7,138.58
R² difference (train vs test): 0.0787



In [23]:
# Step 12: Feature Importance
print("STEP 12: FEATURE IMPORTANCE ANALYSIS")
print("-" * 40)

# Get feature names after preprocessing
feature_names = (numerical_cols +
                 list(model.named_steps['preprocessor']
                      .named_transformers_['cat']
                      .named_steps['encoder']
                      .get_feature_names_out(categorical_cols)))

# Get feature importance
importance_scores = model.named_steps['regressor'].feature_importances_
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': importance_scores
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
for idx, row in feature_importance.head(10).iterrows():
    print(f"{row['feature']:<30}: {row['importance']:.4f}")
print()

STEP 12: FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 10 Most Important Features:
Age                           : 0.6098
Years of Experience           : 0.3370
Education Level_Bachelor's    : 0.0250
Education Level_PhD           : 0.0046
Education Level_Master's      : 0.0040
Job Title_Administrative Assistant: 0.0031
Gender_Female                 : 0.0031
Gender_Male                   : 0.0028
Job Title_Senior Project Manager: 0.0022
Job Title_Recruiter           : 0.0014



In [24]:
# Step 13: Prediction Examples
print("STEP 13: PREDICTION EXAMPLES")
print("-" * 32)
print("Sample predictions on test set:")
sample_indices = np.random.choice(len(X_test), 5, replace=False)

for i, idx in enumerate(sample_indices):
    actual = y_test.iloc[idx]
    predicted = test_preds[idx]
    error = abs(actual - predicted)
    error_pct = (error / actual) * 100

    print(f"\nSample {i+1}:")
    print(f"  Actual Salary: ${actual:,.2f}")
    print(f"  Predicted Salary: ${predicted:,.2f}")
    print(f"  Absolute Error: ${error:,.2f}")
    print(f"  Error Percentage: {error_pct:.1f}%")
print()

STEP 13: PREDICTION EXAMPLES
--------------------------------
Sample predictions on test set:

Sample 1:
  Actual Salary: $35,000.00
  Predicted Salary: $41,832.16
  Absolute Error: $6,832.16
  Error Percentage: 19.5%

Sample 2:
  Actual Salary: $50,000.00
  Predicted Salary: $50,683.88
  Absolute Error: $683.88
  Error Percentage: 1.4%

Sample 3:
  Actual Salary: $160,000.00
  Predicted Salary: $143,317.35
  Absolute Error: $16,682.65
  Error Percentage: 10.4%

Sample 4:
  Actual Salary: $90,000.00
  Predicted Salary: $89,959.44
  Absolute Error: $40.56
  Error Percentage: 0.0%

Sample 5:
  Actual Salary: $160,000.00
  Predicted Salary: $156,938.42
  Absolute Error: $3,061.58
  Error Percentage: 1.9%



In [25]:
# Step 14: Model Persistence
print("STEP 14: MODEL SAVING")
print("-" * 25)
model_filename = "salary_predictor_model_enhanced.joblib"
joblib.dump(model, model_filename)
print(f"Model saved as: {model_filename}")

# Save feature importance for later use
feature_importance.to_csv("feature_importance.csv", index=False)
print("Feature importance saved as: feature_importance.csv")

# Save model metrics
metrics = {
    'train_rmse': train_rmse,
    'test_rmse': test_rmse,
    'train_r2': train_r2,
    'test_r2': test_r2,
    'train_mae': train_mae,
    'test_mae': test_mae
}
pd.Series(metrics).to_csv("model_metrics.csv")
print("Model metrics saved as: model_metrics.csv")
print()

print("="*60)
print("MODEL ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*60)
print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Dataset: {len(df)} samples, {len(df.columns)} features")
print(f"Model Type: Random Forest Regressor")
print(f"Test R² Score: {test_r2:.4f}")
print(f"Test RMSE: ${test_rmse:,.2f}")
print(f"Model saved successfully!")
print("Ready for Streamlit app deployment!")

STEP 14: MODEL SAVING
-------------------------
Model saved as: salary_predictor_model_enhanced.joblib
Feature importance saved as: feature_importance.csv
Model metrics saved as: model_metrics.csv

MODEL ANALYSIS COMPLETED SUCCESSFULLY!
Analysis completed at: 2025-07-21 04:50:45

FINAL SUMMARY
Dataset: 373 samples, 6 features
Model Type: Random Forest Regressor
Test R² Score: 0.8783
Test RMSE: $17,080.61
Model saved successfully!
Ready for Streamlit app deployment!
