# Setup

In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import re
import pandas as pd

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jaeyoonlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jaeyoonlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaeyoonlee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaeyoonlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
file_path = "C:/Users/fairt/OneDrive/Desktop/Text Analytic/Project/cleaned_job_postings.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,title,description,company,salary,work_type,location,company_size,industry,experience_level,description_length,cleaned_description,cleaned_title
0,0,"Validation Engineer, Labware LIMS","Validation Engineer, Labware LIMSFoster City, ...","I.T. Solutions, Inc.",135200.0,CONTRACT,"Foster City, CA",3.0,IT Services and IT Consulting,Mid-Senior level,2084,validation engineer labware limsfoster city va...,validation engineer labware lims
1,1,Administrative Assistant - CONCUR,Global Financial Services firm is seeking an e...,ActOne Group,82500.0,FULL_TIME,"New York, NY",5.0,Information Services,Associate,1046,global financial service firm seek experienced...,administrative assistant concur
2,2,Customer Service Representative,We are seeking future agents to join our team!...,ABC Farigua Division,90000.0,FULL_TIME,"Greater Orlando, FL",3.0,Insurance,Entry level,1402,seek future agent join team look driven self-m...,customer service representative
3,3,Inbound Call Center Specialist,"Always Connecting, Always Evolving.\nIf you ar...",TECHEAD,38480.0,CONTRACT,"Richmond, VA",2.0,Staffing and Recruiting,Associate,3077,always connect always evolve look new opportun...,inbound call center specialist
4,4,Tool and Die Maker,Job Summary:The Tool and Die Maker will build ...,Prolink,69680.0,FULL_TIME,Cincinnati Metropolitan Area,4.0,Staffing and Recruiting,Associate,1067,job summary tool die maker build dy concept st...,tool die maker


In [6]:
df = df.dropna(subset=['cleaned_title'])
missing_values = df.isnull().sum()
print(missing_values)

Unnamed: 0             0
title                  0
description            0
company                0
salary                 0
work_type              0
location               0
company_size           0
industry               0
experience_level       0
description_length     0
cleaned_description    0
cleaned_title          0
dtype: int64


In [6]:
# Step 4: Create Custom Stopwords and Initialize Skill Categories
print("\nStep 4: Initialize Skill Categories and Stopwords")
print("---------------------------------------------")

# Add high-frequency but low-predictive words to stopwords
additional_stopwords = [
    'work', 'experience', 'team', 'include', 'service', 'customer',
    'provide', 'skill', 'year', 'job', 'support', 'opportunity',
    'position', 'business', 'employee', 'company', 'benefit',
    'require', 'ability', 'candidate', 'requirement', 'qualified',
    'responsibilities', 'duties', 'role',
    # Adding missing words from the second list
    'years', 'including', 'must', 'will', 'new', 'looking', 
    'seeking', 'ideal', 'competitive', 'excellent', 'strong', 
    'great', 'minimum', 'preferred', 'qualification', 'knowledge', 
    'related', 'professional', 'proficiency', 'based', 'remote', 
    'hybrid', 'office', 'requirements', 'skills'
]
# Combine with standard stopwords - convert to list
custom_stopwords = list(set(stopwords.words('english')).union(additional_stopwords))

skill_categories = {
    'Creative Arts': {
        'keywords': [
            'adobe creative suite', 'photoshop', 'illustrator', 'indesign',
            'graphic design', 'visual design', 'typography', 'art direction',
            'creative direction', 'brand design', 'illustration', 'adobe xd',
            'figma', 'sketch', 'color theory', 'layout design'
        ],
        'context_required': ['design', 'creative', 'art', 'visual']
    },
    'Digital Design': {
        'keywords': [
            'ui design', 'ux design', 'user interface', 'user experience',
            'wireframing', 'prototyping', 'responsive design', 'mobile design',
            'web design', 'interaction design', 'usability testing',
            'information architecture', 'figma', 'sketch'
        ],
        'context_required': ['design', 'user', 'interface', 'experience']
    },
    'Software Development': {
        'keywords': [
            'java', 'python', 'javascript', 'react', 'angular', 'node.js',
            'full stack', 'front end', 'back end', 'web development',
            'api development', 'cloud computing', r'\baws\b', 'azure',
            'devops', 'ci/cd', 'docker', 'kubernetes'
        ],
        'context_required': ['development', 'programming', 'software']
    },
    'Marketing': {
        'keywords': [
            'digital marketing', 'content marketing', r'\bseo\b', r'\bsem\b',
            'social media marketing', 'email marketing', 'marketing automation',
            'google analytics', 'conversion optimization', 'brand marketing',
            'marketing strategy', 'campaign management', 'hubspot', 'marketo'
        ],
        'context_required': ['marketing', 'digital']
    },
    'Project Management': {
        'keywords': [
            'project management', 'agile methodology', 'scrum master',
            'project planning', 'risk management', 'stakeholder management',
            'pmp certification', 'project coordination', 'jira', 'asana',
            'microsoft project', 'project lifecycle', 'change management'
        ],
        'context_required': ['project', 'management']
    },
    'Product Management': {
        'keywords': [
            'product strategy', 'product roadmap', 'product development',
            'product lifecycle', 'agile product', 'product owner', 'scrum',
            'market research', 'user stories', 'feature prioritization',
            'product metrics', 'product analytics', 'product launch'
        ],
        'context_required': ['product']
    },
    'Data Analysis': {
        'keywords': [
            'data analysis', 'statistical analysis', 'data visualization',
            'sql', 'python', 'r programming', 'tableau', 'power bi',
            'excel advanced', 'data modeling', 'regression analysis',
            'hypothesis testing', 'a/b testing', 'data mining'
        ],
        'context_required': ['data', 'analysis', 'analytics']
    },

    'Business Analysis': {
        'keywords': [
            'business analysis', 'requirements gathering', 'process mapping',
            'gap analysis', 'business process', 'system analysis',
            'functional requirements', 'business intelligence', 'data modeling',
            'process improvement', 'workflow optimization'
        ],
        'context_required': ['analysis', 'business']
    },
    'Financial': {
        'keywords': [
            'financial analysis', 'financial modeling', 'forecasting',
            'budgeting', 'variance analysis', 'cost analysis', 'pricing',
            'profit and loss', 'balance sheet', 'financial reporting',
            'risk assessment', 'investment analysis'
        ],
        'context_required': ['financial', 'finance']
    },
    'Sales': {
        'keywords': [
            'sales strategy', 'account management', 'sales forecasting',
            'crm', 'salesforce', 'sales operations', 'business development',
            'lead generation', 'pipeline management', 'contract negotiation',
            'sales analytics', 'territory management'
        ],
        'context_required': ['sales', 'revenue']
    }
}

print("Initialized skill categories:", list(skill_categories.keys()))

# Create vocabulary from skill categories
skill_vocabulary = set()
for category in skill_categories.values():
    skill_vocabulary.update(category['keywords'])
    skill_vocabulary.update(category['context_required'])

print(f"\nTotal skill-related terms in vocabulary: {len(skill_vocabulary)}")



Step 4: Initialize Skill Categories and Stopwords
---------------------------------------------
Initialized skill categories: ['Creative Arts', 'Digital Design', 'Software Development', 'Marketing', 'Project Management', 'Product Management', 'Data Analysis', 'Business Analysis', 'Financial', 'Sales']

Total skill-related terms in vocabulary: 156


## TF-IDF * Random Forest

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

class SalaryPredictionModel:
    def __init__(self):
        self.best_tfidf = None
        self.best_rf = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

    def create_tfidf_features(self, df):
        """
        Create TF-IDF features from text data using Ridge regression for faster tuning
        """
        # Combine text features
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # Create TF-IDF Pipeline with Ridge
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('ridge', Ridge(random_state=42, alpha=1.0))  # Using Ridge instead of RF
        ])
        
        # Parameter grid for TF-IDF only
        param_grid = {
            'tfidf__ngram_range': [(1,2),(1, 3)],
            'tfidf__min_df': [0.01, 0.05, 0.1, 0.2],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False]
        }
        
        # Tune TF-IDF parameters
        print("Tuning TF-IDF parameters...")
        grid = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,
            n_jobs=-1,
            verbose=1
        )
        grid.fit(combined_text, df['salary'])
        
        # Process full dataset
        print("\nProcessing full dataset with tuned parameters...")
        combined_text_full = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # Generate TF-IDF features
        self.best_tfidf = grid.best_estimator_.named_steps['tfidf']
        tfidf_matrix = self.best_tfidf.fit_transform(combined_text_full)
        tfidf_features = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=[f'tfidf_{f}' for f in self.best_tfidf.get_feature_names_out()]
        )
        
        # Create feature matrix
        feature_df = tfidf_features.copy()
        
        # Add encoded categorical features
        feature_df['experience_level_encoded'] = self.label_encoder.fit_transform(df['experience_level'])
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'])
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'])
        
        # Add location features
        feature_df['state'] = df['location'].apply(lambda x: x.split(',')[-1].strip())
        feature_df['state_encoded'] = self.label_encoder.fit_transform(feature_df['state'])
        
        # Add interaction features
        feature_df['exp_industry'] = feature_df['experience_level_encoded'] * feature_df['industry_encoded']
        
        return feature_df, tfidf_matrix

    def plot_tfidf_distribution(self, tfidf_matrix, save_path=None):
        """
        Plot the TF-IDF word frequency distribution
        """
        # Calculate mean TF-IDF scores for each term
        mean_tfidf_scores = np.array(tfidf_matrix.mean(axis=0)).flatten()
        
        # Get feature names
        feature_names = self.best_tfidf.get_feature_names_out()
        
        # Create DataFrame with terms and their scores
        tfidf_dist_df = pd.DataFrame({
            'term': feature_names,
            'score': mean_tfidf_scores
        })
        
        # Sort by score and get top 30 terms
        top_terms = tfidf_dist_df.nlargest(30, 'score')
        
        # Create plot
        plt.figure(figsize=(15, 8))
        sns.barplot(data=top_terms, x='score', y='term', palette='viridis')
        
        plt.title('Top 30 Terms by Mean TF-IDF Score', fontsize=14, pad=20)
        plt.xlabel('Mean TF-IDF Score', fontsize=12)
        plt.ylabel('Term', fontsize=12)
        
        # Adjust layout
        plt.tight_layout()
        
        # Save if path provided
        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
            plt.close()
        else:
            plt.show()

    def train_model(self, feature_matrix, target):
        """
        Train the final RandomForest model
        """
        # Prepare features
        X = feature_matrix.drop(['state'], axis=1)
        y = target

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=0.2, random_state=42
        )

        # Initialize and train the model with best parameters
        self.best_rf = RandomForestRegressor(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1
        )
        self.best_rf.fit(X_train, y_train)

        # Make predictions and evaluate
        y_pred = self.best_rf.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.best_rf.feature_importances_
        }).sort_values('importance', ascending=False)

        return {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'feature_importance': feature_importance,
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        }

    def save_model(self, path_prefix='models/'):
        """
        Save all model components
        """
        joblib.dump(self.best_tfidf, f'{path_prefix}tfidf.joblib')
        joblib.dump(self.best_rf, f'{path_prefix}random_forest.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')

    def load_model(self, path_prefix='models/'):
        """
        Load all model components
        """
        self.best_tfidf = joblib.load(f'{path_prefix}tfidf.joblib')
        self.best_rf = joblib.load(f'{path_prefix}random_forest.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')

# Example usage:
if __name__ == "__main__":
    # Initialize model
    model = SalaryPredictionModel()
    
    # Create features
    print("Creating TF-IDF features...")
    feature_matrix, tfidf_matrix = model.create_tfidf_features(df)
    
    # Plot TF-IDF distribution
    print("\nPlotting TF-IDF word frequency distribution...")
    model.plot_tfidf_distribution(tfidf_matrix, save_path='tfidf_distribution.png')
    
    # Train model
    print("\nTraining Random Forest model...")
    results = model.train_model(feature_matrix, df['salary'])
    
    # Print results
    print(f"\nModel Performance:")
    print("-" * 50)
    print(f"Root Mean Squared Error: ${results['rmse']:.2f}")
    print(f"Mean Absolute Error: ${results['mae']:.2f}")
    print(f"R-squared Score: {results['r2']:.3f}")
    
    print("\nTop 15 Most Important Features:")
    print("-" * 50)
    print(results['feature_importance'].head(15).to_string())
    
    # Calculate percentage improvement over baseline
    baseline_rmse = np.sqrt(mean_squared_error(results['y_test'], 
                                             [results['y_test'].mean()] * len(results['y_test'])))
    improvement = ((baseline_rmse - results['rmse']) / baseline_rmse) * 100
    
    print(f"\nModel Improvement:")
    print("-" * 50)
    print(f"Baseline RMSE (mean prediction): ${baseline_rmse:.2f}")
    print(f"Model RMSE: ${results['rmse']:.2f}")
    print(f"Improvement over baseline: {improvement:.1f}%")

Creating TF-IDF features...
Tuning TF-IDF parameters...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Processing full dataset with tuned parameters...

Plotting TF-IDF word frequency distribution...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=top_terms, x='score', y='term', palette='viridis')



Training Random Forest model...

Model Performance:
--------------------------------------------------
Root Mean Squared Error: $27138.60
Mean Absolute Error: $20982.52
R-squared Score: 0.592

Top 15 Most Important Features:
--------------------------------------------------
                        feature  importance
5540   experience_level_encoded    0.026007
5544               exp_industry    0.014401
4561               tfidf_senior    0.013939
1183     tfidf_customer service    0.011960
4513       tfidf_school diploma    0.011654
1348              tfidf_develop    0.011347
2311          tfidf_high school    0.010397
1326               tfidf_design    0.010373
1686          tfidf_engineering    0.010032
1684             tfidf_engineer    0.009747
4512               tfidf_school    0.009735
1384              tfidf_diploma    0.009634
2312  tfidf_high school diploma    0.009161
2819                 tfidf_lift    0.007560
3881              tfidf_project    0.007423

Model Improvement:

## TF-IDF * Random Forest with skill categories BoW

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

class SalaryPredictionModel:
    def __init__(self):
        self.best_general_tfidf = None
        self.best_skill_tfidf = None
        self.best_rf = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

    def create_dual_tfidf_features(self, df, skill_categories):
        """
        Create TF-IDF features from both general text and skill-specific vocabulary
        """
        # Sample subset for initial tuning
        sample_size = int(len(df) * 0.2)
        df_sample = df.sample(n=sample_size, random_state=42)
        
        # Combine text features
        combined_text = df_sample['cleaned_title'] + ' ' + df_sample['cleaned_description']
        
        # 1. General TF-IDF Pipeline
        general_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # 2. Create skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        # Parameter grids for both TF-IDF vectorizers
        param_grid = {
            # TF-IDF parameters
            'tfidf__ngram_range': [(1, 2),(1,3)],
            'tfidf__min_df': [0.01, 0.02],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False],
            # RandomForest parameters
            'rf__n_estimators': [100, 200],
            'rf__max_depth': [10, 20],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # Tune general TF-IDF
        print("Tuning general TF-IDF parameters...")
        general_grid = GridSearchCV(
            general_pipeline,
            param_grid,
            cv=3,
            n_jobs=-1,
            verbose=1
        )
        general_grid.fit(combined_text, df_sample['salary'])
        
        # Create skill-specific pipeline with best general parameters
        best_params = general_grid.best_params_
        skill_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                vocabulary=list(skill_vocabulary),
                stop_words='english',
                ngram_range=best_params['tfidf__ngram_range'],
                min_df=best_params['tfidf__min_df'],
                max_df=best_params['tfidf__max_df'],
                binary=best_params['tfidf__binary']
            )),
            ('rf', RandomForestRegressor(
                n_estimators=best_params['rf__n_estimators'],
                max_depth=best_params['rf__max_depth'],
                min_samples_split=best_params['rf__min_samples_split'],
                min_samples_leaf=best_params['rf__min_samples_leaf'],
                max_features=best_params['rf__max_features'],
                random_state=42,
                n_jobs=-1
            ))
        ])
        
        # Process full dataset
        print("\nProcessing full dataset with tuned parameters...")
        combined_text_full = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # Generate general TF-IDF features
        self.best_general_tfidf = general_grid.best_estimator_.named_steps['tfidf']
        general_matrix = self.best_general_tfidf.fit_transform(combined_text_full)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.best_general_tfidf.get_feature_names_out()]
        )
        
        # Generate skill-specific TF-IDF features
        self.best_skill_tfidf = skill_pipeline.named_steps['tfidf']
        skill_matrix = self.best_skill_tfidf.fit_transform(combined_text_full)
        skill_features = pd.DataFrame(
            skill_matrix.toarray(),
            columns=[f'skill_{f}' for f in self.best_skill_tfidf.get_feature_names_out()]
        )
        
        # Combine features
        feature_df = pd.concat([general_features, skill_features], axis=1)
        
        # Add category-level aggregated features
        for category, info in skill_categories.items():
            keyword_cols = [col for col in skill_features.columns 
                          if any(keyword in col for keyword in info['keywords'])]
            context_cols = [col for col in skill_features.columns 
                          if any(context in col for context in info['context_required'])]
            
            if keyword_cols:
                feature_df[f'{category}_keyword_score'] = skill_features[keyword_cols].sum(axis=1) * 2
            if context_cols:
                feature_df[f'{category}_context_score'] = skill_features[context_cols].sum(axis=1)
        
        # Add encoded categorical features
        feature_df['experience_level_encoded'] = self.label_encoder.fit_transform(df['experience_level'])
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'])
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'])
        
        # Add location features
        feature_df['state'] = df['location'].apply(lambda x: x.split(',')[-1].strip())
        feature_df['state_encoded'] = self.label_encoder.fit_transform(feature_df['state'])
        
        # Add interaction features
        feature_df['exp_industry'] = feature_df['experience_level_encoded'] * feature_df['industry_encoded']
        
        return feature_df

    def train_model(self, feature_matrix, target):
        """
        Train the final model using the best parameters
        """
        # Prepare features
        X = feature_matrix.drop(['state'], axis=1)
        y = target

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=0.2, random_state=42
        )

        # Initialize and train the model with best parameters
        self.best_rf = RandomForestRegressor(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1
        )
        self.best_rf.fit(X_train, y_train)

        # Make predictions and evaluate
        y_pred = self.best_rf.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.best_rf.feature_importances_
        }).sort_values('importance', ascending=False)

        return {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'feature_importance': feature_importance,
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        }

    def save_model(self, path_prefix='models/'):
        """
        Save all model components
        """
        joblib.dump(self.best_general_tfidf, f'{path_prefix}general_tfidf.joblib')
        joblib.dump(self.best_skill_tfidf, f'{path_prefix}skill_tfidf.joblib')
        joblib.dump(self.best_rf, f'{path_prefix}random_forest.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')

    def load_model(self, path_prefix='models/'):
        """
        Load all model components
        """
        self.best_general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.best_skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.best_rf = joblib.load(f'{path_prefix}random_forest.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')

# Example usage:
if __name__ == "__main__":
    # Initialize model
    model = SalaryPredictionModel()
    
    # Assuming you have your data and skill categories ready
    # df = your_dataframe
    # skill_categories = your_skill_categories
    
    # Create features
    feature_matrix = model.create_dual_tfidf_features(df, skill_categories)
    
    # Train model
    results = model.train_model(feature_matrix, df['salary'])
    
    # Print results
    print(f"\nModel Performance:")
    print(f"Root Mean Squared Error: ${results['rmse']:.2f}")
    print(f"Mean Absolute Error: ${results['mae']:.2f}")
    print(f"R-squared Score: {results['r2']:.3f}")
    
    print("\nTop 15 Most Important Features:")
    print(results['feature_importance'].head(15))

Tuning general TF-IDF parameters...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Processing full dataset with tuned parameters...

Model Performance:
Root Mean Squared Error: $26377.55
Mean Absolute Error: $20181.65
R-squared Score: 0.615

Top 15 Most Important Features:
                       feature  importance
2809  experience_level_encoded    0.021663
2813              exp_industry    0.018380
1127       general_high school    0.017697
665            general_diploma    0.013972
2142    general_school diploma    0.011931
2168            general_senior    0.011393
570   general_customer service    0.011007
2141            general_school    0.010748
815           general_engineer    0.010304
634             general_design    0.010099
2312         general_strategic    0.009397
648            general_develop    0.009340
816        general_engineering    0.009088
737               general_duty    0.008476
1368              general_lift    0.008129


## TF-IDF * Random Forest with Separate Hyperparameter Tuning for general/skill-specific parameter

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

class SalaryPredictionModel:
    def __init__(self):
        self.best_general_tfidf = None
        self.best_skill_tfidf = None
        self.best_rf = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.cv = KFold(n_splits=5, shuffle=True, random_state=42)

    def create_dual_tfidf_features(self, df, skill_categories):
        """
        Create TF-IDF features from both general text and skill-specific vocabulary with separate grid searches
        and cross validation using R-squared as the scoring metric
        """
        # Sample subset for initial tuning
        sample_size = int(len(df) * 0.2)
        df_sample = df.sample(n=sample_size, random_state=42)
        
        # Combine text features
        combined_text = df_sample['cleaned_title'] + ' ' + df_sample['cleaned_description']
        
        # 1. General TF-IDF Pipeline
        general_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words="english")),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # General TF-IDF parameter grid
        general_param_grid = {
            'tfidf__ngram_range': [(1, 2), (1, 3)],
            'tfidf__min_df': [0.01, 0.02, 0.05],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False, True],
            'rf__n_estimators': [100],
            'rf__max_depth': [10],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # 2. Create skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        # Skill-specific TF-IDF Pipeline
        skill_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                vocabulary=list(skill_vocabulary),
                stop_words = "english"
            )),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # Skill-specific parameter grid
        skill_param_grid = {
            'tfidf__ngram_range': [(1, 2), (1, 3)],
            'tfidf__min_df': [0.01, 0.02, 0.03],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False, True],
            'rf__n_estimators': [100],
            'rf__max_depth': [10],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # Tune general TF-IDF with cross validation using R-squared
        print("Tuning general TF-IDF parameters...")
        general_grid = GridSearchCV(
            general_pipeline,
            general_param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'  # Changed to R-squared
        )
        general_grid.fit(combined_text, df_sample['salary'])
        
        # Print cross validation results for general TF-IDF
        print("\nGeneral TF-IDF Cross Validation Results:")
        cv_results = pd.DataFrame(general_grid.cv_results_)
        print(f"Best CV R-squared Score: {general_grid.best_score_:.3f}")
        print(f"Standard Deviation: {cv_results.loc[general_grid.best_index_, 'std_test_score']:.3f}")
        
        # Tune skill-specific TF-IDF with cross validation
        print("\nTuning skill-specific TF-IDF parameters...")
        skill_grid = GridSearchCV(
            skill_pipeline,
            skill_param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'  # Changed to R-squared
        )
        skill_grid.fit(combined_text, df_sample['salary'])
        
        # Print cross validation results for skill TF-IDF
        print("\nSkill TF-IDF Cross Validation Results:")
        cv_results = pd.DataFrame(skill_grid.cv_results_)
        print(f"Best CV R-squared Score: {skill_grid.best_score_:.3f}")
        print(f"Standard Deviation: {cv_results.loc[skill_grid.best_index_, 'std_test_score']:.3f}")
        
        # Process full dataset
        print("\nProcessing full dataset with tuned parameters...")
        combined_text_full = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # Generate general TF-IDF features
        self.best_general_tfidf = general_grid.best_estimator_.named_steps['tfidf']
        general_matrix = self.best_general_tfidf.fit_transform(combined_text_full)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.best_general_tfidf.get_feature_names_out()]
        )
        
        # Generate skill-specific TF-IDF features
        self.best_skill_tfidf = skill_grid.best_estimator_.named_steps['tfidf']
        skill_matrix = self.best_skill_tfidf.fit_transform(combined_text_full)
        skill_features = pd.DataFrame(
            skill_matrix.toarray(),
            columns=[f'skill_{f}' for f in self.best_skill_tfidf.get_feature_names_out()]
        )
        
        # Print best parameters for both grids
        print("\nBest General TF-IDF Parameters:")
        print(general_grid.best_params_)
        print("\nBest Skill TF-IDF Parameters:")
        print(skill_grid.best_params_)
        
        # Combine features
        feature_df = pd.concat([general_features, skill_features], axis=1)
        
        # Add category-level aggregated features
        for category, info in skill_categories.items():
            keyword_cols = [col for col in skill_features.columns 
                          if any(keyword in col for keyword in info['keywords'])]
            context_cols = [col for col in skill_features.columns 
                          if any(context in col for context in info['context_required'])]
            
            if keyword_cols:
                feature_df[f'{category}_keyword_score'] = skill_features[keyword_cols].sum(axis=1) * 2
            if context_cols:
                feature_df[f'{category}_context_score'] = skill_features[context_cols].sum(axis=1)
        
        # Add encoded categorical features
        feature_df['experience_level_encoded'] = self.label_encoder.fit_transform(df['experience_level'])
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'])
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'])
        
        # Add location features
        feature_df['state'] = df['location'].apply(lambda x: x.split(',')[-1].strip())
        feature_df['state_encoded'] = self.label_encoder.fit_transform(feature_df['state'])
        
        # Add interaction features
        feature_df['exp_industry'] = feature_df['experience_level_encoded'] * feature_df['industry_encoded']
        
        return feature_df

    def train_model(self, feature_matrix, target):
        """
        Train the final model using cross validation with R-squared as the main metric
        """
        # Prepare features
        X = feature_matrix.drop(['state'], axis=1)
        y = target

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Define parameter grid for final model
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', 'log2']
        }

        # Perform grid search with cross validation using R-squared
        rf_grid = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'  # Changed to R-squared
        )
        
        rf_grid.fit(X_scaled, y)
        
        # Store best model
        self.best_rf = rf_grid.best_estimator_

        # Perform cross validation on best model
        r2_scores = cross_val_score(
            self.best_rf,
            X_scaled,
            y,
            cv=self.cv,
            scoring='r2',
            n_jobs=-1
        )
        
        # Also calculate RMSE for reference
        rmse_scores = np.sqrt(-cross_val_score(
            self.best_rf,
            X_scaled,
            y,
            cv=self.cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1
        ))
        
        # Calculate feature importance
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.best_rf.feature_importances_
        }).sort_values('importance', ascending=False)

        # Final evaluation metrics
        final_results = {
            'cv_r2_mean': r2_scores.mean(),
            'cv_r2_std': r2_scores.std(),
            'cv_rmse_mean': rmse_scores.mean(),
            'cv_rmse_std': rmse_scores.std(),
            'best_params': rf_grid.best_params_,
            'feature_importance': feature_importance,
            'cv_r2_scores': r2_scores,
            'cv_rmse_scores': rmse_scores
        }

        return final_results

    def save_model(self, path_prefix='models/'):
        """
        Save all model components
        """
        joblib.dump(self.best_general_tfidf, f'{path_prefix}general_tfidf.joblib')
        joblib.dump(self.best_skill_tfidf, f'{path_prefix}skill_tfidf.joblib')
        joblib.dump(self.best_rf, f'{path_prefix}random_forest.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')

    def load_model(self, path_prefix='models/'):
        """
        Load all model components
        """
        self.best_general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.best_skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.best_rf = joblib.load(f'{path_prefix}random_forest.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')

# Example usage:
if __name__ == "__main__":
    # Initialize model
    model = SalaryPredictionModel()
    
    # Assuming you have your data and skill categories ready
    # df = your_dataframe
    # skill_categories = your_skill_categories
    
    # Create features
    feature_matrix = model.create_dual_tfidf_features(df, skill_categories)
    
    # Train model with cross validation
    results = model.train_model(feature_matrix, df['salary'])
    
    # Print results
    print(f"\nModel Performance:")
    print(f"Mean R-squared across folds: {results['cv_r2_mean']:.3f}")
    print(f"Standard Deviation of R-squared: {results['cv_r2_std']:.3f}")
    print(f"Mean RMSE across folds: ${results['cv_rmse_mean']:.2f}")
    print(f"Standard Deviation of RMSE: ${results['cv_rmse_std']:.2f}")
    print(f"\nBest Parameters:")
    print(results['best_params'])
    
    print("\nTop 15 Most Important Features:")
    print(results['feature_importance'].head(15))

Tuning general TF-IDF parameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


General TF-IDF Cross Validation Results:
Best CV R-squared Score: 0.457
Standard Deviation: 0.010

Tuning skill-specific TF-IDF parameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Skill TF-IDF Cross Validation Results:
Best CV R-squared Score: 0.288
Standard Deviation: 0.025

Processing full dataset with tuned parameters...

Best General TF-IDF Parameters:
{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__binary': False, 'tfidf__max_df': 0.9, 'tfidf__min_df': 0.05, 'tfidf__ngram_range': (1, 2)}

Best Skill TF-IDF Parameters:
{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__binary': True, 'tfidf__max_df': 0.8, 'tfidf__min_df': 0.01, 'tfidf__ngram_range': (1, 2)}
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Model Performance:
Mean R-squared across folds: 0.623
Standard D

## TF-IDF * Random Forest with Additional Stopwords

In [129]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
from nltk.corpus import stopwords

class SalaryPredictionModel:
    def __init__(self):
        self.best_general_tfidf = None
        self.best_skill_tfidf = None
        self.best_rf = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.cv = KFold(n_splits=5, shuffle=True, random_state=42)
        
        # Define high-frequency but low-predictive words
        additional_stopwords = [
            'work', 'experience', 'team', 'include', 'service', 'customer',
            'provide', 'skill', 'year', 'job', 'support', 'opportunity',
            'position', 'business', 'employee', 'company', 'benefit',
            'require', 'ability', 'candidate', 'requirement', 'qualified',
            'responsibilities', 'duties', 'role',
            'years', 'including', 'must', 'will', 'new', 'looking', 
            'seeking', 'ideal', 'competitive', 'excellent', 'strong', 
            'great', 'minimum', 'preferred', 'qualification', 'knowledge', 
            'related', 'professional', 'proficiency', 'based', 'remote', 
            'hybrid', 'office', 'requirements', 'skills'
        ]
        
        # Combine with standard stopwords
        self.custom_stopwords = list(set(stopwords.words('english')).union(additional_stopwords))

    def create_dual_tfidf_features(self, df, skill_categories):
        """
        Create TF-IDF features from both general text and skill-specific vocabulary with separate grid searches
        and cross validation using R-squared as the scoring metric
        """
        # Sample subset for initial tuning
        sample_size = int(len(df) * 0.2)
        df_sample = df.sample(n=sample_size, random_state=42)
        
        # Combine text features
        combined_text = df_sample['cleaned_title'] + ' ' + df_sample['cleaned_description']
        
        # 1. General TF-IDF Pipeline with custom stopwords
        general_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=self.custom_stopwords)),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # General TF-IDF parameter grid
        general_param_grid = {
            'tfidf__ngram_range': [(1, 2), (1, 3)],
            'tfidf__min_df': [0.01, 0.02, 0.05],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False, True],
            'rf__n_estimators': [100],
            'rf__max_depth': [10],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # 2. Create skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        # Remove any stopwords from skill vocabulary
        skill_vocabulary = skill_vocabulary - set(self.custom_stopwords)
        
        # Skill-specific TF-IDF Pipeline with custom stopwords
        skill_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                vocabulary=list(skill_vocabulary),
                stop_words=self.custom_stopwords
            )),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # Skill-specific parameter grid
        skill_param_grid = {
            'tfidf__ngram_range': [(1, 2), (1, 3)],
            'tfidf__min_df': [0.01, 0.02, 0.05],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False, True],
            'rf__n_estimators': [100],
            'rf__max_depth': [10],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # Tune general TF-IDF with cross validation
        print("Tuning general TF-IDF parameters...")
        general_grid = GridSearchCV(
            general_pipeline,
            general_param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'
        )
        general_grid.fit(combined_text, df_sample['salary'])
        
        # Print cross validation results for general TF-IDF
        print("\nGeneral TF-IDF Cross Validation Results:")
        cv_results = pd.DataFrame(general_grid.cv_results_)
        print(f"Best CV R-squared Score: {general_grid.best_score_:.3f}")
        print(f"Standard Deviation: {cv_results.loc[general_grid.best_index_, 'std_test_score']:.3f}")
        
        # Tune skill-specific TF-IDF with cross validation
        print("\nTuning skill-specific TF-IDF parameters...")
        skill_grid = GridSearchCV(
            skill_pipeline,
            skill_param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'
        )
        skill_grid.fit(combined_text, df_sample['salary'])
        
        # Print cross validation results for skill TF-IDF
        print("\nSkill TF-IDF Cross Validation Results:")
        cv_results = pd.DataFrame(skill_grid.cv_results_)
        print(f"Best CV R-squared Score: {skill_grid.best_score_:.3f}")
        print(f"Standard Deviation: {cv_results.loc[skill_grid.best_index_, 'std_test_score']:.3f}")
        
        # Process full dataset
        print("\nProcessing full dataset with tuned parameters...")
        combined_text_full = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # Generate general TF-IDF features
        self.best_general_tfidf = general_grid.best_estimator_.named_steps['tfidf']
        general_matrix = self.best_general_tfidf.fit_transform(combined_text_full)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.best_general_tfidf.get_feature_names_out()]
        )
        
        # Generate skill-specific TF-IDF features
        self.best_skill_tfidf = skill_grid.best_estimator_.named_steps['tfidf']
        skill_matrix = self.best_skill_tfidf.fit_transform(combined_text_full)
        skill_features = pd.DataFrame(
            skill_matrix.toarray(),
            columns=[f'skill_{f}' for f in self.best_skill_tfidf.get_feature_names_out()]
        )
        
        # Print best parameters for both grids
        print("\nBest General TF-IDF Parameters:")
        print(general_grid.best_params_)
        print("\nBest Skill TF-IDF Parameters:")
        print(skill_grid.best_params_)
        
        # Combine features
        feature_df = pd.concat([general_features, skill_features], axis=1)
        
        # Add category-level aggregated features
        for category, info in skill_categories.items():
            keyword_cols = [col for col in skill_features.columns 
                          if any(keyword in col for keyword in info['keywords'])]
            context_cols = [col for col in skill_features.columns 
                          if any(context in col for context in info['context_required'])]
            
            if keyword_cols:
                feature_df[f'{category}_keyword_score'] = skill_features[keyword_cols].sum(axis=1) * 2
            if context_cols:
                feature_df[f'{category}_context_score'] = skill_features[context_cols].sum(axis=1)
        
        # Add encoded categorical features
        feature_df['experience_level_encoded'] = self.label_encoder.fit_transform(df['experience_level'])
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'])
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'])
        
        # Add location features
        feature_df['state'] = df['location'].apply(lambda x: x.split(',')[-1].strip())
        feature_df['state_encoded'] = self.label_encoder.fit_transform(feature_df['state'])
        
        # Add interaction features
        feature_df['exp_industry'] = feature_df['experience_level_encoded'] * feature_df['industry_encoded']
        
        return feature_df

    def train_model(self, feature_matrix, target):
        """
        Train the final model using cross validation with R-squared as the main metric
        """
        # Prepare features
        X = feature_matrix.drop(['state'], axis=1)
        y = target

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Define parameter grid for final model
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', 'log2']
        }

        # Perform grid search with cross validation
        rf_grid = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'
        )
        
        rf_grid.fit(X_scaled, y)
        
        # Store best model
        self.best_rf = rf_grid.best_estimator_

        # Perform cross validation on best model
        r2_scores = cross_val_score(
            self.best_rf,
            X_scaled,
            y,
            cv=self.cv,
            scoring='r2',
            n_jobs=-1
        )
        
        # Also calculate RMSE for reference
        rmse_scores = np.sqrt(-cross_val_score(
            self.best_rf,
            X_scaled,
            y,
            cv=self.cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1
        ))
        
        # Calculate feature importance
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.best_rf.feature_importances_
        }).sort_values('importance', ascending=False)

        # Final evaluation metrics
        final_results = {
            'cv_r2_mean': r2_scores.mean(),
            'cv_r2_std': r2_scores.std(),
            'cv_rmse_mean': rmse_scores.mean(),
            'cv_rmse_std': rmse_scores.std(),
            'best_params': rf_grid.best_params_,
            'feature_importance': feature_importance,
            'cv_r2_scores': r2_scores,
            'cv_rmse_scores': rmse_scores
        }

        return final_results

    def save_model(self, path_prefix='models/'):
        """
        Save all model components
        """
        joblib.dump(self.best_general_tfidf, f'{path_prefix}general_tfidf.joblib')
        joblib.dump(self.best_skill_tfidf, f'{path_prefix}skill_tfidf.joblib')
        joblib.dump(self.best_rf, f'{path_prefix}random_forest.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')
        joblib.dump(self.custom_stopwords, f'{path_prefix}custom_stopwords.joblib')

    def load_model(self, path_prefix='models/'):
        """
        Load all model components
        """
        self.best_general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.best_skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.best_rf = joblib.load(f'{path_prefix}random_forest.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')
        self.custom_stopwords = joblib.load(f'{path_prefix}custom_stopwords.joblib')

# Example usage:
if __name__ == "__main__":
    # Initialize model
    model = SalaryPredictionModel()
    
    # Print the number of stopwords being used
    print(f"Number of custom stopwords: {len(model.custom_stopwords)}")
    
    # Create features
    feature_matrix = model.create_dual_tfidf_features(df, skill_categories)
    
    # Train model with cross validation
    results = model.train_model(feature_matrix, df['salary'])
    
    # Print results
    print(f"\nModel Performance:")
    print(f"Mean R-squared across folds: {results['cv_r2_mean']:.3f}")
    print(f"Standard Deviation of R-squared: {results['cv_r2_std']:.3f}")
    print(f"Mean RMSE across folds: ${results['cv_rmse_mean']:.2f}")
    print(f"Standard Deviation of RMSE: ${results['cv_rmse_std']:.2f}")
    
    print("\nTop 15 Most Important Features:")
    print(results['feature_importance'].head(15))

Number of custom stopwords: 228
Tuning general TF-IDF parameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

General TF-IDF Cross Validation Results:
Best CV R-squared Score: 0.451
Standard Deviation: 0.011

Tuning skill-specific TF-IDF parameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Skill TF-IDF Cross Validation Results:
Best CV R-squared Score: 0.280
Standard Deviation: 0.019

Processing full dataset with tuned parameters...

Best General TF-IDF Parameters:
{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__binary': False, 'tfidf__max_df': 0.7, 'tfidf__min_df': 0.05, 'tfidf__ngram_range': (1, 2)}

Best Skill TF-IDF Parameters:
{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__binary': True, 'tfidf__max_df': 0.7, 'tfidf__min_df': 0.01, 'tfidf__ngram_range': (1, 2

## TF-IDF * Random Forest with BERT embedding in Job Title

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

class SalaryPredictionModel:
    def __init__(self):
        self.best_general_tfidf = None
        self.best_skill_tfidf = None
        self.best_rf = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.cv = KFold(n_splits=5, shuffle=True, random_state=42)
        
        # Initialize BERT model and tokenizer
        self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
        self.bert_model.eval()  # Set to evaluation mode

    def get_bert_embeddings(self, texts):
        """
        Generate BERT embeddings for a list of texts
        """
        embeddings = []
        
        with torch.no_grad():
            for text in texts:
                # Tokenize and encode text
                encoded = self.bert_tokenizer(
                    text,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors='pt'
                )
                
                # Get BERT output
                outputs = self.bert_model(**encoded)
                
                # Use [CLS] token embedding (first token)
                embedding = outputs.last_hidden_state[:, 0, :].numpy()
                embeddings.append(embedding[0])
                
        return np.array(embeddings)

    def create_dual_tfidf_features(self, df, skill_categories):
        """
        Create TF-IDF features and BERT embeddings
        """
        # Get BERT embeddings for job titles
        print("Generating BERT embeddings for job titles...")
        bert_embeddings = self.get_bert_embeddings(df['cleaned_title'])
        bert_features = pd.DataFrame(
            bert_embeddings,
            columns=[f'bert_dim_{i}' for i in range(bert_embeddings.shape[1])]
        )
        
        # Generate TF-IDF features using full dataset
        print("Generating TF-IDF features...")
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # 1. General TF-IDF Pipeline
        general_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words="english")),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # General TF-IDF parameter grid
        general_param_grid = {
            'tfidf__ngram_range': [(1, 2), (1, 3)],
            'tfidf__min_df': [0.01, 0.02, 0.05],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False, True],
            'rf__n_estimators': [100],
            'rf__max_depth': [10],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # 2. Create skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        # Skill-specific TF-IDF Pipeline
        skill_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                vocabulary=list(skill_vocabulary),
                stop_words="english"
            )),
            ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # Skill-specific parameter grid
        skill_param_grid = {
            'tfidf__ngram_range': [(1, 2), (1, 3)],
            'tfidf__min_df': [0.01, 0.02, 0.03],
            'tfidf__max_df': [0.7, 0.8, 0.9],
            'tfidf__binary': [False, True],
            'rf__n_estimators': [100],
            'rf__max_depth': [10],
            'rf__min_samples_split': [5],
            'rf__min_samples_leaf': [2],
            'rf__max_features': ['sqrt']
        }
        
        # Tune general TF-IDF with cross validation
        print("Tuning general TF-IDF parameters...")
        general_grid = GridSearchCV(
            general_pipeline,
            general_param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'
        )
        general_grid.fit(combined_text, df['salary'])
        
        # Print cross validation results for general TF-IDF
        print("\nGeneral TF-IDF Cross Validation Results:")
        cv_results = pd.DataFrame(general_grid.cv_results_)
        print(f"Best CV R-squared Score: {general_grid.best_score_:.3f}")
        print(f"Standard Deviation: {cv_results.loc[general_grid.best_index_, 'std_test_score']:.3f}")
        
        # Tune skill-specific TF-IDF with cross validation
        print("\nTuning skill-specific TF-IDF parameters...")
        skill_grid = GridSearchCV(
            skill_pipeline,
            skill_param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'
        )
        skill_grid.fit(combined_text, df['salary'])
        
        # Print cross validation results for skill TF-IDF
        print("\nSkill TF-IDF Cross Validation Results:")
        cv_results = pd.DataFrame(skill_grid.cv_results_)
        print(f"Best CV R-squared Score: {skill_grid.best_score_:.3f}")
        print(f"Standard Deviation: {cv_results.loc[skill_grid.best_index_, 'std_test_score']:.3f}")
        
        # Generate features for full dataset using best parameters
        print("\nProcessing full dataset with tuned parameters...")
        
        # Generate general TF-IDF features
        self.best_general_tfidf = general_grid.best_estimator_.named_steps['tfidf']
        general_matrix = self.best_general_tfidf.fit_transform(combined_text)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.best_general_tfidf.get_feature_names_out()]
        )
        
        # Generate skill-specific TF-IDF features
        self.best_skill_tfidf = skill_grid.best_estimator_.named_steps['tfidf']
        skill_matrix = self.best_skill_tfidf.fit_transform(combined_text)
        skill_features = pd.DataFrame(
            skill_matrix.toarray(),
            columns=[f'skill_{f}' for f in self.best_skill_tfidf.get_feature_names_out()]
        )
        
        # Print best parameters
        print("\nBest General TF-IDF Parameters:")
        print(general_grid.best_params_)
        print("\nBest Skill TF-IDF Parameters:")
        print(skill_grid.best_params_)
        
        # Combine all features
        feature_df = pd.concat([
            general_features,
            skill_features,
            bert_features
        ], axis=1)
        
        # Add category-level aggregated features
        for category, info in skill_categories.items():
            keyword_cols = [col for col in skill_features.columns 
                          if any(keyword in col for keyword in info['keywords'])]
            context_cols = [col for col in skill_features.columns 
                          if any(context in col for context in info['context_required'])]
            
            if keyword_cols:
                feature_df[f'{category}_keyword_score'] = skill_features[keyword_cols].sum(axis=1) * 2
            if context_cols:
                feature_df[f'{category}_context_score'] = skill_features[context_cols].sum(axis=1)
        
        # Add encoded categorical features
        feature_df['experience_level_encoded'] = self.label_encoder.fit_transform(df['experience_level'])
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'])
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'])
        
        # Add location features
        feature_df['state'] = df['location'].apply(lambda x: x.split(',')[-1].strip())
        feature_df['state_encoded'] = self.label_encoder.fit_transform(feature_df['state'])
        
        # Add interaction features
        feature_df['exp_industry'] = feature_df['experience_level_encoded'] * feature_df['industry_encoded']
        
        return feature_df

    def train_model(self, feature_matrix, target):
        """
        Train model with enhanced feature set including BERT embeddings
        """
        # Prepare features
        X = feature_matrix.drop(['state'], axis=1)
        y = target

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Enhanced parameter grid for final model
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', 'log2'],
        }

        # Grid search with cross validation
        rf_grid = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            param_grid,
            cv=self.cv,
            n_jobs=-1,
            verbose=1,
            scoring='r2'
        )
        
        rf_grid.fit(X_scaled, y)
        self.best_rf = rf_grid.best_estimator_

        # Cross validation metrics
        r2_scores = cross_val_score(
            self.best_rf,
            X_scaled,
            y,
            cv=self.cv,
            scoring='r2',
            n_jobs=-1
        )
        
        rmse_scores = np.sqrt(-cross_val_score(
            self.best_rf,
            X_scaled,
            y,
            cv=self.cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1
        ))
        
        # Calculate feature importance with BERT features
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.best_rf.feature_importances_
        }).sort_values('importance', ascending=False)

        # Aggregate importance of BERT features
        bert_importance = feature_importance[
            feature_importance['feature'].str.startswith('bert_')
        ]['importance'].sum()

        final_results = {
            'cv_r2_mean': r2_scores.mean(),
            'cv_r2_std': r2_scores.std(),
            'cv_rmse_mean': rmse_scores.mean(),
            'cv_rmse_std': rmse_scores.std(),
            'best_params': rf_grid.best_params_,
            'feature_importance': feature_importance,
            'bert_importance': bert_importance,
            'cv_r2_scores': r2_scores,
            'cv_rmse_scores': rmse_scores
        }

        return final_results

    def save_model(self, path_prefix='models/'):
        """
        Save all model components including BERT
        """
        joblib.dump(self.best_general_tfidf, f'{path_prefix}general_tfidf.joblib')
        joblib.dump(self.best_skill_tfidf, f'{path_prefix}skill_tfidf.joblib')
        joblib.dump(self.best_rf, f'{path_prefix}random_forest.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')
        # BERT models are saved separately using their own save methods
        self.bert_tokenizer.save_pretrained(f'{path_prefix}bert_tokenizer')
        self.bert_model.save_pretrained(f'{path_prefix}bert_model')

    def load_model(self, path_prefix='models/'):
        """
        Load all model components including BERT
        """
        self.best_general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.best_skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.best_rf = joblib.load(f'{path_prefix}random_forest.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')
        # Load BERT models
        self.bert_tokenizer = AutoTokenizer.from_pretrained(f'{path_prefix}bert_tokenizer')
        self.bert_model = AutoModel.from_pretrained(f'{path_prefix}bert_model')
        self.bert_model.eval()

# Example usage:
if __name__ == "__main__":
    # Initialize model
    model = SalaryPredictionModel()
    
    # Create features with BERT embeddings
    feature_matrix = model.create_dual_tfidf_features(df, skill_categories)
    
    # Train model
    results = model.train_model(feature_matrix, df['salary'])
    
    # Print results
    print(f"\nModel Performance:")
    print(f"Mean R-squared across folds: {results['cv_r2_mean']:.3f}")
    print(f"Standard Deviation of R-squared: {results['cv_r2_std']:.3f}")
    print(f"Mean RMSE across folds: ${results['cv_rmse_mean']:.2f}")
    print(f"Standard Deviation of RMSE: ${results['cv_rmse_std']:.2f}")
    print(f"\nBERT Features Total Importance: {results['bert_importance']:.3f}")
    print(f"\nBest Parameters:")
    print(results['best_params'])
    
    print("\nTop 15 Most Important Features:")
    print(results['feature_importance'].head(15))

Generating BERT embeddings for job titles...
Generating TF-IDF features...
Tuning general TF-IDF parameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


General TF-IDF Cross Validation Results:
Best CV R-squared Score: 0.488
Standard Deviation: 0.002

Tuning skill-specific TF-IDF parameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Skill TF-IDF Cross Validation Results:
Best CV R-squared Score: 0.296
Standard Deviation: 0.005

Processing full dataset with tuned parameters...

Best General TF-IDF Parameters:
{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__binary': False, 'tfidf__max_df': 0.9, 'tfidf__min_df': 0.05, 'tfidf__ngram_range': (1, 2)}

Best Skill TF-IDF Parameters:
{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__binary': True, 'tfidf__max_df': 0.7, 'tfidf__min_df': 0.01, 'tfidf__ngram_range': (1, 2)}
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Model Performance:
Mean R-squared across folds: 0.619
Standard Deviation of R-squared: 0.007
Mean RMSE across folds: $26256.66
Standard Deviation of RMSE: $207.74

BERT Features Total Importance: 0.528

Best Parameters:
{'max_depth': 20, 'max_features': 'sqrt', 'min_sa

## TF-IDF * Neural Network

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class SalaryDataset(Dataset):
    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets) if targets is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.targets is not None:
            return self.features[idx], self.targets[idx]
        return self.features[idx]

class SalaryNeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(SalaryNeuralNet, self).__init__()
        
        # Process TF-IDF features
        self.tfidf_fc = nn.Linear(input_dim, 256)
        self.dropout = nn.Dropout(0.1)
        
        # Regressor matches the structure from the reference model
        self.regressor = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        # Process TF-IDF features
        x = self.tfidf_fc(x)
        x = self.dropout(x)
        x = torch.relu(x)
        
        # Final regression
        return self.regressor(x).squeeze()

class SalaryPredictionModel:
    def __init__(self):
        self.general_tfidf = None
        self.skill_tfidf = None
        self.neural_net = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.salary_scaler = StandardScaler()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def create_features(self, df, skill_categories):
        """Create TF-IDF and other features"""
        print("Generating TF-IDF features...")
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # General TF-IDF
        self.general_tfidf = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            min_df=0.02,
            max_df=0.8
        )
        general_matrix = self.general_tfidf.fit_transform(combined_text)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.general_tfidf.get_feature_names_out()]
        )
        
        # Skill-specific TF-IDF
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        self.skill_tfidf = TfidfVectorizer(
            vocabulary=list(skill_vocabulary),
            stop_words="english",
            ngram_range=(1, 2)
        )
        skill_matrix = self.skill_tfidf.fit_transform(combined_text)
        skill_features = pd.DataFrame(
            skill_matrix.toarray(),
            columns=[f'skill_{f}' for f in self.skill_tfidf.get_feature_names_out()]
        )
        
        # Combine all features
        feature_df = pd.concat([
            general_features,
            skill_features
        ], axis=1)
        
        # Add encoded categorical features
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'].fillna('UNKNOWN'))
        feature_df['company_encoded'] = self.label_encoder.fit_transform(df['company'].fillna('UNKNOWN'))
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'].fillna('UNKNOWN'))
        
        return feature_df

    def prepare_data(self, feature_matrix, target, batch_size=16):
        """Prepare data for training"""
        # Scale features
        X_scaled = self.scaler.fit_transform(feature_matrix)
        y_scaled = self.salary_scaler.fit_transform(target.values.reshape(-1, 1)).ravel()
        
        # Create dataset
        dataset = SalaryDataset(X_scaled, y_scaled)
        
        # Split data
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, [train_size, val_size]
        )
        
        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        return train_loader, val_loader

    def train_model(self, feature_matrix, target, epochs=5, batch_size=16, learning_rate=2e-5):
        """Train neural network model"""
        # Prepare data
        train_loader, val_loader = self.prepare_data(feature_matrix, target, batch_size)
        
        # Initialize model
        self.neural_net = SalaryNeuralNet(input_dim=feature_matrix.shape[1]).to(self.device)
        
        optimizer = torch.optim.AdamW(self.neural_net.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()
        
        best_val_loss = float('inf')
        training_history = {'train_loss': [], 'val_loss': []}
        
        for epoch in range(epochs):
            # Training
            self.neural_net.train()
            train_loss = 0
            train_batches = 0
            
            for features, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
                features = features.to(self.device)
                labels = labels.to(self.device)
                
                optimizer.zero_grad()
                outputs = self.neural_net(features)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
                train_batches += 1
            
            avg_train_loss = train_loss / train_batches
            training_history['train_loss'].append(avg_train_loss)
            
            # Validation
            self.neural_net.eval()
            val_loss = 0
            val_batches = 0
            val_preds = []
            val_true = []
            
            with torch.no_grad():
                for features, labels in val_loader:
                    features = features.to(self.device)
                    labels = labels.to(self.device)
                    
                    outputs = self.neural_net(features)
                    loss = criterion(outputs, labels)
                    
                    val_loss += loss.item()
                    val_batches += 1
                    
                    val_preds.extend(outputs.cpu().numpy())
                    val_true.extend(labels.cpu().numpy())
            
            avg_val_loss = val_loss / val_batches
            training_history['val_loss'].append(avg_val_loss)
            
            # Transform predictions back to original scale
            val_preds = self.salary_scaler.inverse_transform(np.array(val_preds).reshape(-1, 1))
            val_true = self.salary_scaler.inverse_transform(np.array(val_true).reshape(-1, 1))
            
            # Calculate metrics
            rmse = np.sqrt(mean_squared_error(val_true, val_preds))
            mae = mean_absolute_error(val_true, val_preds)
            r2 = r2_score(val_true, val_preds)
            
            print(f'Epoch {epoch + 1}:')
            print(f'Average training loss: {avg_train_loss:.4f}')
            print(f'Average validation loss: {avg_val_loss:.4f}')
            print(f'RMSE: ${rmse:,.2f}')
            print(f'MAE: ${mae:,.2f}')
            print(f'R2 Score: {r2:.4f}\n')
            
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(self.neural_net.state_dict(), 'best_salary_predictor.pth')
        
        return {
            'training_history': training_history,
            'final_metrics': {
                'RMSE': rmse,
                'MAE': mae,
                'R2': r2
            }
        }

    def save_model(self, path_prefix='models/'):
        """Save all model components"""
        joblib.dump(self.general_tfidf, f'{path_prefix}general_tfidf.joblib')
        joblib.dump(self.skill_tfidf, f'{path_prefix}skill_tfidf.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')
        joblib.dump(self.salary_scaler, f'{path_prefix}salary_scaler.joblib')
        torch.save(self.neural_net.state_dict(), f'{path_prefix}neural_net.pt')

    def load_model(self, path_prefix='models/'):
        """Load all model components"""
        self.general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')
        self.salary_scaler = joblib.load(f'{path_prefix}salary_scaler.joblib')
        self.neural_net.load_state_dict(torch.load(f'{path_prefix}neural_net.pt'))
        self.neural_net.eval()

# Example usage:
if __name__ == "__main__":
    # Load your data
    df = df
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    
    # Initialize model
    model = SalaryPredictionModel()
    
    # Create features
    feature_matrix = model.create_features(df, skill_categories)
    
    # Train model
    print("Training model...")
    results = model.train_model(feature_matrix, df['salary'])
    
    # Print final results
    print("\nFinal Model Performance:")
    print(f"RMSE: ${results['final_metrics']['RMSE']:,.2f}")
    print(f"MAE: ${results['final_metrics']['MAE']:,.2f}")
    print(f"R2 Score: {results['final_metrics']['R2']:.4f}")

Using device: cpu
Generating TF-IDF features...
Training model...


Epoch 1/5: 100%|██████████| 1205/1205 [00:03<00:00, 355.42it/s]


Epoch 1:
Average training loss: 0.5830
Average validation loss: 0.3736
RMSE: $26,035.52
MAE: $19,675.51
R2 Score: 0.6301



Epoch 2/5: 100%|██████████| 1205/1205 [00:03<00:00, 329.47it/s]


Epoch 2:
Average training loss: 0.3084
Average validation loss: 0.3255
RMSE: $24,301.98
MAE: $18,126.24
R2 Score: 0.6777



Epoch 3/5: 100%|██████████| 1205/1205 [00:03<00:00, 397.90it/s]


Epoch 3:
Average training loss: 0.2388
Average validation loss: 0.3093
RMSE: $23,688.14
MAE: $17,275.88
R2 Score: 0.6938



Epoch 4/5: 100%|██████████| 1205/1205 [00:02<00:00, 415.10it/s]


Epoch 4:
Average training loss: 0.1930
Average validation loss: 0.3029
RMSE: $23,440.87
MAE: $16,985.59
R2 Score: 0.7002



Epoch 5/5: 100%|██████████| 1205/1205 [00:02<00:00, 420.30it/s]


Epoch 5:
Average training loss: 0.1572
Average validation loss: 0.3000
RMSE: $23,328.39
MAE: $16,706.25
R2 Score: 0.7030


Final Model Performance:
RMSE: $23,328.39
MAE: $16,706.25
R2 Score: 0.7030


## TF-IDF General Hyper-Parameter Tunning * Neural Network

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from itertools import product
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class SalaryDataset(Dataset):
    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets) if targets is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.targets is not None:
            return self.features[idx], self.targets[idx]
        return self.features[idx]

class SalaryNeuralNet(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout_rate=0.1):
        super(SalaryNeuralNet, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        for dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, dim),
                nn.ReLU(),
                nn.BatchNorm1d(dim),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = dim
        
        # Output layer
        layers.append(nn.Linear(prev_dim, 1))
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x).squeeze()

class SalaryPredictionModel:
    def __init__(self):
        self.general_tfidf = None
        self.skill_tfidf = None
        self.neural_net = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.salary_scaler = StandardScaler()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Best parameters storage
        self.best_general_tfidf_params = None
        self.best_skill_tfidf_params = None
        self.best_nn_params = None

    def prepare_data(self, feature_matrix, target, batch_size=16):
        """Prepare data for training"""
        # Scale features
        X_scaled = self.scaler.fit_transform(feature_matrix)
        y_scaled = self.salary_scaler.fit_transform(target.values.reshape(-1, 1)).ravel()
        
        # Create dataset
        dataset = SalaryDataset(X_scaled, y_scaled)
        
        # Split data
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, [train_size, val_size]
        )
        
        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        return train_loader, val_loader

    def tune_tfidf_parameters(self, df, skill_categories):
        """Tune TF-IDF parameters separately for general and skill-specific vectorizers"""
        print("Tuning TF-IDF parameters...")
        
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # TF-IDF parameter grid
        general_param_grid = {
            'ngram_range': [(1, 1), (1, 2), (1, 3)],
            'min_df': [0.01, 0.02, 0.05],
            'max_df': [0.7, 0.8, 0.9],
            'max_features': [None, 1000, 5000]
        }
        
        skill_param_grid = {
            'ngram_range': [(1, 1), (1, 2)],
            'min_df': [0.01, 0.02],
            'max_df': [0.8, 0.9]
        }
        
        # Tune general TF-IDF
        print("\nTuning general TF-IDF parameters...")
        best_general_score = float('-inf')
        best_general_params = None
        
        param_combinations = [dict(zip(general_param_grid.keys(), v)) 
                            for v in product(*general_param_grid.values())]
        
        for params in tqdm(param_combinations, desc="Testing general TF-IDF parameters"):
            try:
                general_tfidf = TfidfVectorizer(
                    stop_words="english",
                    **params
                )
                
                feature_matrix = general_tfidf.fit_transform(combined_text)
                n_features = feature_matrix.shape[1]
                sparsity = 1.0 - (feature_matrix.nnz / (feature_matrix.shape[0] * feature_matrix.shape[1]))
                score = n_features * (1 - sparsity)
                
                if score > best_general_score:
                    best_general_score = score
                    best_general_params = params
                    print(f"\nNew best general parameters found:")
                    print(f"Parameters: {params}")
                    print(f"Features: {n_features}")
                    print(f"Sparsity: {sparsity:.4f}")
                    print(f"Score: {score:.4f}")
            
            except Exception as e:
                print(f"Error with parameters {params}: {str(e)}")
                continue
        
        # Tune skill-specific TF-IDF
        print("\nTuning skill-specific TF-IDF parameters...")
        best_skill_score = float('-inf')
        best_skill_params = None
        
        # Create skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        param_combinations = [dict(zip(skill_param_grid.keys(), v)) 
                            for v in product(*skill_param_grid.values())]
        
        for params in tqdm(param_combinations, desc="Testing skill TF-IDF parameters"):
            try:
                skill_tfidf = TfidfVectorizer(
                    vocabulary=list(skill_vocabulary),
                    stop_words="english",
                    **params
                )
                
                feature_matrix = skill_tfidf.fit_transform(combined_text)
                n_features = feature_matrix.shape[1]
                sparsity = 1.0 - (feature_matrix.nnz / (feature_matrix.shape[0] * feature_matrix.shape[1]))
                score = n_features * (1 - sparsity)
                
                if score > best_skill_score:
                    best_skill_score = score
                    best_skill_params = params
                    print(f"\nNew best skill parameters found:")
                    print(f"Parameters: {params}")
                    print(f"Features: {n_features}")
                    print(f"Sparsity: {sparsity:.4f}")
                    print(f"Score: {score:.4f}")
            
            except Exception as e:
                print(f"Error with parameters {params}: {str(e)}")
                continue
        
        self.best_general_tfidf_params = best_general_params
        self.best_skill_tfidf_params = best_skill_params
        
        return {
            'general': best_general_params,
            'skill': best_skill_params
        }

    def tune_neural_network(self, feature_matrix, target, max_trials=10):
        """Tune neural network hyperparameters"""
        print("Tuning neural network hyperparameters...")
        
        # Neural network parameter grid
        nn_param_grid = {
            'hidden_dims': [
                [256, 128, 64],
                [512, 256, 128],
                [256, 256, 128],
                [512, 256, 128, 64]
            ],
            'dropout_rate': [0.1, 0.2, 0.3],
            'learning_rate': [1e-4, 2e-4, 5e-4],
            'batch_size': [16, 32, 64]
        }
        
        best_val_loss = float('inf')
        best_params = None
        
        # Generate parameter combinations
        param_combinations = [dict(zip(nn_param_grid.keys(), v)) 
                            for v in product(*nn_param_grid.values())]
        
        # Randomly sample from parameter combinations if there are too many
        if len(param_combinations) > max_trials:
            param_combinations = np.random.choice(
                param_combinations, 
                size=max_trials, 
                replace=False
            )
        
        for params in tqdm(param_combinations, desc="Testing NN parameters"):
            try:
                # Prepare data with current batch size
                train_loader, val_loader = self.prepare_data(
                    feature_matrix, 
                    target, 
                    batch_size=params['batch_size']
                )
                
                # Initialize model with current parameters
                self.neural_net = SalaryNeuralNet(
                    input_dim=feature_matrix.shape[1],
                    hidden_dims=params['hidden_dims'],
                    dropout_rate=params['dropout_rate']
                ).to(self.device)
                
                # Train for a few epochs to evaluate parameters
                results = self.train_model(
                    feature_matrix,
                    target,
                    epochs=5,
                    batch_size=params['batch_size'],
                    learning_rate=params['learning_rate'],
                    is_tuning=True
                )
                
                val_loss = min(results['training_history']['val_loss'])
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_params = params
                    print(f"\nNew best parameters found:")
                    print(f"Parameters: {params}")
                    print(f"Validation Loss: {val_loss:.4f}")
            
            except Exception as e:
                print(f"Error with parameters {params}: {str(e)}")
                continue
        
        self.best_nn_params = best_params
        return best_params

    def create_features(self, df, skill_categories):
        """Create TF-IDF and other features using best parameters"""
        print("Generating TF-IDF features...")
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # Use tuned parameters if available for general TF-IDF
        general_params = self.best_general_tfidf_params or {
            'ngram_range': (1, 2),
            'min_df': 0.02,
            'max_df': 0.8
        }
        
        # General TF-IDF
        self.general_tfidf = TfidfVectorizer(
            stop_words="english",
            **general_params
        )
        general_matrix = self.general_tfidf.fit_transform(combined_text)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.general_tfidf.get_feature_names_out()]
        )
        
        # Use tuned parameters if available for skill TF-IDF
        skill_params = self.best_skill_tfidf_params or {
            'ngram_range': (1, 2),
            'min_df': 0.02,
            'max_df': 0.8
        }
        
        # Skill-specific TF-IDF
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        self.skill_tfidf = TfidfVectorizer(
            vocabulary=list(skill_vocabulary),
            stop_words="english",
            **skill_params
        )
        skill_matrix = self.skill_tfidf.fit_transform(combined_text)
        skill_features = pd.DataFrame(
            skill_matrix.toarray(),
            columns=[f'skill_{f}' for f in self.skill_tfidf.get_feature_names_out()]
        )
        
        # Combine features
        feature_df = pd.concat([
            general_features,
            skill_features
        ], axis=1)
        
        # Add encoded categorical features
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'].fillna('UNKNOWN'))
        feature_df['company_encoded'] = self.label_encoder.fit_transform(df['company'].fillna('UNKNOWN'))
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'].fillna('UNKNOWN'))
        
        return feature_df

    def train_model(self, feature_matrix, target, epochs=10, batch_size=None, 
                   learning_rate=None, is_tuning=False):
        """Train neural network model using best parameters"""
        # Use default parameters if none provided and not in tuning mode
        if not is_tuning:
            if self.best_nn_params:
                batch_size = batch_size or self.best_nn_params.get('batch_size', 16)
                learning_rate = learning_rate or self.best_nn_params.get('learning_rate', 2e-5)
            else:
                batch_size = batch_size or 16
                learning_rate = learning_rate or 2e-5
        
        # Prepare data
        train_loader, val_loader = self.prepare_data(feature_matrix, target, batch_size)
        
        # Initialize model if not already initialized
        if not self.neural_net or is_tuning:
            hidden_dims = self.best_nn_params.get('hidden_dims', [256, 128, 64]) if self.best_nn_params else [256, 128, 64]
            dropout_rate = self.best_nn_params.get('dropout_rate', 0.1) if self.best_nn_params else 0.1
            
            self.neural_net = SalaryNeuralNet(
                input_dim=feature_matrix.shape[1],
                hidden_dims=hidden_dims,
                dropout_rate=dropout_rate
            ).to(self.device)
        
        optimizer = torch.optim.AdamW(self.neural_net.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
        
        best_val_loss = float('inf')
        training_history = {'train_loss': [], 'val_loss': []}
        
        for epoch in range(epochs):
            # Training
            self.neural_net.train()
            train_loss = 0
            train_batches = 0
            
            for features, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
                features = features.to(self.device)
                labels = labels.to(self.device)
                
                optimizer.zero_grad()
                outputs = self.neural_net(features)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
                train_batches += 1
            
            avg_train_loss = train_loss / train_batches
            training_history['train_loss'].append(avg_train_loss)
            
            # Validation
            self.neural_net.eval()
            val_loss = 0
            val_batches = 0
            val_preds = []
            val_true = []
            
            with torch.no_grad():
                for features, labels in val_loader:
                    features = features.to(self.device)
                    labels = labels.to(self.device)
                    
                    outputs = self.neural_net(features)
                    loss = criterion(outputs, labels)
                    
                    val_loss += loss.item()
                    val_batches += 1
                    
                    val_preds.extend(outputs.cpu().numpy())
                    val_true.extend(labels.cpu().numpy())
            
            avg_val_loss = val_loss / val_batches
            training_history['val_loss'].append(avg_val_loss)
            
            # Update learning rate
            scheduler.step(avg_val_loss)
            
            if not is_tuning:
                # Transform predictions back to original scale
                val_preds = self.salary_scaler.inverse_transform(np.array(val_preds).reshape(-1, 1))
                val_true = self.salary_scaler.inverse_transform(np.array(val_true).reshape(-1, 1))
                
                # Calculate metrics
                rmse = np.sqrt(mean_squared_error(val_true, val_preds))
                mae = mean_absolute_error(val_true, val_preds)
                r2 = r2_score(val_true, val_preds)
                
                print(f'Epoch {epoch + 1}:')
                print(f'Average training loss: {avg_train_loss:.4f}')
                print(f'Average validation loss: {avg_val_loss:.4f}')
                print(f'RMSE: ${rmse:,.2f}')
                print(f'MAE: ${mae:,.2f}')
                print(f'R2 Score: {r2:.4f}\n')
            
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                if not is_tuning:
                    torch.save(self.neural_net.state_dict(), 'best_salary_predictor.pth')
        
        if not is_tuning:
            final_metrics = {
                'RMSE': rmse,
                'MAE': mae,
                'R2': r2
            }
        else:
            final_metrics = {}
        
        return {
            'training_history': training_history,
            'final_metrics': final_metrics
        }

    def save_model(self, path_prefix='models/'):
        """Save all model components"""
        import os
        os.makedirs(path_prefix, exist_ok=True)
        
        joblib.dump(self.general_tfidf, f'{path_prefix}general_tfidf.joblib')
        joblib.dump(self.skill_tfidf, f'{path_prefix}skill_tfidf.joblib')
        joblib.dump(self.scaler, f'{path_prefix}scaler.joblib')
        joblib.dump(self.label_encoder, f'{path_prefix}label_encoder.joblib')
        joblib.dump(self.salary_scaler, f'{path_prefix}salary_scaler.joblib')
        joblib.dump(self.best_general_tfidf_params, f'{path_prefix}best_general_tfidf_params.joblib')
        joblib.dump(self.best_skill_tfidf_params, f'{path_prefix}best_skill_tfidf_params.joblib')
        joblib.dump(self.best_nn_params, f'{path_prefix}best_nn_params.joblib')
        torch.save(self.neural_net.state_dict(), f'{path_prefix}neural_net.pt')

    def load_model(self, path_prefix='models/'):
        """Load all model components"""
        self.general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')
        self.salary_scaler = joblib.load(f'{path_prefix}salary_scaler.joblib')
        self.best_general_tfidf_params = joblib.load(f'{path_prefix}best_general_tfidf_params.joblib')
        self.best_skill_tfidf_params = joblib.load(f'{path_prefix}best_skill_tfidf_params.joblib')
        self.best_nn_params = joblib.load(f'{path_prefix}best_nn_params.joblib')
        
        # Initialize neural network with best parameters
        self.neural_net = SalaryNeuralNet(
            input_dim=self.scaler.n_features_in_,
            hidden_dims=self.best_nn_params['hidden_dims'],
            dropout_rate=self.best_nn_params['dropout_rate']
        ).to(self.device)
        self.neural_net.load_state_dict(torch.load(f'{path_prefix}neural_net.pt'))
        self.neural_net.eval()

    def predict(self, df):
        """Make predictions on new data"""
        # Create features
        features = self.create_features(df)
        
        # Scale features
        X_scaled = self.scaler.transform(features)
        
        # Convert to tensor
        X_tensor = torch.FloatTensor(X_scaled).to(self.device)
        
        # Make predictions
        self.neural_net.eval()
        with torch.no_grad():
            predictions = self.neural_net(X_tensor).cpu().numpy()
        
        # Scale back predictions
        predictions = self.salary_scaler.inverse_transform(predictions.reshape(-1, 1))
        
        return predictions.flatten()

# Example usage:
if __name__ == "__main__":
    # Load your data
    df = df  # Your DataFrame
    skill_categories = {
        # Your skill categories dictionary
    }
    
    # Initialize model
    model = SalaryPredictionModel()
    
    # Tune TF-IDF parameters
    print("Tuning TF-IDF parameters...")
    tfidf_params = model.tune_tfidf_parameters(df, skill_categories)
    print("\nBest general TF-IDF parameters:", tfidf_params['general'])
    print("Best skill TF-IDF parameters:", tfidf_params['skill'])
    
    # Create features using tuned TF-IDF parameters
    print("\nCreating features...")
    feature_matrix = model.create_features(df, skill_categories)
    
    # Tune neural network hyperparameters
    print("\nTuning neural network hyperparameters...")
    nn_params = model.tune_neural_network(feature_matrix, df['salary'])
    print("\nBest neural network parameters:", nn_params)
    
    # Train final model with best parameters
    print("\nTraining final model with best parameters...")
    results = model.train_model(
        feature_matrix,
        df['salary'],
        epochs=20  # Increase epochs for final training
    )
    
    # Print final results
    print("\nFinal Model Performance:")
    print(f"RMSE: ${results['final_metrics']['RMSE']:,.2f}")
    print(f"MAE: ${results['final_metrics']['MAE']:,.2f}")
    print(f"R2 Score: {results['final_metrics']['R2']:.4f}")
    
    # Save the model
    print("\nSaving model...")
    model.save_model()
    
    # Plot training history
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10, 6))
    plt.plot(results['training_history']['train_loss'], label='Training Loss')
    plt.plot(results['training_history']['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training History')
    plt.legend()
    plt.grid(True)
    plt.show()

Using device: cpu
Tuning TF-IDF parameters...
Tuning TF-IDF parameters...

Tuning general TF-IDF parameters...


Testing general TF-IDF parameters:   1%|          | 1/81 [00:03<04:01,  3.02s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 1), 'min_df': 0.01, 'max_df': 0.7, 'max_features': None}
Features: 2598
Sparsity: 0.9250
Score: 194.9667


Testing general TF-IDF parameters:   5%|▍         | 4/81 [00:11<03:45,  2.93s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 1), 'min_df': 0.01, 'max_df': 0.8, 'max_features': None}
Features: 2602
Sparsity: 0.9239
Score: 197.9733


Testing general TF-IDF parameters:   9%|▊         | 7/81 [00:20<03:36,  2.93s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 1), 'min_df': 0.01, 'max_df': 0.9, 'max_features': None}
Features: 2604
Sparsity: 0.9234
Score: 199.5817


Testing general TF-IDF parameters:  35%|███▍      | 28/81 [01:30<04:08,  4.69s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 2), 'min_df': 0.01, 'max_df': 0.7, 'max_features': None}
Features: 4955
Sparsity: 0.9483
Score: 256.2993


Testing general TF-IDF parameters:  38%|███▊      | 31/81 [01:58<06:13,  7.46s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 2), 'min_df': 0.01, 'max_df': 0.8, 'max_features': None}
Features: 4959
Sparsity: 0.9477
Score: 259.3060


Testing general TF-IDF parameters:  42%|████▏     | 34/81 [02:27<06:53,  8.80s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 2), 'min_df': 0.01, 'max_df': 0.9, 'max_features': None}
Features: 4961
Sparsity: 0.9474
Score: 260.9144


Testing general TF-IDF parameters:  68%|██████▊   | 55/81 [05:39<04:32, 10.49s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 3), 'min_df': 0.01, 'max_df': 0.7, 'max_features': None}
Features: 5536
Sparsity: 0.9510
Score: 271.3669


Testing general TF-IDF parameters:  72%|███████▏  | 58/81 [06:44<06:32, 17.09s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 3), 'min_df': 0.01, 'max_df': 0.8, 'max_features': None}
Features: 5540
Sparsity: 0.9505
Score: 274.3736


Testing general TF-IDF parameters:  75%|███████▌  | 61/81 [07:47<06:17, 18.85s/it]


New best general parameters found:
Parameters: {'ngram_range': (1, 3), 'min_df': 0.01, 'max_df': 0.9, 'max_features': None}
Features: 5542
Sparsity: 0.9502
Score: 275.9820


Testing general TF-IDF parameters: 100%|██████████| 81/81 [14:48<00:00, 10.97s/it]



Tuning skill-specific TF-IDF parameters...


Testing skill TF-IDF parameters: 100%|██████████| 8/8 [00:00<00:00, 12048.27it/s]


Error with parameters {'ngram_range': (1, 1), 'min_df': 0.01, 'max_df': 0.8}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 1), 'min_df': 0.01, 'max_df': 0.9}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 1), 'min_df': 0.02, 'max_df': 0.8}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 1), 'min_df': 0.02, 'max_df': 0.9}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 2), 'min_df': 0.01, 'max_df': 0.8}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 2), 'min_df': 0.01, 'max_df': 0.9}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 2), 'min_df': 0.02, 'max_df': 0.8}: empty vocabulary passed to fit
Error with parameters {'ngram_range': (1, 2), 'min_df': 0.02, 'max_df': 0.9}: empty vocabulary passed to fit

Best general TF-IDF parameters: {'ngram_range': (1, 3), 'min_df': 0.01, 'max_df': 0.9, 'max_features': None}
Best skill TF-IDF 

ValueError: empty vocabulary passed to fit

## TF-IDF skills Hyper-Parameter Tunning * Neural Network

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from itertools import product
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

class SalaryDataset(Dataset):
    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets) if targets is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.targets is not None:
            return self.features[idx], self.targets[idx]
        return self.features[idx]

class SalaryNeuralNet(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout_rate=0.1):
        super(SalaryNeuralNet, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        for dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, dim),
                nn.ReLU(),
                nn.BatchNorm1d(dim),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = dim
        
        # Output layer
        layers.append(nn.Linear(prev_dim, 1))
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x).squeeze()

class SalaryPredictionModel:
    def __init__(self):
        self.general_tfidf = None
        self.skill_tfidf = None
        self.neural_net = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.salary_scaler = StandardScaler()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Best parameters storage
        self.best_skill_tfidf_params = None
        self.best_nn_params = None

    def prepare_data(self, feature_matrix, target, batch_size=16):
        """Prepare data for training"""
        # Scale features
        X_scaled = self.scaler.fit_transform(feature_matrix)
        y_scaled = self.salary_scaler.fit_transform(target.values.reshape(-1, 1)).ravel()
        
        # Create dataset
        dataset = SalaryDataset(X_scaled, y_scaled)
        
        # Split data
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, [train_size, val_size]
        )
        
        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        return train_loader, val_loader

    def create_features(self, df, skill_categories):
        """Create TF-IDF and other features"""
        print("Generating TF-IDF features...")
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
        
        # General TF-IDF with best parameters
        self.general_tfidf = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 3),
            min_df=0.01,
            max_df=0.9
        )
        general_matrix = self.general_tfidf.fit_transform(combined_text)
        general_features = pd.DataFrame(
            general_matrix.toarray(),
            columns=[f'general_{f}' for f in self.general_tfidf.get_feature_names_out()]
        )
        
        # Prepare skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
        
        # Validate skill vocabulary against text
        valid_skill_terms = set()
        for term in skill_vocabulary:
            if any(term.lower() in text.lower() for text in combined_text):
                valid_skill_terms.add(term)
        
        if not valid_skill_terms:
            print("Warning: No skill terms found in text. Using general features only.")
            feature_df = general_features
        else:
            # Use tuned parameters if available for skill TF-IDF
            skill_params = self.best_skill_tfidf_params or {
                'ngram_range': (1, 2),
                'min_df': 0.01,
                'max_df': 0.9
            }
            
            # Skill-specific TF-IDF
            self.skill_tfidf = TfidfVectorizer(
                vocabulary=list(valid_skill_terms),
                stop_words="english",
                **skill_params
            )
            
            try:
                skill_matrix = self.skill_tfidf.fit_transform(combined_text)
                skill_features = pd.DataFrame(
                    skill_matrix.toarray(),
                    columns=[f'skill_{f}' for f in self.skill_tfidf.get_feature_names_out()]
                )
                
                # Combine features
                feature_df = pd.concat([
                    general_features,
                    skill_features
                ], axis=1)
            except ValueError as e:
                print(f"Warning: Error in skill TF-IDF creation: {str(e)}")
                print("Proceeding with general features only.")
                feature_df = general_features
        
        # Add encoded categorical features
        feature_df['work_type_encoded'] = self.label_encoder.fit_transform(df['work_type'].fillna('UNKNOWN'))
        feature_df['company_encoded'] = self.label_encoder.fit_transform(df['company'].fillna('UNKNOWN'))
        feature_df['industry_encoded'] = self.label_encoder.fit_transform(df['industry'].fillna('UNKNOWN'))
        
        print(f"Created {feature_df.shape[1]} features")
        return feature_df

    def tune_neural_network(self, feature_matrix, target, max_trials=10):
        """Tune neural network hyperparameters"""
        print("Tuning neural network hyperparameters...")
        
        nn_param_grid = {
            'hidden_dims': [
                [256, 128, 64],
                [512, 256, 128],
                [256, 256, 128],
                [512, 256, 128, 64]
            ],
            'dropout_rate': [0.1, 0.2, 0.3],
            'learning_rate': [1e-4, 2e-4, 5e-4],
            'batch_size': [16, 32, 64]
        }
        
        best_val_loss = float('inf')
        best_params = None
        
        # Generate parameter combinations
        param_combinations = [dict(zip(nn_param_grid.keys(), v)) 
                            for v in product(*nn_param_grid.values())]
        
        # Randomly sample from parameter combinations if there are too many
        if len(param_combinations) > max_trials:
            param_combinations = np.random.choice(
                param_combinations, 
                size=max_trials, 
                replace=False
            )
        
        for params in tqdm(param_combinations, desc="Testing NN parameters"):
            try:
                # Prepare data with current batch size
                train_loader, val_loader = self.prepare_data(
                    feature_matrix, 
                    target, 
                    batch_size=params['batch_size']
                )
                
                # Initialize model with current parameters
                self.neural_net = SalaryNeuralNet(
                    input_dim=feature_matrix.shape[1],
                    hidden_dims=params['hidden_dims'],
                    dropout_rate=params['dropout_rate']
                ).to(self.device)
                
                # Train for a few epochs to evaluate parameters
                results = self.train_model(
                    feature_matrix,
                    target,
                    epochs=5,
                    batch_size=params['batch_size'],
                    learning_rate=params['learning_rate'],
                    is_tuning=True
                )
                
                val_loss = min(results['training_history']['val_loss'])
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_params = params
                    print(f"\nNew best parameters found:")
                    print(f"Parameters: {params}")
                    print(f"Validation Loss: {val_loss:.4f}")
            
            except Exception as e:
                print(f"Error with parameters {params}: {str(e)}")
                continue
        
        self.best_nn_params = best_params
        return best_params

    def train_model(self, feature_matrix, target, epochs=10, batch_size=None, 
                   learning_rate=None, is_tuning=False):
        """Train neural network model"""
        # Use default parameters if none provided and not in tuning mode
        if not is_tuning:
            if self.best_nn_params:
                batch_size = batch_size or self.best_nn_params.get('batch_size', 16)
                learning_rate = learning_rate or self.best_nn_params.get('learning_rate', 2e-5)
            else:
                batch_size = batch_size or 16
                learning_rate = learning_rate or 2e-5
        
        # Prepare data
        train_loader, val_loader = self.prepare_data(feature_matrix, target, batch_size)
        
        # Initialize model if not already initialized
        if not self.neural_net or is_tuning:
            hidden_dims = self.best_nn_params.get('hidden_dims', [256, 128, 64]) if self.best_nn_params else [256, 128, 64]
            dropout_rate = self.best_nn_params.get('dropout_rate', 0.1) if self.best_nn_params else 0.1
            
            self.neural_net = SalaryNeuralNet(
                input_dim=feature_matrix.shape[1],
                hidden_dims=hidden_dims,
                dropout_rate=dropout_rate
            ).to(self.device)
        
        optimizer = torch.optim.AdamW(self.neural_net.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
        
        best_val_loss = float('inf')
        training_history = {'train_loss': [], 'val_loss': []}
        
        for epoch in range(epochs):
            # Training
            self.neural_net.train()
            train_loss = 0
            train_batches = 0
            
            for features, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
                features = features.to(self.device)
                labels = labels.to(self.device)
                
                optimizer.zero_grad()
                outputs = self.neural_net(features)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
                train_batches += 1
            
            avg_train_loss = train_loss / train_batches
            training_history['train_loss'].append(avg_train_loss)
            
            # Validation
            self.neural_net.eval()
            val_loss = 0
            val_batches = 0
            val_preds = []
            val_true = []
            
            with torch.no_grad():
                for features, labels in val_loader:
                    features = features.to(self.device)
                    labels = labels.to(self.device)
                    
                    outputs = self.neural_net(features)
                    loss = criterion(outputs, labels)
                    
                    val_loss += loss.item()
                    val_batches += 1
                    
                    val_preds.extend(outputs.cpu().numpy())
                    val_true.extend(labels.cpu().numpy())
            
            avg_val_loss = val_loss / val_batches
            training_history['val_loss'].append(avg_val_loss)
            
            # Update learning rate
            scheduler.step(avg_val_loss)
            
            if not is_tuning:
                # Transform predictions back to original scale
                val_preds = self.salary_scaler.inverse_transform(np.array(val_preds).reshape(-1, 1))
                val_true = self.salary_scaler.inverse_transform(np.array(val_true).reshape(-1, 1))
                
                # Calculate metrics
                rmse = np.sqrt(mean_squared_error(val_true, val_preds))
                mae = mean_absolute_error(val_true, val_preds)
                r2 = r2_score(val_true, val_preds)
                
                print(f'Epoch {epoch + 1}:')
                print(f'Average training loss: {avg_train_loss:.4f}')
                print(f'Average validation loss: {avg_val_loss:.4f}')
                print(f'RMSE: ${rmse:,.2f}')
                print(f'MAE: ${mae:,.2f}')
                print(f'R2 Score: {r2:.4f}\n')
            
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                if not is_tuning:
                    torch.save(self.neural_net.state_dict(), 'best_salary_predictor.pth')
        
        if not is_tuning:
            final_metrics = {
                'RMSE': rmse,
                'MAE': mae,
                'R2': r2
            }
        else:
            final_metrics = {}
        
        return {
            'training_history': training_history,
            'final_metrics': final_metrics
        }

    def save_model(self, path_prefix='models/'):
        """Save all model components"""
        import os
        os.makedirs(path_prefix, exist_ok=True)
        
        joblib.dump(self.best_nn_params, f'{path_prefix}best_nn_params.joblib')
        torch.save(self.neural_net.state_dict(), f'{path_prefix}neural_net.pt')

    def load_model(self, path_prefix='models/'):
        """Load all model components"""
        self.general_tfidf = joblib.load(f'{path_prefix}general_tfidf.joblib')
        self.skill_tfidf = joblib.load(f'{path_prefix}skill_tfidf.joblib')
        self.scaler = joblib.load(f'{path_prefix}scaler.joblib')
        self.label_encoder = joblib.load(f'{path_prefix}label_encoder.joblib')
        self.salary_scaler = joblib.load(f'{path_prefix}salary_scaler.joblib')
        self.best_skill_tfidf_params = joblib.load(f'{path_prefix}best_skill_tfidf_params.joblib')
        self.best_nn_params = joblib.load(f'{path_prefix}best_nn_params.joblib')
        
        # Initialize neural network with best parameters
        self.neural_net = SalaryNeuralNet(
            input_dim=self.scaler.n_features_in_,
            hidden_dims=self.best_nn_params['hidden_dims'],
            dropout_rate=self.best_nn_params['dropout_rate']
        ).to(self.device)
        self.neural_net.load_state_dict(torch.load(f'{path_prefix}neural_net.pt'))
        self.neural_net.eval()

    def predict(self, df):
        """Make predictions on new data"""
        # Create features
        features = self.create_features(df)
        
        # Scale features
        X_scaled = self.scaler.transform(features)
        
        # Convert to tensor
        X_tensor = torch.FloatTensor(X_scaled).to(self.device)
        
        # Make predictions
        self.neural_net.eval()
        with torch.no_grad():
            predictions = self.neural_net(X_tensor).cpu().numpy()
        
        # Scale back predictions
        predictions = self.salary_scaler.inverse_transform(predictions.reshape(-1, 1))
        
        return predictions.flatten()

    def tune_skill_tfidf(self, df, skill_categories):
        """Tune skill-specific TF-IDF parameters"""
        print("Tuning skill-specific TF-IDF parameters...")
        combined_text = df['cleaned_title'] + ' ' + df['cleaned_description']
    
        # Skill-specific parameter grid
        skill_param_grid = {
            'ngram_range': [(1, 1), (1, 2), (1, 3)],
            'min_df': [0.005, 0.01, 0.02],  # Lower values for skill terms
            'max_df': [0.8, 0.9, 0.95]
        }
    
        best_skill_score = float('-inf')
        best_skill_params = None
        best_feature_stats = None
    
        # Create skill vocabulary
        skill_vocabulary = set()
        for category in skill_categories.values():
            skill_vocabulary.update(category['keywords'])
            skill_vocabulary.update(category['context_required'])
    
        # Test different parameter combinations
        param_combinations = [dict(zip(skill_param_grid.keys(), v)) 
                            for v in product(*skill_param_grid.values())]
    
        for params in tqdm(param_combinations, desc="Testing skill TF-IDF parameters"):
            try:
                skill_tfidf = TfidfVectorizer(
                    vocabulary=list(skill_vocabulary),
                    stop_words="english",
                    **params
                )
            
                feature_matrix = skill_tfidf.fit_transform(combined_text)
                n_features = feature_matrix.shape[1]
                sparsity = 1.0 - (feature_matrix.nnz / (feature_matrix.shape[0] * feature_matrix.shape[1]))
            
                # Calculate feature frequency statistics
                feature_frequencies = np.asarray(feature_matrix.sum(axis=0)).ravel()
                avg_freq = np.mean(feature_frequencies)
                max_freq = np.max(feature_frequencies)
            
                # Score based on feature coverage and sparsity
                score = n_features * (1 - sparsity) * avg_freq
            
                if score > best_skill_score:
                    best_skill_score = score
                    best_skill_params = params
                    best_feature_stats = {
                        'n_features': n_features,
                        'sparsity': sparsity,
                        'avg_frequency': avg_freq,
                        'max_frequency': max_freq
                    }
                    print(f"\nNew best skill parameters found:")
                    print(f"Parameters: {params}")
                    print(f"Features: {n_features}")
                    print(f"Sparsity: {sparsity:.4f}")
                    print(f"Average Frequency: {avg_freq:.4f}")
                    print(f"Score: {score:.4f}")
        
            except Exception as e:
                print(f"Error with parameters {params}: {str(e)}")
                continue
    
        self.best_skill_tfidf_params = best_skill_params
        return best_skill_params, best_feature_stats
    

# Example usage:
if __name__ == "__main__":

    skill_categories = {
            'Creative Arts': {
                'keywords': [
                    'adobe creative suite', 'photoshop', 'illustrator', 'indesign',
                    'graphic design', 'visual design', 'typography', 'art direction',
                    'creative direction', 'brand design', 'illustration', 'adobe xd',
                    'figma', 'sketch', 'color theory', 'layout design'
                ],
                'context_required': ['design', 'creative', 'art', 'visual']
            },
            'Digital Design': {
                'keywords': [
                    'ui design', 'ux design', 'user interface', 'user experience',
                    'wireframing', 'prototyping', 'responsive design', 'mobile design',
                    'web design', 'interaction design', 'usability testing',
                    'information architecture', 'figma', 'sketch', 'principle'
                ],
                'context_required': ['design', 'user', 'interface', 'experience']
            },
            'Product Management': {
                'keywords': [
                    'product strategy', 'product roadmap', 'product development',
                    'product lifecycle', 'agile product', 'product owner', 'scrum',
                    'market research', 'user stories', 'feature prioritization',
                    'product metrics', 'product analytics', 'product launch'
                ],
                'context_required': ['product']
            },
            'Data Analysis': {
                'keywords': [
                    'data analysis', 'statistical analysis', 'data visualization',
                    'sql', 'python', 'r programming', 'tableau', 'power bi',
                    'excel advanced', 'data modeling', 'regression analysis',
                    'hypothesis testing', 'a/b testing', 'data mining'
                ],
                'context_required': ['data', 'analysis', 'analytics']
            },
            'Software Development': {
                'keywords': [
                    'java ', 'python', 'javascript', 'react', 'angular', 'node.js',
                    'full stack', 'front end', 'back end', 'web development',
                    'api development', 'cloud computing', 'aws', 'azure',
                    'devops', 'ci/cd', 'docker', 'kubernetes'
                ],
                'context_required': ['development', 'programming', 'software']
            },
            'Marketing': {
                'keywords': [
                    'digital marketing', 'content marketing', 'seo', 'sem',
                    'social media marketing', 'email marketing', 'marketing automation',
                    'google analytics', 'conversion optimization', 'brand marketing',
                    'marketing strategy', 'campaign management', 'hubspot', 'marketo'
                ],
                'context_required': ['marketing', 'digital']
            },
            'Project Management': {
                'keywords': [
                    'project management', 'agile methodology', 'scrum master',
                    'project planning', 'risk management', 'stakeholder management',
                    'pmp certification', 'project coordination', 'jira', 'asana',
                    'microsoft project', 'project lifecycle', 'change management'
                ],
                'context_required': ['project', 'management']
            },
            'Business Analysis': {
                'keywords': [
                    'business analysis', 'requirements gathering', 'process mapping',
                    'gap analysis', 'business process', 'system analysis',
                    'functional requirements', 'business intelligence', 'data modeling',
                    'process improvement', 'workflow optimization'
                ],
                'context_required': ['analysis', 'business']
            },
            'Financial': {
                'keywords': [
                    'financial analysis', 'financial modeling', 'forecasting',
                    'budgeting', 'variance analysis', 'cost analysis', 'pricing',
                    'profit and loss', 'balance sheet', 'financial reporting',
                    'risk assessment', 'investment analysis'
                ],
                'context_required': ['financial', 'finance']
            },
            'Sales': {
                'keywords': [
                    'sales strategy', 'account management', 'sales forecasting',
                    'crm', 'salesforce', 'sales operations', 'business development',
                    'lead generation', 'pipeline management', 'contract negotiation',
                    'sales analytics', 'territory management'
                ],
                'context_required': ['sales', 'revenue']
            }
        }

    # Initialize model
    model = SalaryPredictionModel()
    
    # Tune skill-specific TF-IDF parameters
    print("\nTuning skill-specific TF-IDF parameters...")
    best_skill_params, skill_stats = model.tune_skill_tfidf(df, skill_categories)
    
    print("\nBest Skill TF-IDF Parameters:")
    print(f"N-gram Range: {best_skill_params['ngram_range']}")
    print(f"Min Document Frequency: {best_skill_params['min_df']}")
    print(f"Max Document Frequency: {best_skill_params['max_df']}")
    print("\nFeature Statistics:")
    print(f"Number of Features: {skill_stats['n_features']}")
    print(f"Sparsity: {skill_stats['sparsity']:.4f}")
    print(f"Average Feature Frequency: {skill_stats['avg_frequency']:.4f}")
    print(f"Maximum Feature Frequency: {skill_stats['max_frequency']:.4f}")
    
    # Create features using tuned parameters
    print("\nCreating features...")
    feature_matrix = model.create_features(df, skill_categories)
    
    # Tune neural network hyperparameters
    print("\nTuning neural network hyperparameters...")
    nn_params = model.tune_neural_network(feature_matrix, df['salary'], max_trials=10)
    print("\nBest neural network parameters:", nn_params)
    
    # Train final model with best parameters
    print("\nTraining final model with best parameters...")
    results = model.train_model(
        feature_matrix,
        df['salary'],
        epochs=20
    )
    
    # Print final results
    print("\nFinal Model Performance:")
    print(f"RMSE: ${results['final_metrics']['RMSE']:,.2f}")
    print(f"MAE: ${results['final_metrics']['MAE']:,.2f}")
    print(f"R2 Score: {results['final_metrics']['R2']:.4f}")
    
    # Save the model
    print("\nSaving model...")
    model.save_model()

Using device: cpu

Tuning skill-specific TF-IDF parameters...
Tuning skill-specific TF-IDF parameters...


Testing skill TF-IDF parameters:   4%|▎         | 1/27 [00:03<01:28,  3.40s/it]


New best skill parameters found:
Parameters: {'ngram_range': (1, 1), 'min_df': 0.005, 'max_df': 0.8}
Features: 157
Sparsity: 0.9629
Average Frequency: 298.0522
Score: 1735.5362


Testing skill TF-IDF parameters:  37%|███▋      | 10/27 [00:36<01:12,  4.26s/it]


New best skill parameters found:
Parameters: {'ngram_range': (1, 2), 'min_df': 0.005, 'max_df': 0.8}
Features: 157
Sparsity: 0.9589
Average Frequency: 311.7437
Score: 2009.7927


Testing skill TF-IDF parameters:  70%|███████   | 19/27 [01:37<00:58,  7.29s/it]


New best skill parameters found:
Parameters: {'ngram_range': (1, 3), 'min_df': 0.005, 'max_df': 0.8}
Features: 157
Sparsity: 0.9589
Average Frequency: 311.8313
Score: 2011.6264


Testing skill TF-IDF parameters: 100%|██████████| 27/27 [02:54<00:00,  6.45s/it]



Best Skill TF-IDF Parameters:
N-gram Range: (1, 3)
Min Document Frequency: 0.005
Max Document Frequency: 0.8

Feature Statistics:
Number of Features: 157
Sparsity: 0.9589
Average Feature Frequency: 311.8313
Maximum Feature Frequency: 8695.6333

Creating features...
Generating TF-IDF features...
Created 5692 features

Tuning neural network hyperparameters...
Tuning neural network hyperparameters...


Epoch 1/5: 100%|██████████| 1205/1205 [00:06<00:00, 185.70it/s]
Epoch 2/5: 100%|██████████| 1205/1205 [00:06<00:00, 184.21it/s]
Epoch 3/5: 100%|██████████| 1205/1205 [00:06<00:00, 184.90it/s]
Epoch 4/5: 100%|██████████| 1205/1205 [00:06<00:00, 186.18it/s]
Epoch 5/5: 100%|██████████| 1205/1205 [00:06<00:00, 185.26it/s]
Testing NN parameters:  10%|█         | 1/10 [00:37<05:39, 37.70s/it]


New best parameters found:
Parameters: {'hidden_dims': [256, 128, 64], 'dropout_rate': 0.2, 'learning_rate': 0.0005, 'batch_size': 16}
Validation Loss: 0.3253


Epoch 1/5: 100%|██████████| 603/603 [00:03<00:00, 169.74it/s]
Epoch 2/5: 100%|██████████| 603/603 [00:03<00:00, 165.68it/s]
Epoch 3/5: 100%|██████████| 603/603 [00:03<00:00, 163.85it/s]
Epoch 4/5: 100%|██████████| 603/603 [00:03<00:00, 161.11it/s]
Epoch 5/5: 100%|██████████| 603/603 [00:03<00:00, 164.32it/s]
Testing NN parameters:  20%|██        | 2/10 [01:01<03:55, 29.39s/it]


New best parameters found:
Parameters: {'hidden_dims': [256, 256, 128], 'dropout_rate': 0.1, 'learning_rate': 0.0001, 'batch_size': 32}
Validation Loss: 0.3022


Epoch 1/5: 100%|██████████| 302/302 [00:02<00:00, 130.54it/s]
Epoch 2/5: 100%|██████████| 302/302 [00:02<00:00, 121.05it/s]
Epoch 3/5: 100%|██████████| 302/302 [00:02<00:00, 129.09it/s]
Epoch 4/5: 100%|██████████| 302/302 [00:02<00:00, 130.56it/s]
Epoch 5/5: 100%|██████████| 302/302 [00:02<00:00, 131.17it/s]
Testing NN parameters:  30%|███       | 3/10 [01:17<02:43, 23.32s/it]


New best parameters found:
Parameters: {'hidden_dims': [256, 128, 64], 'dropout_rate': 0.1, 'learning_rate': 0.0005, 'batch_size': 64}
Validation Loss: 0.2904


Epoch 1/5: 100%|██████████| 1205/1205 [00:06<00:00, 180.41it/s]
Epoch 2/5: 100%|██████████| 1205/1205 [00:06<00:00, 185.58it/s]
Epoch 3/5: 100%|██████████| 1205/1205 [00:06<00:00, 184.14it/s]
Epoch 4/5: 100%|██████████| 1205/1205 [00:06<00:00, 182.95it/s]
Epoch 5/5: 100%|██████████| 1205/1205 [00:06<00:00, 187.98it/s]
Epoch 1/5: 100%|██████████| 603/603 [00:03<00:00, 164.70it/s].07s/it]
Epoch 2/5: 100%|██████████| 603/603 [00:03<00:00, 165.07it/s]
Epoch 3/5: 100%|██████████| 603/603 [00:03<00:00, 166.73it/s]
Epoch 4/5: 100%|██████████| 603/603 [00:03<00:00, 166.49it/s]
Epoch 5/5: 100%|██████████| 603/603 [00:03<00:00, 165.08it/s]
Testing NN parameters:  50%|█████     | 5/10 [02:18<02:14, 26.86s/it]


New best parameters found:
Parameters: {'hidden_dims': [256, 128, 64], 'dropout_rate': 0.1, 'learning_rate': 0.0005, 'batch_size': 32}
Validation Loss: 0.2889


Epoch 1/5: 100%|██████████| 603/603 [00:04<00:00, 130.29it/s]
Epoch 2/5: 100%|██████████| 603/603 [00:04<00:00, 149.34it/s]
Epoch 3/5: 100%|██████████| 603/603 [00:04<00:00, 149.88it/s]
Epoch 4/5: 100%|██████████| 603/603 [00:04<00:00, 143.49it/s]
Epoch 5/5: 100%|██████████| 603/603 [00:03<00:00, 156.68it/s]
Epoch 1/5: 100%|██████████| 1205/1205 [00:07<00:00, 170.99it/s]9s/it]
Epoch 2/5: 100%|██████████| 1205/1205 [00:06<00:00, 183.36it/s]
Epoch 3/5: 100%|██████████| 1205/1205 [00:06<00:00, 182.40it/s]
Epoch 4/5: 100%|██████████| 1205/1205 [00:06<00:00, 175.64it/s]
Epoch 5/5: 100%|██████████| 1205/1205 [00:06<00:00, 180.55it/s]
Epoch 1/5: 100%|██████████| 1205/1205 [00:06<00:00, 181.01it/s]2s/it]
Epoch 2/5: 100%|██████████| 1205/1205 [00:06<00:00, 186.26it/s]
Epoch 3/5: 100%|██████████| 1205/1205 [00:06<00:00, 173.07it/s]
Epoch 4/5: 100%|██████████| 1205/1205 [00:07<00:00, 163.60it/s]
Epoch 5/5: 100%|██████████| 1205/1205 [00:06<00:00, 180.67it/s]
Epoch 1/5: 100%|██████████| 1205/1205 


Best neural network parameters: {'hidden_dims': [256, 128, 64], 'dropout_rate': 0.1, 'learning_rate': 0.0005, 'batch_size': 32}

Training final model with best parameters...


Epoch 1/20: 100%|██████████| 603/603 [00:04<00:00, 136.44it/s]


Epoch 1:
Average training loss: 0.3513
Average validation loss: 0.2882
RMSE: $22,844.13
MAE: $17,271.08
R2 Score: 0.7059



Epoch 2/20: 100%|██████████| 603/603 [00:04<00:00, 137.25it/s]


Epoch 2:
Average training loss: 0.2307
Average validation loss: 0.2819
RMSE: $22,605.23
MAE: $16,241.75
R2 Score: 0.7120



Epoch 3/20: 100%|██████████| 603/603 [00:03<00:00, 153.87it/s]


Epoch 3:
Average training loss: 0.1736
Average validation loss: 0.2963
RMSE: $23,170.63
MAE: $16,442.15
R2 Score: 0.6975



Epoch 4/20: 100%|██████████| 603/603 [00:03<00:00, 160.73it/s]


Epoch 4:
Average training loss: 0.1483
Average validation loss: 0.2827
RMSE: $22,634.44
MAE: $16,246.93
R2 Score: 0.7113



Epoch 5/20: 100%|██████████| 603/603 [00:03<00:00, 168.38it/s]


Epoch 5:
Average training loss: 0.1362
Average validation loss: 0.2838
RMSE: $22,674.00
MAE: $15,865.74
R2 Score: 0.7103



Epoch 6/20: 100%|██████████| 603/603 [00:03<00:00, 158.74it/s]


Epoch 6:
Average training loss: 0.1229
Average validation loss: 0.2900
RMSE: $22,917.12
MAE: $16,805.92
R2 Score: 0.7040



Epoch 7/20: 100%|██████████| 603/603 [00:03<00:00, 161.25it/s]


Epoch 7:
Average training loss: 0.0992
Average validation loss: 0.2595
RMSE: $21,686.96
MAE: $15,330.32
R2 Score: 0.7350



Epoch 8/20: 100%|██████████| 603/603 [00:03<00:00, 166.43it/s]


Epoch 8:
Average training loss: 0.0869
Average validation loss: 0.2670
RMSE: $21,989.79
MAE: $15,804.66
R2 Score: 0.7275



Epoch 9/20: 100%|██████████| 603/603 [00:03<00:00, 163.56it/s]


Epoch 9:
Average training loss: 0.0849
Average validation loss: 0.2658
RMSE: $21,948.12
MAE: $15,378.99
R2 Score: 0.7285



Epoch 10/20: 100%|██████████| 603/603 [00:03<00:00, 169.19it/s]


Epoch 10:
Average training loss: 0.0811
Average validation loss: 0.2661
RMSE: $21,963.55
MAE: $15,264.52
R2 Score: 0.7282



Epoch 11/20: 100%|██████████| 603/603 [00:03<00:00, 167.69it/s]


Epoch 11:
Average training loss: 0.0873
Average validation loss: 0.2728
RMSE: $22,239.07
MAE: $15,550.90
R2 Score: 0.7213



Epoch 12/20: 100%|██████████| 603/603 [00:03<00:00, 168.04it/s]


Epoch 12:
Average training loss: 0.0751
Average validation loss: 0.2642
RMSE: $21,882.62
MAE: $15,603.60
R2 Score: 0.7302



Epoch 13/20: 100%|██████████| 603/603 [00:03<00:00, 161.63it/s]


Epoch 13:
Average training loss: 0.0669
Average validation loss: 0.2610
RMSE: $21,749.36
MAE: $14,950.20
R2 Score: 0.7334



Epoch 14/20: 100%|██████████| 603/603 [00:03<00:00, 157.04it/s]


Epoch 14:
Average training loss: 0.0646
Average validation loss: 0.2624
RMSE: $21,804.11
MAE: $15,502.49
R2 Score: 0.7321



Epoch 15/20: 100%|██████████| 603/603 [00:03<00:00, 153.37it/s]


Epoch 15:
Average training loss: 0.0661
Average validation loss: 0.2662
RMSE: $21,961.15
MAE: $15,197.88
R2 Score: 0.7282



Epoch 16/20: 100%|██████████| 603/603 [00:03<00:00, 165.14it/s]


Epoch 16:
Average training loss: 0.0628
Average validation loss: 0.2645
RMSE: $21,895.74
MAE: $15,014.50
R2 Score: 0.7298



Epoch 17/20: 100%|██████████| 603/603 [00:03<00:00, 162.98it/s]


Epoch 17:
Average training loss: 0.0624
Average validation loss: 0.2573
RMSE: $21,592.36
MAE: $14,960.95
R2 Score: 0.7373



Epoch 18/20: 100%|██████████| 603/603 [00:03<00:00, 162.06it/s]


Epoch 18:
Average training loss: 0.0591
Average validation loss: 0.2635
RMSE: $21,851.43
MAE: $15,540.71
R2 Score: 0.7309



Epoch 19/20: 100%|██████████| 603/603 [00:04<00:00, 149.97it/s]


Epoch 19:
Average training loss: 0.0572
Average validation loss: 0.2623
RMSE: $21,803.67
MAE: $14,768.12
R2 Score: 0.7321



Epoch 20/20: 100%|██████████| 603/603 [00:03<00:00, 160.38it/s]


Epoch 20:
Average training loss: 0.0565
Average validation loss: 0.2621
RMSE: $21,797.11
MAE: $15,314.66
R2 Score: 0.7323


Final Model Performance:
RMSE: $21,797.11
MAE: $15,314.66
R2 Score: 0.7323

Saving model...
