In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    """Append experience- and seniority-derived features to a survey DataFrame.

    ``transform`` returns a copy of the input with ``YearsCode`` and
    ``YearsCodePro_B`` coerced to numeric (unparseable values become 0) and six
    extra columns appended, in this order:

    * ``skill_diversity`` - number of ``*_Bucket`` columns holding a real value.
    * ``experience_consistency`` - professional years / (total years + 1),
      clipped to ``[0, 1]``.
    * ``is_senior_role`` - whether ``DevType_Bucket`` mentions a seniority keyword.
    * ``professional_experience_factor`` - weighted blend of the three above inputs.
    * ``experience_skill_ratio`` - ``skill_diversity`` / (professional years + 1).
    * ``senior_experience_match`` - 1 when the role is senior and professional
      years are at least 5, else 0.

    The input must be a DataFrame containing ``YearsCode``, ``YearsCodePro_B``
    and ``DevType_Bucket``.
    """

    # Bucket labels that stand for "nothing usable reported" and therefore do
    # not count towards skill_diversity.
    _PLACEHOLDER_BUCKETS = ('None', 'Other', 'Not Specified')

    # Case-insensitive substrings of DevType_Bucket that mark a senior role.
    _SENIORITY_KEYWORDS = ('Senior', 'Lead', 'Staff', 'Principal', 'Manager', 'Director')

    # Columns appended by transform(), in the order they are created.
    _ENGINEERED_FEATURES = (
        'skill_diversity', 'experience_consistency', 'is_senior_role',
        'professional_experience_factor', 'experience_skill_ratio', 'senior_experience_match',
    )

    def __init__(self):
        pass

    def fit(self, salaries, y=None):
        """Record the input column names; nothing is learned from the data.

        Returns:
            self
        """
        self.feature_names_in = salaries.columns.tolist()
        return self

    def transform(self, salaries):
        """Return a copy of ``salaries`` with the engineered columns appended.

        Raises:
            KeyError: if ``YearsCode``, ``YearsCodePro_B`` or ``DevType_Bucket``
                is missing from ``salaries``.
        """
        salaries = salaries.copy()

        # Survey answers may arrive as text; anything that is not a number is treated as 0 years.
        for years_col in ('YearsCode', 'YearsCodePro_B'):
            salaries[years_col] = pd.to_numeric(salaries[years_col], errors='coerce').fillna(0)

        # NOTE(review): every *_Bucket column is counted here, including
        # DevType_Bucket - confirm that non-technology buckets are meant to be included.
        skill_columns = [col for col in salaries.columns if col.endswith('_Bucket')]
        buckets = salaries[skill_columns]

        # A cell counts as a skill when it is present and is not a placeholder label.
        # Vectorised, so it also yields 0 for every row when there are no *_Bucket columns.
        is_real_skill = buckets.notna() & ~buckets.isin(list(self._PLACEHOLDER_BUCKETS))
        salaries['skill_diversity'] = is_real_skill.sum(axis=1).astype(int)

        # +1 in the denominator avoids division by zero for respondents with 0 years.
        consistency = salaries['YearsCodePro_B'] / (salaries['YearsCode'] + 1)
        salaries['experience_consistency'] = consistency.clip(0, 1).fillna(0)

        salaries['is_senior_role'] = salaries['DevType_Bucket'].str.contains(
            '|'.join(self._SENIORITY_KEYWORDS), case=False, na=False
        )

        salaries['professional_experience_factor'] = (
            salaries['YearsCodePro_B'] * 0.7
            + salaries['experience_consistency'] * 0.2
            + salaries['is_senior_role'].astype(int) * 0.1
        )

        salaries['experience_skill_ratio'] = (
            salaries['skill_diversity'] / (salaries['YearsCodePro_B'] + 1)
        )

        salaries['senior_experience_match'] = (
            salaries['is_senior_role'] & (salaries['YearsCodePro_B'] >= 5)
        ).astype(int)

        return salaries

    def get_feature_names_out(self, input_features=None):
        """Return the output column names: input columns followed by the engineered ones.

        Args:
            input_features: optional sequence of input column names. When omitted,
                the names recorded by ``fit`` are used (empty if ``fit`` was not called).

        Returns:
            list of column names.
        """
        if input_features is None:
            input_features = getattr(self, 'feature_names_in', [])

        # Arrays and pandas Index objects expose tolist(); it yields plain Python strings.
        if hasattr(input_features, 'tolist'):
            input_features = input_features.tolist()

        # list() so tuples and other sequences can be concatenated as well.
        return list(input_features) + list(self._ENGINEERED_FEATURES)

In [None]:
class OrgSizeBinner(BaseEstimator, TransformerMixin):
    """Collapse the raw ``OrgSize`` answers into a coarser ``OrgSize_Binned`` column."""

    def fit(self, salaries, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, salaries):
        """Return a copy of ``salaries`` with an added ``OrgSize_Binned`` column.

        Any ``OrgSize`` value that is not listed below (including missing
        values) ends up in the ``'other'`` bin.
        """
        # (bin label, raw OrgSize answers belonging to it)
        bin_members = (
            ('large_enterprise', ['10,000 or more employees']),
            ('enterprise', ['5,000 to 9,999 employees']),
            ('mid_company', ['1,000 to 4,999 employees']),
            ('small_company', ['500 to 999 employees', '100 to 499 employees']),
            ('startup', ['20 to 99 employees', '2 to 19 employees']),
            ('freelancer', ['Just me - I am a freelancer, sole proprietor, etc.']),
            ('other', ["I don't know", 'Not specified']),
        )

        binned = salaries.copy()

        # Start everyone in the fallback bin, then overwrite the rows that match a known answer.
        binned['OrgSize_Binned'] = 'other'
        for label, raw_answers in bin_members:
            belongs_to_bin = binned['OrgSize'].isin(raw_answers)
            binned.loc[belongs_to_bin, 'OrgSize_Binned'] = label

        return binned

In [None]:
# UPDATED PIPELINE with CustomTransformer for Feature Engineering. 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Column routing for the ColumnTransformer below. Because remainder='drop', any
# column that is not listed in one of these groups is discarded - including
# columns created by the upstream pipeline steps - so those are listed explicitly.

ordinal_cols = ['EdLevel_Bucket']

# 'OrgSize_Binned' is created by OrgSizeBinner.
onehot_cols = ['Employment_Category_Bucket', 'OrgSize_Binned']

target_cols = ['DevType_Bucket', 'PlatformHaveWorkedWith_Bucket', 'WebframeHaveWorkedWith_Bucket', 'LanguageHaveWorkedWith_Bucket', 'ToolsTechHaveWorkedWith_Bucket', 'YearsCodePro_B']

numerical_cols = [
    'Age_Encoded',
    # Created by AdvancedFeatureEngineer.
    'skill_diversity',
    'experience_consistency',
    'is_senior_role',
    'professional_experience_factor',
    'experience_skill_ratio',
    'senior_experience_match',
]

# OrdinalEncoder assigns codes 0..n-1 in list order, so the list runs from the
# lowest to the highest level of education.
edlevel_order = ['No_Degree', 'Associates', 'Bachelors', 'Masters']

all_ordinal_orders = [edlevel_order]

# NOTE: CustomTargetEncoder is defined in a later cell of this notebook; that
# cell has to be executed before this one.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('ordinal', OrdinalEncoder(categories=all_ordinal_orders), ordinal_cols),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True), onehot_cols),
        ('target', CustomTargetEncoder(), target_cols),
    ],
    remainder='drop'
)

# Order matters: the binning and feature-engineering steps add the columns that
# the preprocessor then selects by name.
new_pipeline = Pipeline(steps=[
    ('orgsize_binning', OrgSizeBinner()),
    ('feature_engineering', AdvancedFeatureEngineer()),
    ('preprocessor', preprocessor)
])

In [None]:
import re

# Bucket -> lower-case keywords. Dict order is the priority order: for a given
# technology the first bucket with a matching keyword wins.
_PROFESSIONAL_TECH_BUCKETS = {
    'AI/ML': ['ai', 'machine learning', 'ml', 'deep learning', 'neural network', 'nlp', 'natural language', 'computer vision', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'openai'],
    'Data Science & Analytics': ['data science', 'data analysis', 'analytics', 'big data', 'hadoop', 'spark', 'pandas', 'numpy', 'tableau', 'power bi', 'databricks', 'snowflake'],
    'DevOps & Cloud': ['devops', 'ci/cd', 'continuous integration', 'continuous delivery', 'docker', 'kubernetes', 'terraform', 'ansible', 'jenkins', 'aws', 'azure', 'gcp', 'cloud', 'observability'],
    'Web Development': ['web', 'frontend', 'backend', 'full-stack', 'javascript', 'react', 'angular', 'vue', 'node.js', 'django', 'flask', 'ruby on rails', 'php', 'asp.net'],
    'Mobile Development': ['mobile', 'ios', 'android', 'swift', 'kotlin', 'react native', 'flutter', 'xamarin'],
    'Databases': ['database', 'sql', 'nosql', 'postgresql', 'mysql', 'sql server', 'mongodb', 'redis', 'cassandra', 'firebase'],
    'Testing & QA': ['testing', 'qa', 'quality assurance', 'selenium', 'jest', 'pytest', 'cypress', 'junit'],
    'Security': ['security', 'cybersecurity', 'infosec', 'penetration testing', 'pen testing'],
    'Developer Tools': ['git', 'github', 'gitlab', 'jira', 'visual studio code', 'ide'],
    'Architecture & Practices': ['microservices', 'developer portal', 'innersource'],
}


def _compile_keyword_pattern(keywords):
    """Build a regex matching any keyword as a whole token (not inside a longer word)."""
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    # Lookarounds instead of \b so keywords containing punctuation ('node.js', 'ci/cd') work too.
    return re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')


# Compiled once, in priority order, so the per-row call does no setup work.
_PROFESSIONAL_TECH_PATTERNS = [
    (bucket, _compile_keyword_pattern(keywords))
    for bucket, keywords in _PROFESSIONAL_TECH_BUCKETS.items()
]


def bucketize_professional_tech_multiple(tech_string):
    """
    Applies hierarchical keyword-based bucketing to a semi-colon separated string of technologies.
    It iterates through each technology and assigns the first bucket that matches.

    Keywords are matched case-insensitively and as whole tokens, so a short keyword
    such as 'ai' or 'ml' does not fire inside an unrelated word ('rails', 'html').

    Args:
        tech_string: semi-colon separated technologies, or a missing value.

    Returns:
        'None' for a missing value or when the text contains 'none of these';
        otherwise the bucket of the first listed technology that matches any
        keyword; 'Other' when nothing matches.
    """
    if pd.isna(tech_string):
        return 'None'

    lower_tech_string = str(tech_string).lower()

    if 'none of these' in lower_tech_string:
        return 'None'

    # Technologies are examined in the order they were listed; the first one
    # that falls into any bucket decides the result.
    for tech in (part.strip() for part in lower_tech_string.split(';')):
        for bucket, pattern in _PROFESSIONAL_TECH_PATTERNS:
            if pattern.search(tech):
                return bucket

    return 'Other'

In [None]:
from sklearn.impute import SimpleImputer


# The ten most frequent countries keep their own label; every other country is pooled.
top_10_countries = features['Country'].value_counts().nlargest(10).index

# .mask() replaces the value where the condition is true, so masking the rows that are
# NOT in the top ten with 'Other' leaves the top-ten countries untouched.
features['Country_Grouped'] = features['Country'].mask(
    ~features['Country'].isin(top_10_countries),
    'Other',
)

# Fill any remaining gaps with the most frequent category.
# NOTE(review): rows with a missing Country fail the isin() test above and so already
# appear to be labelled 'Other' at this point - confirm whether this imputation still
# has anything left to fill.
imputer = SimpleImputer(strategy='most_frequent')
features['Country_Grouped'] = imputer.fit_transform(
    features[['Country_Grouped']]
).ravel()

# Sanity check: distribution of the grouped labels.
print(features['Country_Grouped'].value_counts())

In [None]:
# Custom transformer for highly sensitive target encoding

from sklearn.base import BaseEstimator, TransformerMixin

class CustomTargetEncoder(BaseEstimator, TransformerMixin):
    """Replace each category with the mean target value seen for it during ``fit``.

    Every column of ``X`` is encoded independently. Categories that were not
    seen during ``fit`` (and missing values) are encoded as the overall target
    mean.

    Attributes set by ``fit``:
        global_mean_: mean of ``y``.
        encodings_: dict mapping column name -> Series of per-category target means.
        feature_names_in_: array of the column names of ``X``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the per-category target means.

        Args:
            X: DataFrame of categorical columns.
            y: target values. A pandas Series is aligned with ``X`` by index;
                any other sequence is matched to the rows of ``X`` by position.

        Returns:
            self

        Raises:
            ValueError: if ``y`` is None, or if a non-Series ``y`` does not have
                one value per row of ``X``.
        """
        if y is None:
            raise ValueError("Target encoder requires y during fit")

        if isinstance(y, pd.Series):
            y = y.copy()
        else:
            # Arrays/lists carry no index: attach X's index so rows are paired by
            # position. A default RangeIndex would be mis-aligned with any X whose
            # index is not 0..n-1 (e.g. after a train/test split).
            y = pd.Series(y, index=X.index, name="target")

        self.global_mean_ = y.mean()
        self.feature_names_in_ = X.columns.to_numpy(dtype=object)

        self.encodings_ = {}
        for col in X.columns:
            # Group the target directly by the column values; this does not depend on
            # y having a name, nor on that name being different from the column name.
            self.encodings_[col] = y.groupby(X[col]).mean()

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column replaced by its target-mean encoding.

        Raises:
            KeyError: if ``X`` contains a column that was not present during ``fit``.
        """
        X_new = X.copy()
        for col in X.columns:
            encoded = X_new[col].map(self.encodings_[col])
            # Cast before filling so the result is a plain float column whatever
            # the dtype of the input column was.
            X_new[col] = encoded.astype(float).fillna(self.global_mean_)
        return X_new

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, which are the same as the input names.

        Args:
            input_features: optional input names; returned unchanged when given.
                When omitted, the column names recorded by ``fit`` are returned.
        """
        if input_features is None:
            return self.feature_names_in_
        return input_features
