In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import joblib
from sklearn.impute import SimpleImputer# from sklearn.experimental import enable_iterative_imputer


In [28]:
def load_data(filePath):
    """Read a CSV file into a DataFrame and print a short summary of it.

    The summary consists of the frame's shape, its first rows, and the
    number of missing values in each column.

    Parameters
    ----------
    filePath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.
    """
    frame = pd.read_csv(filePath)
    print("Data loaded successfully. Shape", frame.shape)

    # Each report is a (heading, content) pair printed on consecutive lines.
    reports = (
        ("\n Data preview:", frame.head()),
        ("\n Data missing", frame.isnull().sum()),
    )
    for heading, content in reports:
        print(heading)
        print(content)

    return frame

In [29]:
# Load the salary dataset; the relative path is resolved against the current working directory.
df = load_data('Salary_Data.csv')

Data loaded successfully. Shape (6704, 6)

 Data preview:
    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  

 Data missing
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64


In [30]:
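# NOTE: kept for reference only. Feature imputation is done by the SimpleImputer
# steps inside preprocessing_data, which also drops rows with a missing Salary
# instead of mean-filling the target.
# def handle_missing_values(df): 
#     df['Age'] = df['Age'].fillna(df['Age'].mean())
#     df['Years of Experience'] = df['Years of Experience'].fillna(df['Years of Experience'].mean())    
#     df['Salary'] = df['Salary'].fillna(df['Salary'].mean())
#     df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
#     df['Education Level'] = df['Education Level'].fillna(df['Education Level'].mode()[0])
#     df['Job Title'] = df['Job Title'].fillna(df['Job Title'].mode()[0])
#     return df  # without this, the call below would receive None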

In [31]:
# d = handle_missing_values(df)

In [32]:
# print(d)

In [38]:
from sklearn.model_selection import train_test_split


def preprocessing_data(df, target_col='Salary', categorical_cols=None,
                       numerical_cols=None, test_size=0.2, random_state=42,
                       model_path='salary_model.pkl'):
    """Train, evaluate and persist a linear-regression model on ``df``.

    Despite its name, this function runs the whole modelling workflow:
    it drops rows whose target is missing, builds a preprocessing +
    ``LinearRegression`` pipeline, fits it on a train split, prints the
    train/test R² scores and saves the fitted pipeline with ``joblib``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the target and the feature columns. It is
        not modified.
    target_col : str, default 'Salary'
        Column to predict. It is automatically excluded from the feature
        lists, so choosing a target that is also a default feature
        (e.g. ``'Age'``) works.
    categorical_cols : list of str, optional
        Columns to impute with the constant ``'Missing'`` and one-hot
        encode. Defaults to ``['Gender', 'Education Level', 'Job Title']``.
    numerical_cols : list of str, optional
        Columns to impute with the median and standardise. Defaults to
        ``['Age', 'Years of Experience']``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` for a reproducible split.
    model_path : str or path-like or None, default 'salary_model.pkl'
        Where the fitted pipeline is written. Pass ``None`` to skip saving.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (the same object that is written to
        ``model_path``).

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    ValueError
        If a requested feature column is absent, or if no rows remain
        after dropping those with a missing target.
    """
    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in the data")

    if categorical_cols is None:
        categorical_cols = ['Gender', 'Education Level', 'Job Title']
    if numerical_cols is None:
        numerical_cols = ['Age', 'Years of Experience']

    # The target is removed from X below, so it must never be listed as a
    # feature; otherwise ColumnTransformer fails on the missing column.
    categorical_cols = [col for col in categorical_cols if col != target_col]
    numerical_cols = [col for col in numerical_cols if col != target_col]

    absent = [col for col in categorical_cols + numerical_cols
              if col not in df.columns]
    if absent:
        raise ValueError(f"Feature columns not found in the data: {absent}")

    # A row without a target value cannot be used for supervised training.
    labelled = df.dropna(subset=[target_col])
    if labelled.empty:
        raise ValueError(
            f"No rows left after dropping missing values of {target_col!r}"
        )

    # Separate features and target.
    X = labelled.drop(columns=[target_col])
    y = labelled[target_col]

    # Categorical features: fill gaps with an explicit 'Missing' category,
    # then one-hot encode. Categories unseen during fit are encoded as all
    # zeros at predict time instead of raising.
    categorical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    # Numerical features: median imputation followed by standard scaling.
    numerical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Columns not listed in either group are dropped (ColumnTransformer default).
    preprocessor = ColumnTransformer(
        transformers=[
            ('categorical', categorical_steps, categorical_cols),
            ('numerical', numerical_steps, numerical_cols),
        ]
    )

    # Keeping preprocessing inside the pipeline means the imputers, encoder
    # and scaler are fitted on the training split only.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train the model.
    pipeline.fit(X_train, y_train)

    # Evaluate on both splits to expose over- or under-fitting.
    train_pred = pipeline.predict(X_train)
    test_pred = pipeline.predict(X_test)

    print(f"Train R²: {r2_score(y_train, train_pred):.3f}")
    print(f"Test R²: {r2_score(y_test, test_pred):.3f}")

    if model_path is not None:
        joblib.dump(pipeline, model_path)

    return pipeline


In [39]:
# Fit the pipeline on the loaded data, print train/test R², and save the model to 'salary_model.pkl'.
preprocessing_data(df)

Train R²: 0.889
Test R²: 0.878
