In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_csv('salaries.csv')
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [3]:
# columns to be used for X
X = df.drop('salary_more_then_100k', axis = 1)
# columns for Y to predict
y = df[df.columns[-1]]

In [4]:
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=100)

In [5]:
# Double check to select categorical columns
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Keep selected columns only
X_train = X_train[categorical_cols].copy()
X_valid = X_valid[categorical_cols].copy()

In [6]:
categorical_transformer = Pipeline(steps=[
        ('imputer' , SimpleImputer(strategy = 'constant')),
        ('ohe' , OneHotEncoder(handle_unknown = 'ignore'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_cols)
    ], remainder = 'passthrough')

In [7]:
model = RandomForestRegressor(n_estimators=100, random_state=10)

In [8]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model train', model)
                             ])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
y_preds = pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, y_preds)
print('MAE:', score)

MAE: 0.39
