In [16]:
import csv
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import dump
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

from custom_transformers import TextSelector, NumberSelector

# Function to convert pay range strings to average values
def convert_pay_range_to_avg(pay_str):
    # implementation omitted for brevity

# Load the data
    data = []
with open('output1.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        if len(row) == 9:
            data.append(row)

df = pd.DataFrame(data, columns=['count_apps_id', 'job_id', 'job_title', 'job_description', 'min_pay', 'max_pay', 'commission', 'state', 'zip_code'])

# Convert columns to appropriate data types
df['count_apps_id'] = pd.to_numeric(df['count_apps_id'], errors='coerce')
for col in ['min_pay', 'max_pay', 'commission']:
    df[col] = df[col].replace('[\$,]', '', regex=True).apply(convert_pay_range_to_avg)
    df[col] = df[col].fillna(0)

# Define the numeric and text columns
numeric_columns = ['min_pay', 'max_pay', 'commission']
text_columns = ['job_title', 'job_description']

# Remove rows with missing values in the target variable
df = df.dropna(subset=['count_apps_id'])

# Split the data into training and test sets
X = df[numeric_columns + text_columns]
y = df['count_apps_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipelines for text feature extraction
text_pipelines = []
for col in text_columns:
    text_pipelines.append(
        (col, Pipeline([
            ('selector', TextSelector(col)),  # Select the text column
            ('tfidf', TfidfVectorizer(stop_words='english', max_features=1000))  # Convert text to TF-IDF features
        ]))
    )

# Create pipelines for numeric feature preprocessing
numeric_pipelines = []
for col in numeric_columns:
    numeric_pipelines.append(
        (col, Pipeline([
            ('selector', NumberSelector(col)),  # Select the numeric column
            ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by imputing with column mean
            ('scaler', StandardScaler())  # Scale the numeric features
        ]))
    )

# Create a feature union to combine the pipelines
preprocessor = FeatureUnion(text_pipelines + numeric_pipelines)

# Define the LightGBM model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMRegressor(random_state=42))
])

# Define the hyperparameter search space for LightGBM
param_dist = {
    'classifier__boosting_type': ['gbdt', 'dart'],
    'classifier__num_leaves': [10, 20, 30],
    'classifier__learning_rate': [0.01, 0.1, 1.0],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

# Perform random search for hyperparameter tuning
random_search = RandomizedSearchCV(model, param_distributions=param_dist, cv=5, n_iter=50, random_state=42)
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_

dump(best_model, 'job_posting_pipeline.pkl')
print("Pipeline created with the best model")

y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Train RMSE: {rmse_train}")
print(f"Test RMSE: {rmse_test}")
print(f"Train R^2: {r2_train}")
print(f"Test R^2: {r2_test}")


Pipeline created with the best model
Train RMSE: 9.580224819234159
Test RMSE: 10.656783826045933
Train R^2: 0.9794506600871795
Test R^2: 0.973003709980117
