<a href="https://colab.research.google.com/github/kitty6878/DataScienceSalaryModel/blob/main/DataWaveAlgorithim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge  # Using Ridge regression for regularization
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the jobs dataset; the relative path is resolved against the working directory
data = pd.read_csv("jobs_in_data.csv")

# Column to predict, and the categorical columns used as predictors
target_variable = 'salary_in_usd'
categorical_features = [
    'job_title',
    'job_category',
    'employee_residence',
    'experience_level',
    'employment_type',
    'work_setting',
    'company_location',
    'company_size',
]

# Separate the predictors from the target
y = data[target_variable]
X = data[categorical_features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# One-hot encode the categorical columns. With handle_unknown='ignore', a
# category that was not seen during fit does not raise at predict time.
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')

# Route the categorical columns through the encoder; any column not listed
# is passed through unchanged.
preprocessor = ColumnTransformer(
    [('cat', categorical_preprocessor, categorical_features)],
    remainder='passthrough',
)

# Full pipeline: encode -> scale -> Ridge regression.
# The scaler runs with with_mean=False, i.e. it does not centre the data.
# alpha is the Ridge regularization strength; adjust as needed.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', Ridge(alpha=1.0)),
])
# Fit the whole pipeline (encoding, scaling and the Ridge regressor)
# on the training split
model.fit(X_train, y_train)

# Predict salaries for the held-out split and show the raw predictions
y_pred = model.predict(X_test)
print(y_pred)

# Score the predictions against the true held-out salaries
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
r2 = r2_score(y_test, y_pred)
print('Coefficient of Determination (R^2):', r2)

def predict_salary_in_usd():
    """Prompt for a job profile on stdin and print the predicted salary.

    One answer is read per feature, in the order of the prompts below. The
    answers are assembled into a single-row DataFrame and passed to the
    module-level ``model`` pipeline, which performs its own encoding; the
    first (only) prediction is printed.

    Answers are stripped of surrounding whitespace before prediction. The
    pipeline's encoder is created with ``handle_unknown='ignore'``, so a
    value that does not exactly match a category seen during training is
    not rejected -- a stray leading/trailing space would therefore change
    the prediction without any error being shown.

    Returns:
        None. The prediction is printed, not returned.

    Any ``Exception`` raised while reading input or predicting is reported
    with ``print`` instead of being propagated, so the interactive session
    keeps running.
    """
    # Feature name -> prompt text; dict order determines the prompt order.
    prompts = {
        'job_title': "Enter job title: ",
        'job_category': "Enter job category: ",
        'employee_residence': "Enter country of residence: ",
        'experience_level': "Enter experience level: ",
        'employment_type': "Enter type of employment: ",
        'work_setting': "Enter work setting: ",
        'company_location': "Enter company location: ",
        'company_size': "Enter company size (S, M, L): ",
    }

    try:
        # input() already returns str; strip so padded answers still match
        # the categories the encoder was fitted on.
        user_input = {
            feature: input(prompt).strip()
            for feature, prompt in prompts.items()
        }

        # The pipeline expects a DataFrame with the raw (un-encoded) columns.
        user_df = pd.DataFrame([user_input])

        predicted_salary_in_usd = model.predict(user_df)
        print("Predicted Salary:", predicted_salary_in_usd[0])
    except Exception as e:
        # Best-effort interactive boundary: report the problem and continue.
        print("Error:", e)

# Run the interactive prompt once: reads the job attributes from stdin
# and prints the model's salary prediction for them.
predict_salary_in_usd()

[166966.77258031  79811.74643595 190131.88084779 ... 129517.99606432
 130984.52298398 199028.85673848]
Mean Squared Error: 2674340773.273064
Coefficient of Determination (R^2): 0.3551357108184494
Enter job title: Data DevOps Engineer


KeyboardInterrupt: Interrupted by user