In [103]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/salary_data_cleaned.csv')

df_model=df[['Rating','Location','Size','Type of ownership','Industry','Sector','Revenue','hourly','employer_provided','avg_salary',
       'same_state','age','python_yn','spark','aws','excel','Job Title','math', 'engineering', 'data analysis', 'AI']]

df_dum = pd.get_dummies(df_model)

# 1. Standardization
We try to bring all the variables or features to a similar scale. standarisation means centering the variable at zero.

In [104]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
df_stand=pd.DataFrame(scaler.fit_transform(df_dum),columns=df_dum.columns)

In [105]:
features=list(df_stand.columns)
X = df_stand.drop('avg_salary', axis=1)
y = df_stand['avg_salary'].values

In [106]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

Gradient Boosting Algorithm applied


In [108]:
gbm = GradientBoostingRegressor()
gbm = gbm.fit(X_train, y_train)
Y_pred = gbm.predict(X_test)
n_estimators=150
max_depth=5
loss='ls'
gbm_best_params = GradientBoostingRegressor(n_estimators=150, max_depth=5, loss='ls')
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

print("Mean Squared Error :",mean_squared_error(y_test,Y_pred))
print("Mean Absolute Error :",mean_absolute_error(y_test,Y_pred))
print("Root Mean Squared Error :",mean_squared_error(y_test,Y_pred,squared=False))

coeffofdet = r2_score(y_test, gbm.predict(X_test))
print("The Coefficient Of Determination on test set: {:.4f}".format(coeffofdet))
mape = (100 / len(y_test)) * np.sum(np.abs((y_test - gbm.predict(X_test)) / y_test))

Mean Squared Error : 0.5140278845190754
Mean Absolute Error : 0.5669597311795299
Root Mean Squared Error : 0.7169573798483948
The Coefficient Of Determination on test set: 0.5258


Decision Tree Algorithm applied

In [109]:
# Train scikit-learn Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
sklearn_regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=5, min_samples_leaf=2)
sklearn_regressor.fit(X_train, y_train)

# Make predictions using scikit-learn's model
sklearn_predictions = sklearn_regressor.predict(X_test)

# Calculate performance metrics for scikit-learn's model
mse_sklearn = mean_squared_error(y_test, sklearn_predictions)
r2_sklearn = r2_score(y_test, sklearn_predictions)
mae_sklearn = mean_absolute_error(y_test, sklearn_predictions)

print("Scikit-learn Mean Squared Error: ", mse_sklearn)
print("Scikit-learn R-squared: ", r2_sklearn)
print("Scikit-learn Mean Absolute Error: ", mae_sklearn)

Scikit-learn Mean Squared Error:  0.6602588400777225
Scikit-learn R-squared:  0.3909462096110573
Scikit-learn Mean Absolute Error:  0.5867432880874167


Linear Regression Algorithm applied

In [110]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
# Fit the model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred_lr)
mae = mean_absolute_error(y_test, y_pred_lr)
rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
r2 = r2_score(y_test, y_pred_lr)

def adjusted_r2_score(y_true, y_pred, X):
    r2 = r2_score(y_true, y_pred)
    n = X.shape[0]
    p = X.shape[1]
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

adjusted_r2 = adjusted_r2_score(y_test, y_pred_lr, X_test)

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred_lr)
explained_variance = explained_variance_score(y_test, y_pred_lr)

print("Mean Squared Error :", mse)
print("Mean Absolute Error :", mae)
print("Root Mean Squared Error :", rmse)
print("R-squared (R²) Score:", r2)
print("Adjusted R-squared (Adjusted R²) Score:", adjusted_r2)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Explained Variance Score:", explained_variance)#write linear regression code

Mean Squared Error : 2.418923738183627e+27
Mean Absolute Error : 20023330069236.793
Root Mean Squared Error : 49182555222188.555
R-squared (R²) Score: -2.2313289606680678e+27
Adjusted R-squared (Adjusted R²) Score: 7.37135460220701e+26
Mean Absolute Percentage Error (MAPE): 2.84127247590694e+16
Explained Variance Score: -2.2169614862976435e+27


Random Forest Algorithm applied

In [111]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Train and evaluate scikit-learn's RandomForestRegressor
sklearn_regressor = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1)
sklearn_regressor.fit(X_train, y_train)
predictions_sklearn = sklearn_regressor.predict(X_test)

mse_sklearn = mean_squared_error(y_test, predictions_sklearn)
r2_sklearn = r2_score(y_test, predictions_sklearn)
mae_sklearn = mean_absolute_error(y_test, predictions_sklearn)

print("Mean Squared Error (Scikit-learn): ", mse_sklearn)
print("R-squared (Scikit-learn): ", r2_sklearn)
print("Mean Absolute Error (Scikit-learn): ", mae_sklearn)

Mean Squared Error (Scikit-learn):  0.5017567774047497
R-squared (Scikit-learn):  0.5371559628103875
Mean Absolute Error (Scikit-learn):  0.5410440065032274


# 2.Guassian Transformation (square root transfromation)


In [112]:
import scipy.stats as stat
import pylab

df = pd.read_csv('/content/salary_data_cleaned.csv')

features = ['Rating','Location','Size','Type of ownership','Industry','Sector','Revenue','hourly','employer_provided','avg_salary',
       'same_state','age','python_yn','spark','aws','excel','Job Title','math', 'engineering', 'data analysis', 'AI']

# Select the relevant features from the dataframe
df_model = df[features].copy()

# Separate numerical and categorical features
numerical_features = df_model.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df_model.select_dtypes(exclude=[np.number]).columns.tolist()

# Create a new dataframe for the transformed numerical features
transformed_numerical = pd.DataFrame()

# Apply square root transformation to numerical features
for feature in numerical_features:
    transformed_numerical[feature + '_sqroot'] = np.sqrt(df_model[feature].abs())  # Ensure non-negative values

# Combine transformed numerical features with categorical features
df_transformed = pd.concat([transformed_numerical, df_model[categorical_features]], axis=1)

# Define X and y
X = df_transformed.drop('avg_salary_sqroot', axis=1)
y = df_transformed['avg_salary_sqroot'].values

Gradient Boosting Algorithm applied

In [113]:
gbm = GradientBoostingRegressor()
gbm = gbm.fit(X_train, y_train)
Y_pred = gbm.predict(X_test)
n_estimators=150
max_depth=5
loss='ls'
gbm_best_params = GradientBoostingRegressor(n_estimators=150, max_depth=5, loss='ls')
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

print("Mean Squared Error :",mean_squared_error(y_test,Y_pred))
print("Mean Absolute Error :",mean_absolute_error(y_test,Y_pred))
print("Root Mean Squared Error :",mean_squared_error(y_test,Y_pred,squared=False))

coeffofdet = r2_score(y_test, gbm.predict(X_test))
print("The Coefficient Of Determination on test set: {:.4f}".format(coeffofdet))
mape = (100 / len(y_test)) * np.sum(np.abs((y_test - gbm.predict(X_test)) / y_test))

Mean Squared Error : 0.5122576029218844
Mean Absolute Error : 0.5693811052959401
Root Mean Squared Error : 0.7157217356779689
The Coefficient Of Determination on test set: 0.5275


Decision Tree algorithm applied

In [114]:

# Train scikit-learn Decision Tree Regressor
sklearn_regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=5, min_samples_leaf=2)
sklearn_regressor.fit(X_train, y_train)

# Make predictions using scikit-learn's model
sklearn_predictions = sklearn_regressor.predict(X_test)

# Calculate performance metrics for scikit-learn's model
mse_sklearn = mean_squared_error(y_test, sklearn_predictions)
r2_sklearn = r2_score(y_test, sklearn_predictions)
mae_sklearn = mean_absolute_error(y_test, sklearn_predictions)

print("Scikit-learn Mean Squared Error: ", mse_sklearn)
print("Scikit-learn R-squared: ", r2_sklearn)
print("Scikit-learn Mean Absolute Error: ", mae_sklearn)

Scikit-learn Mean Squared Error:  0.6517994396029174
Scikit-learn R-squared:  0.3987495582538283
Scikit-learn Mean Absolute Error:  0.5804057512069669


Linear Regression algorithm applied

In [115]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

# Fit the model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred_lr)
mae = mean_absolute_error(y_test, y_pred_lr)
rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
r2 = r2_score(y_test, y_pred_lr)

def adjusted_r2_score(y_true, y_pred, X):
    r2 = r2_score(y_true, y_pred)
    n = X.shape[0]
    p = X.shape[1]
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

adjusted_r2 = adjusted_r2_score(y_test, y_pred_lr, X_test)

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred_lr)
explained_variance = explained_variance_score(y_test, y_pred_lr)

print("Mean Squared Error :", mse)
print("Mean Absolute Error :", mae)
print("Root Mean Squared Error :", rmse)
print("R-squared (R²) Score:", r2)
print("Adjusted R-squared (Adjusted R²) Score:", adjusted_r2)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Explained Variance Score:", explained_variance)

Mean Squared Error : 2.418923738183627e+27
Mean Absolute Error : 20023330069236.793
Root Mean Squared Error : 49182555222188.555
R-squared (R²) Score: -2.2313289606680678e+27
Adjusted R-squared (Adjusted R²) Score: 7.37135460220701e+26
Mean Absolute Percentage Error (MAPE): 2.84127247590694e+16
Explained Variance Score: -2.2169614862976435e+27


Random Forest algorithm applied

In [117]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Train and evaluate scikit-learn's RandomForestRegressor
sklearn_regressor = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1)
sklearn_regressor.fit(X_train, y_train)
predictions_sklearn = sklearn_regressor.predict(X_test)

mse_sklearn = mean_squared_error(y_test, predictions_sklearn)
r2_sklearn = r2_score(y_test, predictions_sklearn)
mae_sklearn = mean_absolute_error(y_test, predictions_sklearn)

print("Mean Squared Error (Scikit-learn): ", mse_sklearn)
print("R-squared (Scikit-learn): ", r2_sklearn)
print("Mean Absolute Error (Scikit-learn): ", mae_sklearn)

Mean Squared Error (Scikit-learn):  0.5017567774047497
R-squared (Scikit-learn):  0.5371559628103875
Mean Absolute Error (Scikit-learn):  0.5410440065032274
