In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')

In [2]:
dfc = pd.read_csv('Salary_prediction_data.csv')
original = dfc.copy()

In [3]:
dfc = dfc.drop(columns=['Unnamed: 0','StudentId'])

In [4]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CGPA                        10000 non-null  float64
 1   Major Projects              10000 non-null  int64  
 2   Workshops/Certificatios     10000 non-null  int64  
 3   Mini Projects               10000 non-null  int64  
 4   Skills                      10000 non-null  int64  
 5   Communication Skill Rating  10000 non-null  float64
 6   Internship                  10000 non-null  object 
 7   Hackathon                   10000 non-null  object 
 8   12th Percentage             10000 non-null  int64  
 9   10th Percentage             10000 non-null  int64  
 10  backlogs                    10000 non-null  int64  
 11  PlacementStatus             10000 non-null  object 
 12  salary                      10000 non-null  int64  
dtypes: float64(2), int64(8), object(

In [5]:
dfc.describe()

Unnamed: 0,CGPA,Major Projects,Workshops/Certificatios,Mini Projects,Skills,Communication Skill Rating,12th Percentage,10th Percentage,backlogs,salary
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,7.69801,1.0492,2.0266,1.0132,7.5552,4.32396,69.1594,74.5015,1.7384,374935.0
std,0.640131,0.665901,0.867968,0.904272,0.927922,0.411622,10.430459,8.919527,1.39512,473364.5
min,6.5,0.0,0.0,0.0,6.0,3.0,55.0,57.0,0.0,0.0
25%,7.4,1.0,1.0,0.0,7.0,4.0,59.0,67.0,1.0,0.0
50%,7.7,1.0,2.0,1.0,8.0,4.4,70.0,73.0,1.0,0.0
75%,8.2,1.0,3.0,2.0,8.0,4.7,78.0,83.0,3.0,900000.0
max,9.1,2.0,3.0,3.0,9.0,4.8,90.0,88.0,7.0,1300000.0


In [6]:
dfc.shape

(10000, 13)

In [7]:
dfc.describe(include = 'O').T

Unnamed: 0,count,unique,top,freq
Internship,10000,2,Yes,5854
Hackathon,10000,2,Yes,7318
PlacementStatus,10000,2,NotPlaced,5803


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import pickle

**encoding & train test split & scaling**

In [9]:
le = LabelEncoder()

dfc['Internship'] = le.fit_transform(dfc['Internship'])  # Yes=1, No=0
dfc['Hackathon'] = le.fit_transform(dfc['Hackathon'])    # Yes=1, No=0
dfc['PlacementStatus'] = le.fit_transform(dfc['PlacementStatus'])  # Placed=1, NotPlaced=0

X = dfc.drop(columns=['PlacementStatus', 'salary'])  
y_placement = dfc['PlacementStatus']  
y_salary = dfc['salary']  

X_train, X_test, y_train_placement, y_test_placement, y_train_salary, y_test_salary = train_test_split(
    X, y_placement, y_salary, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**random forest model**

*classification*

In [10]:
rf_placement = RandomForestClassifier(random_state=42)
rf_placement.fit(X_train_scaled, y_train_placement)

y_pred_placement = rf_placement.predict(X_test_scaled)
accuracy = accuracy_score(y_test_placement, y_pred_placement)
print(f'Accuracy of Placement Status Prediction: {accuracy * 100:.2f}%')


Accuracy of Placement Status Prediction: 93.50%


*regression*

In [11]:
rf_salary = RandomForestRegressor(random_state=42)
rf_salary.fit(X_train_scaled, y_train_salary)

y_pred_salary = rf_salary.predict(X_test_scaled)
mse = mean_squared_error(y_test_salary, y_pred_salary)
print(f'Mean Squared Error of Salary Prediction: {mse:.2f}')


Mean Squared Error of Salary Prediction: 39346075168.57


In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train_salary)
print("Best parameters:", grid_search.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [13]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, roc_auc_score

rf_salary_best = RandomForestRegressor(
    max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200, random_state=42
)
rf_salary_best.fit(X_train_scaled, y_train_salary)

y_pred_salary_best = rf_salary_best.predict(X_test_scaled)

r2 = r2_score(y_test_salary, y_pred_salary_best)
print(f'R² Score: {r2:.4f}')

mae = mean_absolute_error(y_test_salary, y_pred_salary_best)
print(f'Mean Absolute Error (MAE): {mae:.2f}')

mse = mean_squared_error(y_test_salary, y_pred_salary_best)
print(f'Mean Squared Error (MSE): {mse:.2f}')



R² Score: 0.8352
Mean Absolute Error (MAE): 91779.32
Mean Squared Error (MSE): 36603122319.08


In [14]:
with open('rf_placement_model.pkl', 'wb') as f:
    pickle.dump(rf_placement, f)

with open('rf_salary_model.pkl', 'wb') as f:
    pickle.dump(rf_salary_best, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
