In [13]:
import pandas as pd
import numpy as np

# Cleaning the Data

In [40]:
df=pd.read_csv("candidate_salary_data.csv")

In [41]:
df.head()

Unnamed: 0,degree_type,technical_skills,soft_skills,cgpa,projects,internships,communication_level,technical_skill_level,years_of_experience,specialization,package
0,PhD,2,8,7.53,4,0,5,3,2,App Development,13.82
1,MCA,7,10,8.22,7,4,2,4,2,App Development,19.18
2,B.Sc,9,9,9.55,0,1,5,3,1,Cybersecurity,13.19
3,PhD,10,1,8.54,1,2,2,5,3,Cybersecurity,16.68
4,BCA,18,1,5.23,6,2,5,5,2,Data Science,13.73


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   degree_type            10000 non-null  object 
 1   technical_skills       10000 non-null  int64  
 2   soft_skills            10000 non-null  int64  
 3   cgpa                   10000 non-null  float64
 4   projects               10000 non-null  int64  
 5   internships            10000 non-null  int64  
 6   communication_level    10000 non-null  int64  
 7   technical_skill_level  10000 non-null  int64  
 8   years_of_experience    10000 non-null  int64  
 9   specialization         8552 non-null   object 
 10  package                10000 non-null  float64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [43]:
df=df.dropna()

In [44]:
df=df.replace(["", " ", "None", "none", "nan", None],np.nan)

In [45]:
df.groupby("degree_type")["internships"].count()

degree_type
B.Sc      1240
B.Tech    1228
BCA       1189
M.Sc      1220
M.Tech    1226
MCA       1260
PhD       1189
Name: internships, dtype: int64

In [46]:
degre_map={
    "B.Sc":1,
    "B.Tech":2,
    "BCA":3,
    "M.Sc":4,
    "M.Tech":5,
    "MCA":6,
    "PhD":7
}
    

In [47]:
df["degree_type"]=df["degree_type"].map(degre_map)

In [51]:
df["degree_type"]=df["degree_type"].astype("int64")

In [67]:
spmap={
    'App Development':1,
    'Cybersecurity':2,
    'Data Science':3,
    'Cloud Computing':4,
    'AI/ML':5,
    'Web Development':6
}


In [69]:
df["specialization"]=df["specialization"].map(spmap)

In [119]:
# bucketting cgpa
def bucketcgpa(data):
    if data<6:
        return 1
    elif data<8:
        return 2
    else:
        return 3

df["cgpa"]=df["cgpa"].apply(bucketcgpa)

In [137]:
df["project_exp"]=df["years_of_experience"]*df["projects"]

In [184]:
df = df.drop('cgpa_skill_interaction', axis=1)


In [185]:
cols = [col for col in df.columns if col != 'package'] + ['package']
df = df[cols]

In [188]:
df.head()

Unnamed: 0,degree_type,technical_skills,soft_skills,cgpa,projects,internships,communication_level,technical_skill_level,years_of_experience,specialization,project_exp,package
0,7,2,8,2,4,0,5,3,2,1,8,13.82
1,6,7,10,3,7,4,2,4,2,1,14,19.18
2,1,9,9,3,0,1,5,3,1,2,0,13.19
3,7,10,1,3,1,2,2,5,3,2,3,16.68
4,3,18,1,1,6,2,5,5,2,3,12,13.73


In [190]:
df.to_csv("salary_train.csv",index=False)

# Train The Model

In [191]:
data=pd.read_csv("salary_train.csv")

In [192]:
data.head()

Unnamed: 0,degree_type,technical_skills,soft_skills,cgpa,projects,internships,communication_level,technical_skill_level,years_of_experience,specialization,project_exp,package
0,7,2,8,2,4,0,5,3,2,1,8,13.82
1,6,7,10,3,7,4,2,4,2,1,14,19.18
2,1,9,9,3,0,1,5,3,1,2,0,13.19
3,7,10,1,3,1,2,2,5,3,2,3,16.68
4,3,18,1,1,6,2,5,5,2,3,12,13.73


In [193]:
from sklearn.ensemble import RandomForestRegressor

In [194]:
model=RandomForestRegressor( 
    n_estimators=200,         # More trees
    max_depth=10,             # Limit depth to avoid overfitting
    min_samples_split=5,      # Control splits
    min_samples_leaf=3,       # Prevent very small leaves
    random_state=42,
    n_jobs=-1   )

In [195]:
x=data.iloc[ : , :-1]

In [196]:
y=data.iloc[ : ,-1]

In [197]:
model.fit(x,y)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Test The model

In [198]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error 

In [199]:
test_data=pd.read_csv("salary_test.csv")

In [200]:
x_test=test_data.iloc[ : , :-1]

In [201]:
y_test=test_data.iloc[ : ,-1]

In [202]:
y_predict=model.predict(x_test)

In [203]:
y_predict

array([12.85439731, 15.22105775, 14.68085299, 18.42758661, 11.74003801,
       16.12239767, 17.11296905, 14.77812884, 14.61715065, 16.95267337,
       14.37964247, 13.38699644, 16.27983643, 15.57218852, 13.13320239,
       15.28972113, 17.69342503, 13.07116597, 18.29811529, 14.00753168,
       17.34382633, 15.3106329 , 18.01884529, 13.67377975, 18.02068996,
       13.49917989, 15.50319339, 14.48749683, 15.89562797, 15.63397726,
       16.48695768, 14.71630736, 16.23446268, 14.0129801 , 17.75950031,
       15.04338632, 14.52803266, 14.5909333 , 16.07750826, 17.22517086,
       15.43276486, 14.97505807, 15.57604127, 13.26527307, 15.96945912,
       17.05425346, 15.82823146, 16.34641121, 13.72084941, 17.88469985,
       16.47851334, 14.43576894, 13.75250757, 17.75007391])

In [204]:
print(f"r2_score:{r2_score(y_test,y_predict)}")

r2_score:0.7342906026626983


In [206]:
print("RMSE:", mean_squared_error(y_test, y_predict))

RMSE: 1.3883004011910884


# Saving the model

In [207]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'salary_model.pkl')


['salary_model.pkl']