In [None]:
"""
tpot
Question-26
# Hands-on: predict data scientists' salaries in India (Kaggle dataset) using scikit-learn
dataset: Predict-The-Data-Scientists-Salary-In-India_Train_Dataset.csv
Data Features:
    Name of the company (Encoded)
    Years of experience(split to min and max experience)
    Job description
    Job designation
    Job Type
    Key skills
    Location (needs label encoding)
    Salary in lakhs of rupees (target to be predicted; needs label encoding)
    
Find the best estimator among the following:
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier(n_estimators=100)
clf3 = ExtraTreesClassifier(n_estimators=100)
clf4 =  AdaBoostClassifier(n_estimators=100)
clf5 =  GradientBoostingClassifier(n_estimators=100)
"""

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [31]:
# Read the training CSV (relative path, so it must sit in the working directory)
# and preview the first rows.
dataset_csv = "Predict-The-Data-Scientists-Salary-In-India_Train_Dataset.csv"
df = pd.read_csv(dataset_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,experience,job_description,job_desig,job_type,key_skills,location,salary,company_name_encoded
0,0,5-7 yrs,Exp: Minimum 5 years;Good understanding of IOC...,Senior Exploit and Vulnerability Researcher,,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),6to10,3687
1,1,10-17 yrs,He should have handled a team of atleast 5-6 d...,Head SCM,,"ppc, logistics, inventory management, supply c...",Sonepat,10to15,458
2,2,5-9 yrs,Must be an effective communicator (written & s...,Deputy Manager - Talent Management & Leadershi...,Analytics,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,15to25,4195
3,3,7-10 yrs,7 - 10 years of overall experience in data e...,Associate Manager Data Engineering,Analytics,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,10to15,313
4,4,1-3 yrs,Chartered Accountancy degree or MBA in Finance...,TS- GSA- Senior Analyst,,"accounting, finance, cash flow, financial plan...",Gurgaon,3to6,1305


In [32]:
# Split experience into min_experience and max_experience
def parse_experience(exp_str):
    """Split an experience string such as ``"5-7 yrs"`` into ``(min, max)`` years.

    The ``yrs`` suffix and all spaces are stripped, then the remainder is
    split on ``-``. Exactly two parts are read as ``(min, max)``; any other
    number of parts uses the first part for both bounds.

    Args:
        exp_str: Raw experience value; it is converted with ``str()`` first,
            so non-string inputs (e.g. ``None`` or NaN) are accepted.

    Returns:
        tuple[int, int]: ``(min_years, max_years)``, or ``(0, 0)`` when the
        text cannot be parsed as integers.
    """
    try:
        cleaned = str(exp_str).replace(' yrs', '').replace('yrs', '').replace(' ', '')
        bounds = cleaned.split('-')
        low = int(bounds[0])
        high = int(bounds[1]) if len(bounds) == 2 else low
    except ValueError:
        # Only unparsable numbers fall back to (0, 0); any unrelated error
        # propagates instead of being hidden by a bare ``except``.
        return 0, 0
    return low, high

# Parse every row once, then transpose the (min, max) pairs into two columns.
experience_bounds = df['experience'].map(parse_experience)
min_years, max_years = zip(*experience_bounds)
df['min_experience'] = min_years
df['max_experience'] = max_years

In [33]:
# Integer-encode the 'location' feature and the 'salary' target.
# Values are cast to str first, so any non-string entry is encoded via its
# string form. Each encoder is kept so the labels can be mapped back later.
location_text = df['location'].astype(str)
le_location = LabelEncoder()
df['location_encoded'] = le_location.fit_transform(location_text)

salary_text = df['salary'].astype(str)
le_salary = LabelEncoder()
df['salary_encoded'] = le_salary.fit_transform(salary_text)

In [34]:
# Replace missing job_type entries with an explicit "Unknown" category
job_type_filled = df['job_type'].fillna("Unknown")
df['job_type'] = job_type_filled

In [42]:
# Column groups: every candidate feature, and the numeric subset that is scaled.
feature_cols = [
    'company_name_encoded', 'min_experience', 'max_experience',
    'job_description', 'job_desig', 'job_type', 'key_skills',
    'location_encoded',
]
numeric_cols = [
    'company_name_encoded', 'min_experience', 'max_experience',
    'location_encoded',
]

# Target: the label-encoded salary band.
y = df['salary_encoded']

# X keeps the raw text columns as well; in the cells visible here only the
# scaled numeric matrix is passed on to the models.
X = df[feature_cols].copy()

# Missing numeric values become 0 before standardisation.
X_numeric = df[numeric_cols].fillna(0)
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

In [37]:
# Train/test split
# Split the *unscaled* numeric features first, then fit the scaler on the
# training rows only, so test-set statistics do not leak into preprocessing.
# The seed and test_size are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [40]:
# Classifiers
# Every estimator below is stochastic (random tie-breaking, bootstrapping or
# subsampling), so each gets a fixed seed; without one the accuracies, and
# possibly the winner, change from run to run.
clf1 = DecisionTreeClassifier(random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = ExtraTreesClassifier(n_estimators=100, random_state=42)
clf4 = AdaBoostClassifier(n_estimators=100, random_state=42)
clf5 = GradientBoostingClassifier(n_estimators=100, random_state=42)
clfs = [clf1, clf2, clf3, clf4, clf5]
names = ['DecisionTree', 'RandomForest', 'ExtraTrees', 'AdaBoost', 'GradientBoosting']

# Fit each model on the training split and record its hold-out accuracy by name.
results = {}
for name, clf in zip(names, clfs):
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    results[name] = acc
    print(f"{name}: accuracy = {acc:}")

DecisionTree: accuracy = 0.3516788689724817
RandomForest: accuracy = 0.35597071446604395
ExtraTrees: accuracy = 0.348649330977026
AdaBoost: accuracy = 0.3756627114365059
GradientBoosting: accuracy = 0.43145670285281496


In [41]:
# Best model
# Take the entry with the highest recorded accuracy; on a tie the estimator
# that was evaluated first wins.
best_name, _ = max(results.items(), key=lambda item: item[1])
print(f"Best estimator: {best_name} (accuracy={results[best_name]:})")

Best estimator: GradientBoosting (accuracy=0.43145670285281496)
