<a href="https://colab.research.google.com/github/keshavisha/Job_placement/blob/main/Model_Parthib_Isha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

In [None]:
import sys
# Overrides the interpreter-wide traceback depth setting.
# NOTE(review): the Python docs describe `sys.tracebacklimit` as taking an
# integer (0 or less suppresses the traceback); assigning None presumably
# leaves the default depth in effect — confirm whether 0 was intended here.
sys.tracebacklimit = None

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from catboost import  CatBoostRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
import joblib

# ---------------------------------------------------------------------------
# Train a two-stage placement model and persist every fitted artifact:
#   1. LogisticRegression on PCA components -> placement status
#   2. CatBoostRegressor on the same components -> min-max scaled salary
#      (placed candidates only, salaries below SALARY_CAP)
# ---------------------------------------------------------------------------

# Placed candidates with a salary at or above this value are excluded from the
# regression training data.
SALARY_CAP = 400000

# Load the dataset
# NOTE(review): the file carries an .xls extension but is parsed as CSV;
# presumably its contents are plain CSV — confirm against the data source.
dataset = pd.read_csv('Placement_Data_Full_Class.xls')
dataset = dataset.drop(['sl_no'], axis=1)

# One-hot encode the categorical features. The encoded frame reuses the
# dataset index so the column-wise concat below lines rows up explicitly.
categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cols = pd.DataFrame(
    encoder.fit_transform(dataset[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=dataset.index,
)
dataset = pd.concat([dataset.drop(columns=categorical_cols), encoded_cols], axis=1)

# Split the dataset into features and target variables
features = dataset.drop(['status', 'salary'], axis=1)
y = dataset['status']

# Split BEFORE fitting PCA so the held-out rows cannot influence the
# projection that the classifier is later evaluated on.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=3
)

# Fit the PCA model on the training rows only, then project every subset.
# The salary split further down is drawn independently, so some of its test
# rows may be part of this fit; the salary target itself never enters PCA.
pca = PCA(n_components=7)
pca.fit(features_train)
X_train = pca.transform(features_train)
X_test = pca.transform(features_test)
X = pca.transform(features)  # full projected matrix, kept under its original name

# Train the classification model
clf_model = LogisticRegression()
clf_model.fit(X_train, y_train)

# Evaluate the performance of the classification model
clf_accuracy = clf_model.score(X_test, y_test)

# Filter the data for candidates who were placed, below the salary cap.
# Features and target are taken from the same filtered frame so they cannot
# drift out of alignment.
placed_data = dataset[dataset['status'] == 'Placed']
placed_data1 = placed_data[placed_data['salary'] < SALARY_CAP]
X_placed = pca.transform(placed_data1.drop(['status', 'salary'], axis=1))
salary = placed_data1['salary'].values.reshape(-1, 1)

# Split the placed data into training and testing sets
X_placed_train, X_placed_test, salary_train, salary_test = train_test_split(
    X_placed, salary, test_size=0.2, random_state=42
)

# Normalize the target variable. The scaler learns min/max from the training
# salaries only; test salaries are merely transformed (and may therefore fall
# slightly outside [0, 1]).
scaler = MinMaxScaler()
y_placed_train = scaler.fit_transform(salary_train)
y_placed_test = scaler.transform(salary_test)
y_placed = scaler.transform(salary)

# Train the regression model on the placed data.
# verbose=500 prints one progress line every 500 iterations instead of 7500.
# NOTE(review): the recorded training log shows the learn loss at 0.0 long
# before the last iteration, which suggests overfitting — consider fewer
# iterations or an eval_set with early stopping.
reg_model = CatBoostRegressor(
    learning_rate=0.4,
    reg_lambda=0.24,
    loss_function='RMSE',
    iterations=7500,
    verbose=500,
)
reg_model.fit(X_placed_train, y_placed_train)

# Evaluate the performance of the regression model (both metrics are computed
# on the min-max scaled salary, not on raw currency units).
y_placed_pred = reg_model.predict(X_placed_test)
print("MSE:", mean_squared_error(y_placed_test, y_placed_pred))
print("R2:", r2_score(y_placed_test, y_placed_pred))
print("Classification accuracy: {:.2f}%".format(clf_accuracy * 100))

# Save the models, encoder, PCA object and target scaler
joblib.dump(encoder, 'placement_encoder.joblib')
joblib.dump(clf_model, 'placement_classifier.joblib')
joblib.dump(reg_model, 'placement_regressor.joblib')
joblib.dump(pca, 'placement_pca.joblib')
joblib.dump(scaler, 'placement_scaler.joblib')
#OUTPUT: Streaming output truncated to the last 5000 lines.
#2502:	learn: 0.0000000	total: 2.29s	remaining: 4.58s
#2503:	learn: 0.0000000	total: 2.29s	remaining: 4.58s
#2504:	learn: 0.0000000	total: 2.3s	remaining: 4.58s
#2505:	learn: 0.0000000	total: 2.3s	remaining: 4.58s
#2506:	learn: 0.0000000	total: 2.3s	remaining: 4.58s

In [None]:
def predict_salary(record, model_dir='/content'):
    """Predict placement status and, if placed, the salary for one candidate.

    Parameters
    ----------
    record : dict
        One candidate. Must contain the seven categorical keys listed in
        ``categorical_cols`` below plus every numeric feature the PCA was
        fitted on. Key order does not matter; keys that are not model
        features are ignored.
    model_dir : str, optional
        Directory holding the ``placement_*.joblib`` artifacts. Defaults to
        ``'/content'``, the location this function has always read from.

    Returns
    -------
    float or str
        The salary in original units (scaler inverse-transformed) when the
        classifier predicts ``'Placed'``, otherwise the string
        ``"Not placed"``.

    Raises
    ------
    KeyError
        If a categorical key is missing from ``record``.
    ValueError
        If a feature the PCA was fitted on is missing from ``record``.
    """
    import os  # local import: keeps this cell runnable on its own

    def _load(filename):
        # joblib.load unpickles arbitrary objects: only point model_dir at
        # artifacts you produced yourself.
        return joblib.load(os.path.join(model_dir, filename))

    pca = _load('placement_pca.joblib')
    encoder = _load('placement_encoder.joblib')
    clf_model = _load('placement_classifier.joblib')
    reg_model = _load('placement_regressor.joblib')
    scaler = _load('placement_scaler.joblib')

    # Convert the record to a dataframe with a single row
    df = pd.DataFrame([record])

    # One-hot encode the categorical features
    categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
    encoded_cols = pd.DataFrame(
        encoder.transform(df[categorical_cols]),
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index,
    )
    features = pd.concat([df.drop(columns=categorical_cols), encoded_cols], axis=1)

    # Put the columns in the order the PCA saw at fit time. Without this the
    # column order follows the insertion order of `record`, so a reordered
    # (or over-complete) dict is rejected by the feature-name check.
    expected_columns = getattr(pca, 'feature_names_in_', None)
    if expected_columns is not None:
        missing = [name for name in expected_columns if name not in features.columns]
        if missing:
            raise ValueError(
                "record is missing required feature(s): {}".format(", ".join(map(str, missing)))
            )
        features = features[list(expected_columns)]

    # Apply the PCA transformation to the input record
    transformed_record = pca.transform(features)

    # Predict the placement status
    classification_prediction = clf_model.predict(transformed_record)[0]
    print(classification_prediction)

    if classification_prediction == 'Placed':
        # Predict the scaled salary, then map it back to original units
        regression_prediction = reg_model.predict(transformed_record)[0]
        predicted_salary = scaler.inverse_transform(
            np.array(regression_prediction).reshape(-1, 1)
        )[0][0]
        return predicted_salary
    else:
        return "Not placed"


In [None]:
# Sample candidate used to smoke-test the saved pipeline: seven categorical
# fields followed by five numeric fields.
record = dict(
    # categorical fields (one-hot encoded inside predict_salary)
    gender='M',
    ssc_b='Central',
    hsc_b='Central',
    hsc_s='Science',
    degree_t='Sci&Tech',
    workex='No',
    specialisation='Mkt&Fin',
    # numeric fields
    ssc_p=83.0,
    hsc_p=88.0,
    degree_p=87.0,
    etest_p=83.0,
    mba_p=88.8,
)
# Restore the original key order, since predict_salary derives the feature
# column order from the dict's insertion order.
record = {
    key: record[key]
    for key in (
        'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s',
        'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p',
    )
}

# Last expression of the cell, so the notebook displays the returned value.
predict_salary(record)

Placed


275425.16043760837