In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
# Load data
# Work on a 10,000-row random subset of the CSV. The sampler is seeded so the
# same rows are drawn on every Restart & Run All, which keeps the split, the
# fitted model and the reported scores reproducible.
df = pd.read_csv("student placement data.csv").sample(10000, random_state=42)

In [3]:
# Feature frame: everything in df except the two target-like columns and the
# attributes listed below, which are not used as model inputs.
X = df.drop(
    labels=[
        "Salary Range Expected",
        "Suggested Job Role",
        "Acedamic percentage in Operating Systems",
        "percentage in Algorithms",
        "Percentage in Programming Concepts",
        "Percentage in Software Engineering",
        "Percentage in Computer Networks",
        "Percentage in Electronics Subjects",
        "Percentage in Computer Architecture",
        "Percentage in Mathematics",
        "Percentage in Communication skills",
        "Hours working per day",
        "can work long time before system?",
        "talenttests taken?",
        "olympiads",
        "Job/Higher Studies?",
        "Taken inputs from seniors or elders",
        "interested in games",
        "In a Realtionship?",
        "Gentle or Tuff behaviour?",
        "Salary/work",
    ],
    axis="columns",
)

In [4]:
# Target variable: the job role to predict (this column is excluded from X).
y = df["Suggested Job Role"]

In [5]:
# Sanity check: (rows, columns) of the sampled frame.
df.shape

(10000, 39)

In [6]:
# Sanity check: (rows, columns) of the feature frame after dropping columns.
X.shape

(10000, 18)

In [7]:
# Label encoding for target variable
# Fit on the raw column instead of on `y` so this cell is idempotent: if it is
# re-run, the encoder is never refitted on already-encoded integers, which
# would make le.inverse_transform return numbers instead of job-role names.
le = LabelEncoder()
y = le.fit_transform(df["Suggested Job Role"])

In [8]:
# Hold out 20% of the rows for evaluation; the shuffle uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# "number" matches every numeric dtype (int32, int64, float64, ...). Selecting
# only "int64" would leave other numeric columns out of both lists, and the
# ColumnTransformer would then silently drop them from the model inputs.
num_col = X.select_dtypes(include="number").columns.tolist()
obj_col = X.select_dtypes(include=["object"]).columns.tolist()

In [10]:
# Numeric features: fill missing values with the column median, then rescale
# with MinMaxScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)


In [11]:
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fitting are ignored at
# transform time instead of raising.
obj_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [12]:
# Route each column group through its own preprocessing pipeline.
processor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_col),
        ("obj", obj_pipeline, obj_col),
    ]
)

In [13]:
# Transform the data
# The processor is fitted on the training split only; the test split is then
# transformed with those already-fitted steps.
X_train_processed = processor.fit_transform(X_train)
X_test_processed = processor.transform(X_test)

In [14]:
# Define models for evaluation
# Maps a display name to an unfitted estimator. The forest is seeded so that
# its scores and the pickled model are reproducible across runs.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
}


In [15]:
# Evaluate models function (ensure it's defined once)
def evaluate_models(X_train, y_train, X_test, y_test, models):
    """Fit each model on the training data and score it on both splits.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``/``score``.
    X_test, y_test : held-out features and targets passed to ``score``.
    models : dict mapping a name to an object exposing ``fit(X, y)`` and
        ``score(X, y)``. The objects are fitted in place.

    Returns
    -------
    dict
        ``{name: [train_score, test_score]}`` in the iteration order of ``models``.
    """

    def _fit_and_score(estimator):
        # Fit first, then score the training split before the test split.
        estimator.fit(X_train, y_train)
        return [
            estimator.score(X_train, y_train),
            estimator.score(X_test, y_test),
        ]

    return {label: _fit_and_score(estimator) for label, estimator in models.items()}

In [16]:
# Fit every candidate on the processed training data and collect its
# [train score, test score] pair, keyed by model name.
model_scores = evaluate_models(
    X_train=X_train_processed,
    y_train=y_train,
    X_test=X_test_processed,
    y_test=y_test,
    models=models,
)

In [17]:
# Pick the candidate with the highest test-set score (index 1 of each pair)
# and fetch the corresponding fitted estimator.
best_model_name = max(model_scores, key=lambda candidate: model_scores[candidate][1])
best_model = models[best_model_name]

In [18]:
# Persist the winning estimator. It was fitted on the processor's output, so
# any input must go through the same processor before being passed to it.
with open("best_student_job_role_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"{best_model_name} model saved successfully!")

Random Forest model saved successfully!


In [19]:
# Persist the fitted label encoder so predicted class indices can be mapped
# back to job-role names later.
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(le, encoder_file)

In [20]:
# Inference demo: reload the pickled artifacts and predict one example.
# (pickle and pandas are already imported in the first cell.)

# Load the saved model and label encoder from disk, so this cell checks that
# both pickles round-trip instead of relying on the in-memory objects.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open("best_student_job_role_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

with open("label_encoder.pkl", "rb") as f:
    loaded_le = pickle.load(f)

# Example input data for prediction: one value per feature column, wrapped in
# a list so that pandas builds a single-row DataFrame.
# Replace this with your actual input data
input_data = {
    "Logical quotient rating": [7],
    "hackathons": [5],
    "coding skills rating": [3],
    "public speaking points": [2],
    "self-learning capability?": ["no"],
    "Extra-courses did": ["yes"],
    "certifications": ["python"],
    "workshops": ["web technologies"],
    "reading and writing skills": ["medium"],
    "memory capability score": ["poor"],
    "Interested subjects": ["data engineering"],
    "interested career area ": ["developer"],
    "Type of company want to settle in?": ["Testing and Maintainance Services"],
    "Interested Type of Books": ["Cookbooks"],
    "Management or Technical": ["Management"],
    "hard/smart worker": ["smart worker"],
    "worked in teams ever?": ["no"],
    "Introvert": ["no"]
}

# Convert input data into DataFrame
input_df = pd.DataFrame(input_data)

# Preprocess the input with the same fitted processor used for training.
# The in-memory processor is used here because it is only pickled in a later cell.
input_processed = processor.transform(input_df)

# Make predictions
predictions = loaded_model.predict(input_processed)

# Decode the predicted class indices back to the original job-role labels
predicted_labels = loaded_le.inverse_transform(predictions)

# Display the predicted job role
print(f"Predicted Job Role: {predicted_labels[0]}")


Predicted Job Role: Network Security Administrator


In [21]:
# Persist the fitted preprocessing step so new inputs can be transformed the
# same way as the training data.
with open("processor.pkl", "wb") as processor_file:
    pickle.dump(processor, processor_file)

print("Processor saved successfully!")

Processor saved successfully!
