In [1]:
!pip install pandas numpy scikit-learn joblib matplotlib




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd

# Read the raw dataset from the CSV file located in the working directory.
dataset_path = "interview_prediction_dataset.csv"
df = pd.read_csv(dataset_path)

# Show the first rows as the cell's rich output for a quick sanity check.
df.head()

Unnamed: 0,Skills,GPA,Experience,Certifications,Score,Age,Industry
0,"Problem Solving, Teamwork, Java, JavaScript",3.9,7.3,4,85.73,40,IT
1,"Python, Java, Machine Learning",2.92,3.3,2,45.93,41,Healthcare
2,"Data Analysis, Machine Learning, Communication...",3.94,8.3,1,80.61,40,IT
3,"Communication, Data Analysis, Java, C++, Python",2.86,2.9,2,57.99,29,IT
4,"Teamwork, Python, Data Analysis, Leadership",3.95,2.3,2,72.2,31,IT


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

def _split_skills(raw_skills):
    """Convert one ``Skills`` cell into a list of individual skill names.

    The CSV stores skills as a single comma-separated string
    (e.g. ``"Python, Java, Machine Learning"``). MultiLabelBinarizer expects
    an iterable of labels per row; if it is handed the raw string it iterates
    over it character by character and learns letters instead of skills.

    Parameters
    ----------
    raw_skills : str | list | tuple | set | other
        Raw cell value. Strings are split on commas and stripped; values that
        are already collections are passed through as a list; anything else
        (e.g. a missing value) is treated as "no skills".

    Returns
    -------
    list of str
    """
    if isinstance(raw_skills, str):
        return [skill.strip() for skill in raw_skills.split(",") if skill.strip()]
    if isinstance(raw_skills, (list, tuple, set)):
        return list(raw_skills)
    return []


# Encode skills as a multi-hot matrix (one column per distinct skill).
# The split is done here, not in some earlier cell, so the notebook gives the
# same result on a fresh kernel (Restart & Run All).
skills_lists = [_split_skills(raw_skills) for raw_skills in df['Skills']]
skills_encoder = MultiLabelBinarizer()
skills_numeric = skills_encoder.fit_transform(skills_lists)
print("Skills numeric shape:", skills_numeric.shape)

# Scale numeric columns to zero mean / unit variance
numeric_cols = ['GPA', 'Experience', 'Certifications', 'Age']
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(df[numeric_cols])
print("Scaled numeric features shape:", numeric_scaled.shape)

# Encode industry as one-hot columns (one column per distinct industry)
industry_numeric = pd.get_dummies(df['Industry'])
print("Industry numeric shape:", industry_numeric.shape)

# Combine all features column-wise: [skills | scaled numerics | industry]
X = np.hstack([skills_numeric, numeric_scaled, industry_numeric.values])
y = df['Score']

# Every feature block must contribute exactly one row per candidate.
assert X.shape[0] == len(y), "feature matrix and target have different row counts"

print("Final feature matrix shape (X):", X.shape)
print("Target vector shape (y):", y.shape)

Skills numeric shape: (500, 10)
Scaled numeric features shape: (500, 4)
Industry numeric shape: (500, 6)
Final feature matrix shape (X): (500, 20)
Target vector shape (y): (500,)


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report how many rows/columns ended up in each partition.
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (400, 20) (400,)
Test set shape: (100, 20) (100,)


In [7]:
from sklearn.ensemble import RandomForestRegressor

# Build a random forest of 300 trees (seeded for repeatable results) and fit
# it on the training split in one step; fit() returns the estimator itself.
model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
).fit(X_train, y_train)

print("Model training completed!")

Model training completed!


In [8]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the held-out split with the fitted model.
predictions = model.predict(X_test)

# Compare predictions with the true scores:
# R2 measures goodness of fit, MAE the average absolute error in score points.
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# Report both metrics rounded to two decimals.
print("Mean Absolute Error (MAE):", round(mae, 2))
print("R2 Score:", round(r2, 2))

Mean Absolute Error (MAE): 4.95
R2 Score: 0.77


In [9]:
import joblib

# Everything needed to reproduce preprocessing + prediction later,
# keyed by the file each object is written to.
artifacts = {
    "interview_model.pkl": model,                                # trained model
    "scaler.pkl": scaler,                                        # scaler fitted on the numeric columns
    "skills_encoder.pkl": skills_encoder,                        # fitted skills encoder
    "industry_columns.pkl": industry_numeric.columns.tolist(),   # industry dummy column names
}

# Persist each object with joblib, in the order listed above.
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("All model and preprocessing files saved successfully!")

All model and preprocessing files saved successfully!
