In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Read one CSV per admission year (f1 = 2022, f2 = 2021, f3 = 2020)
f1, f2, f3 = [pd.read_csv(path) for path in ('2022.csv', '2021.csv', '2020.csv')]

# Train on the two older years stacked together; hold out 2022 as the test set
train = pd.concat(objs=[f2, f3], ignore_index=True)
test = f1

# Preprocessing
def preprocess_data(df, encoders=None):
    """
    Clean the rank columns and label-encode the categorical columns of ``df``.

    The input frame is copied first, so the caller's object is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Opening Rank', 'Closing Rank' and the categorical columns
        'Institute', 'Academic Program Name', 'Quota', 'Seat Type', 'Gender'.
    encoders : dict, optional
        Mapping of column name -> already fitted encoder (any object with a
        ``transform`` method). Columns found in this mapping are encoded with
        the supplied encoder instead of a freshly fitted one, so a second
        dataset receives the same integer codes as the data the encoders were
        fitted on. A fitted ``LabelEncoder`` raises ``ValueError`` from
        ``transform`` when it meets a label it has not seen. Columns missing
        from the mapping (or all columns when ``encoders`` is None, the
        default) get a new ``LabelEncoder`` fitted on ``df`` itself.

    Returns
    -------
    tuple(pandas.DataFrame, dict)
        The processed copy and a dict of column name -> encoder actually used.
    """
    df = df.copy()

    for col in ('Opening Rank', 'Closing Rank'):
        # Anything that cannot be parsed as a number becomes NaN ...
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # ... and NaNs are replaced by the median of this frame's column.
        # Assign the result back instead of using inplace=True on the selected
        # column: the in-place form is chained assignment, which warns on
        # pandas 2.x and does not update ``df`` under Copy-on-Write.
        df[col] = df[col].fillna(df[col].median())

    # Encode Categorical Columns
    categorical_cols = ['Institute', 'Academic Program Name', 'Quota', 'Seat Type', 'Gender']
    used_encoders = {}
    for col in categorical_cols:
        if encoders is not None and col in encoders:
            # Reuse the caller's fitted encoder so codes stay consistent
            encoder = encoders[col]
            df[col] = encoder.transform(df[col])
        else:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])
        used_encoders[col] = encoder  # Save the encoders for later use

    return df, used_encoders

# Run the preprocessing on both splits.
# NOTE(review): each preprocess_data call fits its own LabelEncoders, so the
# integer codes in `test` come from encoders fit on the test frame alone and
# are not guaranteed to line up with the codes in `train` / `train_encoders`.
train, train_encoders = preprocess_data(train)
test, _ = preprocess_data(test)

# The encoded 'Institute' column is the target; every other column is a feature
X_train, y_train = train.drop(columns='Institute'), train['Institute']
X_test, y_test = test.drop(columns='Institute'), test['Institute']

# Scale all feature columns with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost classifier; multi:softprob makes it output per-class probabilities
xgb = XGBClassifier(objective='multi:softprob', random_state=42)

# Single-point grid: GridSearchCV is effectively used for 3-fold cross-validation
param_grid = dict(
    n_estimators=[100],
    max_depth=[5],
    learning_rate=[0.1],
    subsample=[0.8],
)

grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Estimator refit by the search with the best parameter combination
best_model = grid_search.best_estimator_

# Function for Top 10 Predictions
def get_top_10_predictions(model, input_data, encoder):
    """
    Returns the top 10 predicted colleges for a given input.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``. If it also exposes
        ``classes_``, that array is used to translate probability columns
        into class labels.
    input_data : array-like
        Feature rows in the form ``model.predict_proba`` accepts.
    encoder : object
        Fitted encoder whose ``inverse_transform`` turns class labels back
        into college names.

    Returns
    -------
    The output of ``encoder.inverse_transform`` for the selected labels, as a
    flat sequence. Within each input row the entries are ordered from lowest
    to highest probability (most likely college last); rows are laid out one
    after another. When the model has fewer than 10 classes, all classes are
    returned.
    """
    probabilities = model.predict_proba(input_data)  # Get probabilities

    # argsort is ascending, so the last 10 positions per row are the most probable
    top_10_indices = np.argsort(probabilities, axis=1)[:, -10:]

    # Probability columns are positions, not labels: column j belongs to
    # model.classes_[j]. Map through classes_ so the encoder receives real
    # labels even when they are not exactly 0..n-1.
    class_labels = getattr(model, 'classes_', None)
    if class_labels is not None:
        top_10_labels = np.asarray(class_labels)[top_10_indices]
    else:
        top_10_labels = top_10_indices

    top_10_colleges = encoder.inverse_transform(top_10_labels.flatten())  # Decode labels to college names
    return top_10_colleges

# Example User Input
# The categorical fields hold label-encoded integers; the values below are
# placeholders to be replaced with the codes of the desired categories.
user_input = pd.DataFrame({
    'Academic Program Name': [0],  # Replace with correct encoded values
    'Quota': [1],                  # Replace with correct encoded values
    'Seat Type': [2],              # Replace with correct encoded values
    'Gender': [0],                 # Replace with correct encoded values
    'Opening Rank': [5000],
    'Closing Rank': [10000],
    'Year': [2022],
    'Round': [1]
})

# Preprocess User Input
# Put the columns in the same order as the frame the scaler was fitted on.
# The order written in the dict above is arbitrary; a mismatch would either be
# rejected by scikit-learn's feature-name check or, on versions without that
# check, scale each value with another column's statistics. A missing column
# surfaces here as a KeyError naming it.
user_input_scaled = scaler.transform(user_input[X_train.columns])

# Get Top 10 Predictions
top_10 = get_top_10_predictions(best_model, user_input_scaled, train_encoders['Institute'])

# Display Top 10 Colleges
print("Top 10 Predicted Colleges:")
for rank, college in enumerate(top_10[::-1], start=1):  # Reverse order for top-down ranking
    print(f"{rank}. {college}")


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Top 10 Predicted Colleges:
1. National Institute of Technology Goa
2. Indian Institute of Engineering Science and Technology, Shibpur
3. Assam University, Silchar
4. Indian Institute of Technology Bombay
5. National Institute of Technology, Warangal
6. Indian Institute of Technology Madras
7. Malaviya National Institute of Technology Jaipur
8. Indian Institute of Technology Kanpur
9. Indian Institute of Technology Delhi
10. National Institute of Technology Karnataka, Surathkal


In [2]:
import joblib

# Persist everything needed to serve predictions later: the tuned classifier,
# the per-column label encoders, and the fitted feature scaler.
joblib.dump(value=best_model, filename='admission_prediction_model.pkl')
joblib.dump(value=train_encoders, filename='encoders.pkl')
joblib.dump(value=scaler, filename='scaler1.pkl')


['scaler1.pkl']

In [3]:
# Restore the persisted classifier, label encoders and scaler from disk.
# joblib files are pickles: only load artifacts produced by a trusted source.
# Note that `scaler` is rebound to the object read back from 'scaler1.pkl'.
model = joblib.load(filename='admission_prediction_model.pkl')
encoders = joblib.load(filename='encoders.pkl')
scaler = joblib.load(filename='scaler1.pkl')
