<a href="https://colab.research.google.com/github/ishahmshah1025/SIH-2024/blob/main/Prediction%20Models/compliance_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.utils import resample

np.random.seed(42)

n_samples = 1000
data = {
    'infrastructure_quality': np.random.randint(1, 10, n_samples),
    'faculty_qualifications': np.random.randint(1, 10, n_samples),
    'student_performance': np.random.randint(1, 10, n_samples),
    'financial_stability': np.random.randint(1, 10, n_samples),
    'inspection_score': np.random.randint(1, 10, n_samples),  # Scale 1-10
}

# Label 0: Non-compliant, Label 1: Compliant
threshold = 7
labels = (
    (data['infrastructure_quality'] + data['faculty_qualifications'] +
     data['student_performance'] + data['financial_stability'] +
     data['inspection_score']) / 5 >= threshold
).astype(int)

# Create a DataFrame from the data
df = pd.DataFrame(data)
df['label'] = labels


print(df['label'].value_counts())

# Balancing the dataset by oversampling the minority class
df_majority = df[df['label'] == 0]
df_minority = df[df['label'] == 1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,  # sample with replacement
                                 n_samples=len(df_majority),  # match majority class
                                 random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(df_balanced['label'].value_counts())

X = df_balanced.drop('label', axis=1)
y = df_balanced['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 2, 5]
}

xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nTest Accuracy:", accuracy)
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

# gridsearch cv
def predict_compliance():
    print("\nEnter the following details to predict compliance status:")
    infrastructure_quality = int(input("Infrastructure Quality (1-10): "))
    faculty_qualifications = int(input("Faculty Qualifications (1-10): "))
    student_performance = int(input("Student Performance (1-10): "))
    financial_stability = int(input("Financial Stability (1-10): "))
    inspection_score = int(input("Inspection Score (1-10): "))

    # input part
    input_data = pd.DataFrame({
        'infrastructure_quality': [infrastructure_quality],
        'faculty_qualifications': [faculty_qualifications],
        'student_performance': [student_performance],
        'financial_stability': [financial_stability],
        'inspection_score': [inspection_score]
    })


    prediction = best_xgb_model.predict(input_data)[0]

    # Output
    status = "Compliant" if prediction == 1 else "Non-compliant"
    print(f"\nThe institution is predicted to be: {status}")


predict_compliance()


label
0    952
1     48
Name: count, dtype: int64
label
0    952
1    952
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 1.0}
Best Cross-Validation Score: 0.9947452901917714

Test Accuracy: 0.9912587412587412

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       284
           1       0.98      1.00      0.99       288

    accuracy                           0.99       572
   macro avg       0.99      0.99      0.99       572
weighted avg       0.99      0.99      0.99       572


Confusion Matrix:
 [[279   5]
 [  0 288]]

Enter the following details to predict compliance status:
