<a href="https://colab.research.google.com/github/kan0222/DATA-SCIENCE-PROJECTS/blob/main/CAT_3_student_enrollment_model%2C%2C_logistic_regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed NumPy's global generator so every synthetic draw below is reproducible.
np.random.seed(42)

print("Creating student dataset...")
n_students = 2000

# Draw the raw columns one at a time. The order is significant: every draw
# consumes the same global random stream, so reordering would change the data.
data = {'student_id': range(n_students)}
data['gpa'] = np.random.normal(3.2, 0.4, n_students)
data['sat_score'] = np.random.normal(1150, 100, n_students)
data['sessions_attended'] = np.random.randint(0, 4, n_students)
data['contact_counselor'] = np.random.choice([0, 1], n_students, p=[0.6, 0.4])
data['distance_miles'] = np.random.exponential(30, n_students)
data['first_gen'] = np.random.choice([0, 1], n_students, p=[0.7, 0.3])

df = pd.DataFrame(data)

# Clamp the continuous columns to their (lower, upper) bounds.
for column, (lower, upper) in {
    'gpa': (2.0, 4.0),
    'sat_score': (850, 1600),
    'distance_miles': (1, 150),
}.items():
    df[column] = df[column].clip(lower, upper)

def calculate_enrollment_chance(row):
    """Return a noisy enrollment score for a single student record.

    ``row`` is indexed by column name and must provide 'gpa', 'sat_score',
    'sessions_attended', 'contact_counselor' and 'distance_miles'.

    The score is a weighted sum of rescaled features (GPA 0.4, SAT 0.3,
    sessions attended 0.2, counselor contact 0.1), minus 0.1 when the
    student lives more than 50 miles away, plus Gaussian noise (mean 0,
    std 0.1) drawn from NumPy's global random state.
    """
    gpa_term = 0.4 * (row['gpa'] - 2.0) / 2.0
    sat_term = 0.3 * (row['sat_score'] - 850) / 750
    sessions_term = 0.2 * row['sessions_attended'] / 3
    counselor_term = 0.1 * row['contact_counselor']
    # The comparison yields a boolean, so the penalty is either 0.1 or 0.
    far_away_penalty = 0.1 * (row['distance_miles'] > 50)

    deterministic_part = (
        gpa_term + sat_term + sessions_term + counselor_term - far_away_penalty
    )
    # Exactly one draw from the global generator per call, made last.
    noise = np.random.normal(0, 0.1)
    return deterministic_part + noise

# Score every student row by row, then label as enrolled when the noisy
# score exceeds 0.5.
df['enrollment_score'] = df.apply(calculate_enrollment_chance, axis=1)
df['enrolled'] = df['enrollment_score'].gt(0.5).astype(int)

enrollment_rate = df['enrolled'].mean()
print("Dataset created: {} students".format(len(df)))
print("Enrollment rate: {:.1%}".format(enrollment_rate))

# Model inputs (in training column order) and the binary target.
features = [
    'gpa',
    'sat_score',
    'sessions_attended',
    'contact_counselor',
    'distance_miles',
    'first_gen',
]
X = df.loc[:, features]
y = df['enrolled']

# Hold out 20% of the students for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nTraining model...")
# fit() returns the estimator itself, so construction and fitting chain.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions plus the probability of the positive class
# (column 1 of predict_proba).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy: {:.1%}".format(accuracy))
print("\nClassification Report:")
class_labels = ['Not Enroll', 'Enroll']
print(classification_report(y_test, y_pred, target_names=class_labels))


def get_risk_category(probability):
    """Bucket a predicted enrollment probability into an interest label.

    Returns "High Interest" for probabilities of at least 0.7,
    "Medium Interest" for at least 0.4, and "Needs Outreach" otherwise.
    """
    # Cut-offs are checked from highest to lowest; the first one met wins.
    cutoffs = (
        (0.7, "High Interest"),
        (0.4, "Medium Interest"),
    )
    for minimum, label in cutoffs:
        if probability >= minimum:
            return label
    return "Needs Outreach"

def get_recommendation(probability, sessions, distance):
    """Pick an outreach action for a student.

    - probability >= 0.7: "Send welcome package".
    - 0.4 <= probability < 0.7: "Invite to info session" when ``sessions``
      equals 0, otherwise "Follow-up call".
    - below 0.4: "Virtual tour + housing info" when ``distance`` exceeds 50,
      otherwise "Personalized outreach".
    """
    if probability >= 0.7:
        return "Send welcome package"

    if probability >= 0.4:
        return "Invite to info session" if sessions == 0 else "Follow-up call"

    return "Virtual tour + housing info" if distance > 50 else "Personalized outreach"


# Assemble a per-student results table on a copy of the held-out features,
# leaving X_test itself untouched.
test_results = X_test.copy()
test_results['enrollment_prob'] = y_pred_proba
test_results['risk_category'] = list(map(get_risk_category, y_pred_proba))
# map() walks the three sequences in lockstep, one student at a time.
test_results['recommendation'] = list(
    map(
        get_recommendation,
        y_pred_proba,
        test_results['sessions_attended'],
        test_results['distance_miles'],
    )
)
test_results['actual'] = y_test.to_numpy()

section_rule = "=" * 50
print("\n" + section_rule)
print("STUDENT ENROLLMENT PREDICTION RESULTS")
print(section_rule)

print("\nRisk Category Distribution:")
category_counts = test_results['risk_category'].value_counts()
print(category_counts)

print("\nSample Predictions (first 10 students):")
display_columns = [
    'gpa',
    'sat_score',
    'sessions_attended',
    'enrollment_prob',
    'risk_category',
    'recommendation',
]
sample_display = test_results[display_columns].head(10)
print(sample_display.round(3))

print("\nFeature Importance:")
# Pair each feature with the fitted model's importance, largest first.
importance_df = pd.DataFrame(
    {'feature': features, 'importance': model.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)
print(importance_df)


def predict_new_student(gpa, sat, sessions, contacted, distance, first_gen):
    """Predict enrollment for a new student.

    Args:
        gpa: Grade point average (training data is clipped to 2.0-4.0).
        sat: SAT score (training data is clipped to 850-1600).
        sessions: Number of info sessions attended (0-3 in training data).
        contacted: 1 if the student contacted a counselor, else 0.
        distance: Distance in miles (training data is clipped to 1-150).
        first_gen: 1 if the student is first-generation, else 0.

    Returns:
        dict with keys:
            'enrollment_probability': probability of the positive class as a
                plain Python float rounded to 3 decimals,
            'risk_category': label from ``get_risk_category``,
            'recommended_action': action from ``get_recommendation``.
    """
    # The model was fitted on a DataFrame, so pass a one-row DataFrame with
    # the same column labels. A bare nested list makes scikit-learn warn about
    # missing feature names and leaves the value order unchecked.
    student_data = pd.DataFrame(
        [[gpa, sat, sessions, contacted, distance, first_gen]],
        columns=features,
    )
    # Column 1 of predict_proba is the positive class; float() keeps NumPy
    # scalar types out of the returned dict.
    probability = float(model.predict_proba(student_data)[0, 1])
    risk = get_risk_category(probability)
    recommendation = get_recommendation(probability, sessions, distance)

    return {
        'enrollment_probability': round(probability, 3),
        'risk_category': risk,
        'recommended_action': recommendation
    }

section_rule = "=" * 50

print("\n" + section_rule)
print("NEW STUDENT PREDICTION EXAMPLE")
print(section_rule)

# Example applicant, passed to the predictor as keyword arguments.
example_profile = {
    'gpa': 3.8,
    'sat': 1350,
    'sessions': 2,
    'contacted': 1,
    'distance': 25,
    'first_gen': 0,
}
new_student = predict_new_student(**example_profile)

print("Student Profile: GPA=3.8, SAT=1350, 2 info sessions, contacted counselor")
print("Prediction: {}".format(new_student))

print("\n" + section_rule)
print("SUMMARY")
print(section_rule)

# One line per takeaway, printed in order.
summary_lines = (
    "✅ Model trained on {} students".format(len(X_train)),
    "✅ Accuracy: {:.1%} on test set".format(accuracy),
    "✅ Risk categories: High Interest, Medium Interest, Needs Outreach",
    "✅ Personalized recommendations generated",
    "✅ Ready for deployment",
)
for summary_line in summary_lines:
    print(summary_line)

Creating student dataset...
Dataset created: 2000 students
Enrollment rate: 45.2%

Training model...

Model Accuracy: 79.8%

Classification Report:
              precision    recall  f1-score   support

  Not Enroll       0.77      0.83      0.80       194
      Enroll       0.83      0.77      0.80       206

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400


STUDENT ENROLLMENT PREDICTION RESULTS

Risk Category Distribution:
risk_category
Needs Outreach     177
High Interest      128
Medium Interest     95
Name: count, dtype: int64

Sample Predictions (first 10 students):
        gpa  sat_score  sessions_attended  enrollment_prob    risk_category  \
1860  3.146   1104.805                  0             0.02   Needs Outreach   
353   3.136   1013.404                  3             0.49  Medium Interest   
1333  3.796   1207.407                  1             0.82    High Int

