In [86]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [75]:
df=pd.read_csv("c:\\Users\\Hariprasad\\Documents\\Admission-prediction\\data\\raw\\data.csv")

In [76]:
import os
os.getcwd()

'c:\\Users\\Hariprasad\\Documents\\Admission-prediction\\notebooks'

In [77]:
df.head()

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,in_college
0,Academic,A,Male,Less Interested,Urban,56,6950000,83.0,84.09,False,True
1,Academic,A,Male,Less Interested,Urban,57,4410000,76.8,86.91,False,True
2,Academic,B,Female,Very Interested,Urban,50,6500000,80.6,87.43,False,True
3,Vocational,B,Male,Very Interested,Rural,49,6600000,78.2,82.12,True,True
4,Academic,A,Female,Very Interested,Urban,57,5250000,75.1,86.79,False,False


In [79]:
for col in df.select_dtypes(include="object").columns:
    print(f"Unique values in column '{col}':")
    print(df[col].unique())
    print()  # For better readability


Unique values in column 'type_school':
['Academic' 'Vocational']

Unique values in column 'school_accreditation':
['A' 'B']

Unique values in column 'gender':
['Male' 'Female']

Unique values in column 'interest':
['Less Interested' 'Very Interested' 'Uncertain' 'Not Interested'
 'Quiet Interested']

Unique values in column 'residence':
['Urban' 'Rural']



In [80]:
x=df.drop(columns='in_college')
y=df['in_college']

In [81]:
train_x,test_x,train_y,test_y=train_test_split(x,y)

In [82]:


# Define each pipeline with StandardScaler adjusted for sparse matrices
type_school_pipe1 = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
    ("scaler", StandardScaler(with_mean=False))  # Adjusted for sparse matrices
])

school_accreditation_pipeline = Pipeline(steps=[
    ("encoder", OrdinalEncoder(categories=[['A', 'B']]))
])

gender_pipe1 = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
    ("scaler", StandardScaler(with_mean=False))  # Adjusted for sparse matrices
])

interest_pipeline = Pipeline(steps=[
    ("encoder", OrdinalEncoder(categories=[
        ['Not Interested', 'Less Interested', 'Very Interested', 'Uncertain', 'Quiet Interested']
    ])),
    ("scaler", StandardScaler(with_mean=False))  # Adjusted for sparse matrices
])

residence_pipe1 = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
    ("scaler", StandardScaler(with_mean=False))  # Adjusted for sparse matrices
])

parent_age_pipe1 = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True))  # StandardScaler works fine with dense data
])

parent_salary_pipe1 = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True))  # StandardScaler works fine with dense data
])

average_grades_pipe1 = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True))  # StandardScaler works fine with dense data
])

parent_in_college_pipe1 = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
    ("scaler", StandardScaler(with_mean=False))  # Adjusted for sparse matrices
])

# Combine all pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("type_school", type_school_pipe1, ['type_school']),
        ("school_accreditation", school_accreditation_pipeline, ['school_accreditation']),
        ("gender", gender_pipe1, ['gender']),
        ("interest", interest_pipeline, ['interest']),
        ("residence", residence_pipe1, ['residence']),
        ("parent_age", parent_age_pipe1, ['parent_age']),
        ("parent_salary", parent_salary_pipe1, ['parent_salary']),
        ("average_grades", average_grades_pipe1, ['average_grades']),
        ("parent_in_college", parent_in_college_pipe1, ['parent_was_in_college'])
    ],
    remainder='passthrough'  # This will pass through the columns not explicitly mentioned in transformers
)


# Example usage


In [83]:
train_x=preprocessor.fit_transform(train_x)
test_x=preprocessor.transform(test_x)

In [84]:
models={'lgr':LogisticRegression(),
        'dt':DecisionTreeClassifier(),
        'rf':RandomForestClassifier()}

In [97]:


for name, model in models.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    
    # Compute metrics
    conf_matrix = confusion_matrix(test_y, pred)
    acc = accuracy_score(test_y, pred)
    prec = precision_score(test_y, pred) 
    rec = recall_score(test_y, pred)      

    print(f"Metrics for {name}:\n"
          f"Confusion Matrix:\n{conf_matrix}\n"
          f"Accuracy: {acc:.2f}\n"
          f"Precision: {prec:.2f}\n"
          f"Recall: {rec:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Metrics for lgr:
Confusion Matrix:
[[109  22]
 [ 15 104]]
Accuracy: 0.85
Precision: 0.83
Recall: 0.87
Metrics for dt:
Confusion Matrix:
[[100  31]
 [ 19 100]]
Accuracy: 0.80
Precision: 0.76
Recall: 0.84
Metrics for rf:
Confusion Matrix:
[[113  18]
 [ 14 105]]
Accuracy: 0.87
Precision: 0.85
Recall: 0.88
