In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): with no category or module argument this hides every warning
# for the rest of the session, including ones raised by the modelling libraries
# used below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')


In [2]:
# Read the Students.csv file (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Students.csv')


In [3]:
# Size of the loaded frame as (rows, columns).
df.shape

(3614, 16)

In [4]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,Student_Name,College_Name,Stream,Year_of_Study,AI_Tools_Used,Daily_Usage_Hours,Use_Cases,Trust_in_AI_Tools,Impact_on_Grades,Do_Professors_Allow_Use,Preferred_AI_Tool,Awareness_Level,Willing_to_Pay_for_Access,State,Device_Used,Internet_Access
0,Aarav,Indian Institute of Information Technology,Engineering,4,Gemini,0.9,"Assignments, Coding Help",2,2,No,Copilot,9,Yes,Uttar pradesh,Mobile,Poor
1,Vivaan,"Government Ram Bhajan Rai NES College, Jashpur",Commerce,2,ChatGPT,3.4,Learning new topics,3,-3,Yes,Other,6,No,Chhattisgarh,Laptop,Poor
2,Aditya,Dolphin PG Institute of BioMedical & Natural,Science,2,Copilot,3.6,"MCQ Practice, Projects",5,0,No,Gemini,1,No,Uttarakhand,Tablet,Poor
3,Vihaan,Shaheed Rajguru College of Applied Sciences for,Arts,2,Copilot,2.9,Content Writing,5,2,Yes,Gemini,5,No,Delhi ncr,Laptop,High
4,Arjun,Roorkee College of Engineering,Science,1,Gemini,0.9,"Doubt Solving, Resume Writing",1,3,Yes,Other,8,Yes,Uttarakhand,Laptop,Medium


In [5]:
# Per-column dtypes and non-null counts. In the recorded output `State` has only
# 2000 non-null values out of 3614 rows; it is not among the model features below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3614 entries, 0 to 3613
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Student_Name               3614 non-null   object 
 1   College_Name               3614 non-null   object 
 2   Stream                     3614 non-null   object 
 3   Year_of_Study              3614 non-null   int64  
 4   AI_Tools_Used              3614 non-null   object 
 5   Daily_Usage_Hours          3614 non-null   float64
 6   Use_Cases                  3614 non-null   object 
 7   Trust_in_AI_Tools          3614 non-null   int64  
 8   Impact_on_Grades           3614 non-null   int64  
 9   Do_Professors_Allow_Use    3614 non-null   object 
 10  Preferred_AI_Tool          3614 non-null   object 
 11  Awareness_Level            3614 non-null   int64  
 12  Willing_to_Pay_for_Access  3614 non-null   object 
 13  State                      2000 non-null   objec

In [6]:
# Feature columns, grouped by the preprocessing they receive.
numeric_features = [
    'Year_of_Study',
    'Daily_Usage_Hours',
    'Trust_in_AI_Tools',
    'Awareness_Level',
]
categorical_features = [
    'Stream',
    'Do_Professors_Allow_Use',
    'Preferred_AI_Tool',
]

# Standardise the numeric columns and one-hot encode the categorical ones
# (dropping the first level of each). Columns not listed above are not passed
# to either transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ]
)

In [7]:
# Pipelines: every model gets the same preprocessing step followed by its classifier.
# All three reference the single `preprocessor` object defined above.
dt_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=42))
])

rf_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

xgb_pipeline = Pipeline([
    ('preprocess', preprocessor),
    # `use_label_encoder` is a deprecated XGBClassifier argument that recent
    # XGBoost releases no longer use; the target is integer-encoded with
    # LabelEncoder before fitting, so the flag is simply dropped.
    ('clf', XGBClassifier(eval_metric='mlogloss', random_state=42))
])

In [8]:
# Separate the target column from the remaining columns.
target_col = 'Impact_on_Grades'
X = df.drop(columns=target_col)
y = df[target_col]

# Map the target values onto consecutive integer class labels.
# (The raw column contains negative values, e.g. -3 in the preview above.)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Hyper-parameter grids (keys use the pipeline's `clf__` step prefix)

dt_params = {
    # 'log_loss' is intentionally not listed: scikit-learn's decision tree treats
    # it as the same Shannon-entropy criterion as 'entropy', so including both
    # only repeats identical fits, and the name is rejected by scikit-learn
    # releases older than 1.1.
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [2, 3, 4, 5, 6],
    'clf__min_samples_split': [2, 3, 4],
    'clf__min_samples_leaf': [1, 2, 3]
}
rf_params = {'clf__n_estimators': [100], 'clf__max_depth': [5, 10]}
xgb_params = {'clf__n_estimators': [100], 'clf__max_depth': [3, 5], 'clf__learning_rate': [0.1, 0.2]}

# Grid search, selecting by accuracy.
# Note the decision tree uses 5-fold CV while the two ensembles use 3-fold.
dt_grid = GridSearchCV(dt_pipeline, dt_params, cv=5, scoring='accuracy')
rf_grid = GridSearchCV(rf_pipeline, rf_params, cv=3, scoring='accuracy')
xgb_grid = GridSearchCV(xgb_pipeline, xgb_params, cv=3, scoring='accuracy')

# Display label -> search object, in the order they are fitted and reported.
searches = [
    ("Decision Tree", dt_grid),
    ("RandomForest", rf_grid),
    ("XGBoost", xgb_grid),
]

# Fit every search on the training split first ...
for _, search in searches:
    search.fit(X_train, y_train)

# ... then report accuracy on the held-out test split.
for label, search in searches:
    print(f"{label} Accuracy:", search.score(X_test, y_test))

Decision Tree Accuracy: 0.2544951590594744
RandomForest Accuracy: 0.6417704011065007
XGBoost Accuracy: 0.5822959889349931


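# More Improvement Needed

Test accuracy is about 0.25 for the decision tree, 0.64 for the random forest and 0.58 for XGBoost, so none of the models is reliable yet.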