In [77]:
# Importing all the necessary files
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [78]:
#Read the dataFram from the csv file
df = pd.read_csv('student_performance.csv')

In [79]:
#Display the first 5 rows of the dataFrame
df.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade
0,1,John,Male,85,15,78,1,High,80
1,2,Sarah,Female,90,20,85,2,Medium,87
2,3,Alex,Male,78,10,65,0,Low,68
3,4,Michael,Male,92,25,90,3,High,92
4,5,Emma,Female,88,18,82,2,Medium,85


In [80]:
#Display the information of the dataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   StudentID                  10 non-null     int64 
 1   Name                       10 non-null     object
 2   Gender                     10 non-null     object
 3   AttendanceRate             10 non-null     int64 
 4   StudyHoursPerWeek          10 non-null     int64 
 5   PreviousGrade              10 non-null     int64 
 6   ExtracurricularActivities  10 non-null     int64 
 7   ParentalSupport            10 non-null     object
 8   FinalGrade                 10 non-null     int64 
dtypes: int64(6), object(3)
memory usage: 852.0+ bytes


In [81]:
# Drop the Name column

#Name isnt relevant to the dataset so we can drop it. We can use StudentID instead of it
df.drop('Name', axis=1, inplace=True)

#inplace makes the dataset drop the column permanently

In [82]:
df.head()

Unnamed: 0,StudentID,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade
0,1,Male,85,15,78,1,High,80
1,2,Female,90,20,85,2,Medium,87
2,3,Male,78,10,65,0,Low,68
3,4,Male,92,25,90,3,High,92
4,5,Female,88,18,82,2,Medium,85


In [83]:
# Convert Gender to binary

le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

In [84]:
df.head()
#This shows the Male gender is turned to 1 and Female gender is turned to 0

Unnamed: 0,StudentID,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade
0,1,1,85,15,78,1,High,80
1,2,0,90,20,85,2,Medium,87
2,3,1,78,10,65,0,Low,68
3,4,1,92,25,90,3,High,92
4,5,0,88,18,82,2,Medium,85


In [85]:
# Convert ParentalSupport to binary

df['ParentalSupport'] = le.fit_transform(df['ParentalSupport'])

In [86]:
df.tail()

Unnamed: 0,StudentID,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade
5,6,0,95,30,88,1,0,90
6,7,1,70,8,60,0,1,62
7,8,0,85,17,77,1,2,78
8,9,1,82,12,70,2,1,72
9,10,0,91,22,86,3,0,88


In [87]:
df.head(10)

Unnamed: 0,StudentID,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade
0,1,1,85,15,78,1,0,80
1,2,0,90,20,85,2,2,87
2,3,1,78,10,65,0,1,68
3,4,1,92,25,90,3,0,92
4,5,0,88,18,82,2,2,85
5,6,0,95,30,88,1,0,90
6,7,1,70,8,60,0,1,62
7,8,0,85,17,77,1,2,78
8,9,1,82,12,70,2,1,72
9,10,0,91,22,86,3,0,88


In [88]:
df.drop('Gender', axis=1, inplace=True)

In [89]:
#split 70-30 to train and test data


X = df.drop(['FinalGrade', 'StudentID'], axis=1)
y = df['FinalGrade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [90]:
X

Unnamed: 0,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport
0,85,15,78,1,0
1,90,20,85,2,2
2,78,10,65,0,1
3,92,25,90,3,0
4,88,18,82,2,2
5,95,30,88,1,0
6,70,8,60,0,1
7,85,17,77,1,2
8,82,12,70,2,1
9,91,22,86,3,0


In [91]:
y

0    80
1    87
2    68
3    92
4    85
5    90
6    62
7    78
8    72
9    88
Name: FinalGrade, dtype: int64

In [92]:
# Train the model

model_rf = RandomForestClassifier()

model_rf.fit(X_train, y_train)

# Predict the model

y_pred = model_rf.predict(X_test)

# Evaluate the model

accuracy_score(y_test, y_pred)



0.0

In [93]:
#Decision Tree
model_dt = DecisionTreeClassifier(random_state=2)

model_dt.fit(X_train, y_train)

y_pred = model_dt.predict(X_test)

accuracy_score(y_test, y_pred)



0.0

In [94]:
# Linear Regression

model_lr = LinearRegression()

model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)

y_pred = [round(i) for i in y_pred]

accuracy_score(y_test, y_pred)



0.3333333333333333

In [95]:
# Tabular show of the results

results = pd.DataFrame({
    'Model': ['Random Forest', 'Decision Tree', 'Linear Regression'],
    'Accuracy': [accuracy_score(y_test, model_rf.predict(X_test)),
                 accuracy_score(y_test, model_dt.predict(X_test)),
                 accuracy_score(y_test, y_pred)]
})

results

Unnamed: 0,Model,Accuracy
0,Random Forest,0.0
1,Decision Tree,0.0
2,Linear Regression,0.333333
