In [15]:
import sys
assert sys.version_info >= (3, 5)
import sklearn
assert sklearn.__version__ >= "0.20"
# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import urllib.request
from sklearn.linear_model import LinearRegression
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
sdata = pd.read_csv("data/StudentsPerformance.csv")


In [3]:
sdata["average score"] = (sdata["math score"] + sdata["reading score"] + sdata["writing score"])/3


In [4]:
sdata["average score"]

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: average score, Length: 1000, dtype: float64

In [5]:

sdata.loc[[0]]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average score
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667


In [6]:
from sklearn.preprocessing import OneHotEncoder

en = OneHotEncoder(handle_unknown="ignore")
en_temp = pd.DataFrame(en.fit_transform(sdata[["gender"]]).toarray())
en_temp.columns = en.get_feature_names_out(["gender"])
sdata.drop(["gender"],axis=1,inplace=True)
sdata = sdata.join(en_temp)


en_temp = pd.DataFrame(en.fit_transform(sdata[["race/ethnicity"]]).toarray())
en_temp.columns = en.get_feature_names_out(["race/ethnicity"])
sdata.drop(["race/ethnicity"],axis=1,inplace=True)
sdata = sdata.join(en_temp)



en_temp = pd.DataFrame(en.fit_transform(sdata[["test preparation course"]]).toarray())
en_temp.columns = en.get_feature_names_out(["test preparation course"])
sdata.drop(["test preparation course"],axis=1,inplace=True)
sdata = sdata.join(en_temp)

en_temp = pd.DataFrame(en.fit_transform(sdata[["lunch"]]).toarray())
en_temp.columns = en.get_feature_names_out(["lunch"])
sdata.drop(["lunch"],axis=1,inplace=True)
sdata = sdata.join(en_temp)






In [7]:
sdata["parental level of education"] = sdata["parental level of education"].replace('some high school',0)
sdata["parental level of education"] = sdata["parental level of education"].replace('high school',1)
sdata["parental level of education"] = sdata["parental level of education"].replace('some college',2)
sdata["parental level of education"] = sdata["parental level of education"].replace("associate's degree", 3)
sdata["parental level of education"] = sdata["parental level of education"].replace("bachelor's degree" ,4)
sdata["parental level of education"] = sdata["parental level of education"].replace("master's degree",5)








In [8]:
sdata.loc[[0]]
sdata.dtypes

parental level of education            int64
math score                             int64
reading score                          int64
writing score                          int64
average score                        float64
gender_female                        float64
gender_male                          float64
race/ethnicity_group A               float64
race/ethnicity_group B               float64
race/ethnicity_group C               float64
race/ethnicity_group D               float64
race/ethnicity_group E               float64
test preparation course_completed    float64
test preparation course_none         float64
lunch_free/reduced                   float64
lunch_standard                       float64
dtype: object

In [9]:
for x in sdata.columns:
         
    print(x+" coefficient",sdata["math score"].corr(sdata[x],method="pearson"))

#Correlations for math score

parental level of education coefficient 0.159431818157356
math score coefficient 1.0
reading score coefficient 0.8175796636720539
writing score coefficient 0.802642045949808
average score coefficient 0.9187457588383446
gender_female coefficient -0.16798223810035579
gender_male coefficient 0.16798223810035579
race/ethnicity_group A coefficient -0.09197709990214445
race/ethnicity_group B coefficient -0.08425005999398143
race/ethnicity_group C coefficient -0.07338687290137136
race/ethnicity_group D coefficient 0.05007071350889431
race/ethnicity_group E coefficient 0.2058545809467559
test preparation course_completed coefficient 0.17770246930439465
test preparation course_none coefficient -0.17770246930439465
lunch_free/reduced coefficient -0.35087664559186066
lunch_standard coefficient 0.35087664559186066


In [10]:
for x in sdata.columns:
         
    print(x+" coefficient",sdata["reading score"].corr(sdata[x],method="pearson"))

#Correlations for reading score

parental level of education coefficient 0.19090826476420367
math score coefficient 0.817579663672054
reading score coefficient 0.9999999999999999
writing score coefficient 0.9545980771462477
average score coefficient 0.9703306887176952
gender_female coefficient 0.24431260787747192
gender_male coefficient -0.24431260787747192
race/ethnicity_group A coefficient -0.09627399708853399
race/ethnicity_group B coefficient -0.060283286992506575
race/ethnicity_group C coefficient -0.0030744299008463815
race/ethnicity_group D coefficient 0.03517659137232693
race/ethnicity_group E coefficient 0.10671182951929853
test preparation course_completed coefficient 0.24178043354875137
test preparation course_none coefficient -0.24178043354875137
lunch_free/reduced coefficient -0.22956032166228102
lunch_standard coefficient 0.22956032166228102


In [11]:
for x in sdata.columns:
         
    print(x+" coefficient",sdata["writing score"].corr(sdata[x],method="pearson"))

#Correlations for writing score

parental level of education coefficient 0.23671517332205233
math score coefficient 0.802642045949808
reading score coefficient 0.9545980771462477
writing score coefficient 1.0
average score coefficient 0.9656672374542071
gender_female coefficient 0.3012249355007125
gender_male coefficient -0.3012249355007125
race/ethnicity_group A coefficient -0.11071415908289269
race/ethnicity_group B coefficient -0.07825401387470705
race/ethnicity_group C coefficient -0.010202872032835975
race/ethnicity_group D coefficient 0.082031791913733
race/ethnicity_group E coefficient 0.08907680313022083
test preparation course_completed coefficient 0.312946284485956
test preparation course_none coefficient -0.312946284485956
lunch_free/reduced coefficient -0.2457686763842184
lunch_standard coefficient 0.2457686763842184


In [12]:


for x in sdata.columns:
         
    print(x+" coefficient",sdata["average score"].corr(sdata[x],method="pearson"))

#Correlations for average score

parental level of education coefficient 0.2057846688566186
math score coefficient 0.9187457588383445
reading score coefficient 0.9703306887176952
writing score coefficient 0.965667237454207
average score coefficient 1.0
gender_female coefficient 0.13086122988485377
gender_male coefficient -0.13086122988485377
race/ethnicity_group A coefficient -0.1048034150154246
race/ethnicity_group B coefficient -0.07824653624024683
race/ethnicity_group C coefficient -0.03069059943399047
race/ethnicity_group D coefficient 0.05890160613453797
race/ethnicity_group E coefficient 0.14104990062563996
test preparation course_completed coefficient 0.25670970665622217
test preparation course_none coefficient -0.25670970665622217
lunch_free/reduced coefficient -0.29006402187118774
lunch_standard coefficient 0.29006402187118774


In [13]:
from sklearn.datasets import fetch_openml 

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix , mean_squared_error, r2_score


X_train,X_test, y_train, y_test = train_test_split(sdata[["math score"]], sdata[["average score"]], random_state=0)

model = LinearRegression()

 
model.fit(X_train.values.reshape(-1,1),y_train.values.reshape(-1,1))
y_pred = model.predict(X_test)
mse =mean_squared_error(X_test,y_pred)

#confuse = confusion_matrix(y_test,y_pred)

print(r2_score(X_test,y_pred))
# cross_val_score(model, X_train, y_train, scoring="accuracy")




0.9681436996372448


