In [155]:
import pandas as pd
import numpy as np

In [156]:
data = pd.read_csv("stud.csv")
df = data.copy()
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [157]:
df1 = data.copy()

### No missing values as such in the dataset

In [158]:
df.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [159]:
y = df.iloc[:,5:6]
y

Unnamed: 0,math_score
0,72
1,69
2,90
3,47
4,76
...,...
995,88
996,62
997,59
998,68


In [160]:
df.drop("math_score",axis=1,inplace=True)

In [161]:
x = df

In [162]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


### Lets define a correlation finder function for columns in the df

In [163]:
y["math_score"]

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [164]:
y.dtypes

math_score    int64
dtype: object

In [165]:
from scipy.stats import pearsonr
from scipy.stats import f_oneway

In [166]:
def correlation_finder(x,y):
    
    con_cols = [col for col in x.columns if x[col].dtype != "object"]
    cat_cols = [col for col in x.columns if x[col].dtype == "object"]
    yName = [col for col in y.columns]
    
    corr = {}
    
    if y.dtypes[0] != "object":
        # the target is continuous in nature
        # hence we need to check for pearson correlation and anova
        allAtOnceCols = {}        
        for col in x[con_cols].columns:
            allAtOnceCols[col] = pearsonr(x[col],np.array(y[yName]).reshape(-1))[1]
        for col in x[cat_cols].columns:
            # corr[col] = f_oneway(x[col],np.array(y[yName]).reshape(-1))
            # allAtOnceCols = [x[x[col] == name] for name in cat_cols]
            # allAtOnceCols.append(x[cat_cols].columns)
        # allAtOnceCols = [*x[cat_cols]]
            categories = [cat for cat in x[col].value_counts().index]
            # print(categories)
            allAtOnceCols[col]  = f_oneway(*[df1[df1[col] == cat][yName] for cat in categories])[1][0]
        # return allAtOnceCols
        correlated = {}
        for k, p_val in allAtOnceCols.items():
            if p_val < 0.5:
                correlated[k] = p_val
        return correlated
            # return y[yName]
            # return x[col]
    # return x[con_cols].shape,y.shape
    # return allAtOnceCols

    # return cat_cols


In [167]:
correlation_finder(x=x,y =y)

{'reading_score': 1.7877531099061487e-241,
 'writing_score': 3.376027042567673e-226,
 'gender': 9.120185549332254e-08,
 'race_ethnicity': 1.3732194030370688e-11,
 'parental_level_of_education': 5.592272384107223e-06,
 'lunch': 2.4131955993147374e-30,
 'test_preparation_course': 1.5359134607155386e-08}

In [168]:
df


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [169]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [170]:
y

Unnamed: 0,math_score
0,72
1,69
2,90
3,47
4,76
...,...
995,88
996,62
997,59
998,68


In [171]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [172]:
x_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57
...,...,...,...,...,...,...,...
106,female,group D,master's degree,standard,none,100,100
270,male,group C,bachelor's degree,standard,none,63,61
860,female,group C,associate's degree,standard,none,62,53
435,male,group C,some college,free/reduced,completed,48,53


In [173]:
x_train.dtypes

gender                         object
race_ethnicity                 object
parental_level_of_education    object
lunch                          object
test_preparation_course        object
reading_score                   int64
writing_score                   int64
dtype: object

In [174]:
x_train.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'reading_score', 'writing_score'],
      dtype='object')

In [175]:
x_train["parental_level_of_education"].value_counts()

parental_level_of_education
some college          182
associate's degree    179
high school           159
some high school      137
bachelor's degree      96
master's degree        47
Name: count, dtype: int64

In [176]:
cat_cols = [col for col in x_train.columns if x_train[col].dtype == "object"]
con_cols = [col for col in x_train.columns if x_train[col].dtype != "object"]


In [177]:
cat_cols

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [178]:
con_cols

['reading_score', 'writing_score']

In [179]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
sc = StandardScaler()
oe = OneHotEncoder(handle_unknown="ignore")

In [180]:
x_train[con_cols] = sc.fit_transform(x_train[con_cols])
x_test[con_cols] = sc.transform(x_test[con_cols])

In [181]:
x_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
29,female,group D,master's degree,standard,none,0.030791,0.434053
535,female,group C,bachelor's degree,free/reduced,completed,0.930290,0.964701
695,female,group D,some college,free/reduced,none,1.345443,1.163694
557,male,group C,master's degree,free/reduced,none,-0.176786,-0.162925
836,male,group E,high school,standard,none,-0.384363,-0.759904
...,...,...,...,...,...,...,...
106,female,group D,master's degree,standard,none,2.106557,2.092328
270,male,group C,bachelor's degree,standard,none,-0.453555,-0.494580
860,female,group C,associate's degree,standard,none,-0.522747,-1.025228
435,male,group C,some college,free/reduced,completed,-1.491438,-1.025228


In [182]:
x_test

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
521,female,group C,associate's degree,standard,none,1.137866,1.031032
737,female,group B,some college,free/reduced,completed,-0.245978,0.301391
740,male,group D,bachelor's degree,standard,none,0.238367,0.235060
660,male,group C,some college,free/reduced,none,0.515136,0.301391
411,male,group E,some college,standard,completed,0.930290,0.633046
...,...,...,...,...,...,...,...
408,female,group D,high school,free/reduced,completed,-0.868708,-0.826235
332,male,group E,associate's degree,standard,completed,-0.937901,-1.025228
208,female,group B,some college,free/reduced,none,0.791905,0.500384
613,female,group C,associate's degree,standard,none,0.515136,0.367722


In [183]:
x_train["test_preparation_course"].value_counts()

test_preparation_course
none         521
completed    279
Name: count, dtype: int64

In [184]:
x_train["test_preparation_course"] = x_train["test_preparation_course"].replace("none",0)
x_train["test_preparation_course"] = x_train["test_preparation_course"].replace("completed",1)
x_test["test_preparation_course"] = x_test["test_preparation_course"].replace("none",0)
x_test["test_preparation_course"] = x_test["test_preparation_course"].replace("completed",1)

In [185]:
name = cat_cols[-1]
cat_cols.remove(name)
con_cols.append(name)

In [186]:
cat_cols

['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch']

In [187]:
con_cols

['reading_score', 'writing_score', 'test_preparation_course']

In [188]:
x_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
29,female,group D,master's degree,standard,0,0.030791,0.434053
535,female,group C,bachelor's degree,free/reduced,1,0.930290,0.964701
695,female,group D,some college,free/reduced,0,1.345443,1.163694
557,male,group C,master's degree,free/reduced,0,-0.176786,-0.162925
836,male,group E,high school,standard,0,-0.384363,-0.759904
...,...,...,...,...,...,...,...
106,female,group D,master's degree,standard,0,2.106557,2.092328
270,male,group C,bachelor's degree,standard,0,-0.453555,-0.494580
860,female,group C,associate's degree,standard,0,-0.522747,-1.025228
435,male,group C,some college,free/reduced,1,-1.491438,-1.025228


In [189]:
oe_train = pd.DataFrame(oe.fit_transform(x_train[cat_cols]).toarray(), index = x_train[cat_cols].index)
oe_test = pd.DataFrame(oe.transform(x_test[cat_cols]).toarray(),index=x_test[cat_cols].index)

In [190]:
oe_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
29,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
535,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
695,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
557,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
836,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
270,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
860,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
435,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [191]:
oe_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
521,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
740,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
660,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
411,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
332,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
208,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
613,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [192]:
train_final = pd.concat([x_train[con_cols],oe_train],axis=1)
test_final = pd.concat([x_test[con_cols],oe_test],axis = 1)

In [193]:
train_final

Unnamed: 0,reading_score,writing_score,test_preparation_course,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
29,0.030791,0.434053,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
535,0.930290,0.964701,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
695,1.345443,1.163694,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
557,-0.176786,-0.162925,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
836,-0.384363,-0.759904,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,2.106557,2.092328,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
270,-0.453555,-0.494580,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
860,-0.522747,-1.025228,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
435,-1.491438,-1.025228,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [194]:
test_final

Unnamed: 0,reading_score,writing_score,test_preparation_course,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
521,1.137866,1.031032,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737,-0.245978,0.301391,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
740,0.238367,0.235060,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
660,0.515136,0.301391,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
411,0.930290,0.633046,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,-0.868708,-0.826235,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
332,-0.937901,-1.025228,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
208,0.791905,0.500384,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
613,0.515136,0.367722,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [195]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=42)

In [196]:
train_final.columns = train_final.columns.astype(str)

In [197]:
rfr.fit(train_final,y_train)

  return fit_method(estimator, *args, **kwargs)


In [198]:
test_final.columns  = test_final.columns.astype(str)

In [199]:
y_pred = rfr.predict(test_final)


In [200]:
y_pred.shape

(200,)

In [201]:
y_test.shape

(200, 1)

In [202]:
from sklearn.metrics import r2_score

r2_score(y_test,y_pred)

0.8493695364984172