# Steps for Univariate Feature Selection

<div  style="color:blue;font-family:Candara,arial,helvetica;line-height:20px"><strong>

## Step 1. Get all independent features
## Step 2. Apply the relevant statistical method
## Step 3. Get the p-value and compare it with the significance level (α)
## Step 4. Select the feature if p < α

</strong></div>    

# F-Test

<div  style="color:blue;font-family:Candara,arial,helvetica;line-height:20px"><strong>

## An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. It is most often used when comparing statistical models that have been fitted to a data set, in order to identify the model that best fits the population from which the data were sampled.

<img src="https://slideplayer.com/slide/781157/3/images/18/F-tests+The+test+for+joint+significance+has+its+own+formula%2C+which+takes+the+following+form%3A.jpg" alt="drawing" width="600" height="300"/>  

</strong></div> 

## Linear Regression without taking the F-test into consideration

In [1]:
# Baseline: fit a linear regression on every available feature and report
# its R^2 score and RMSE on the held-out split.
import math

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data; every column but the last is an independent feature (X),
# the last column is the dependent variable (Y)
f = pd.read_csv('Students2.csv')
x, y = f.iloc[:, :-1], f.iloc[:, -1]

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.4, random_state=1234
)

# Fit on the training rows, then predict the held-out rows
lr = LinearRegression()
lr.fit(X_train, Y_train)
y_predict = lr.predict(X_test)

# Score of the fitted model on the test split and the RMSE of its predictions
org_score = lr.score(X_test, Y_test)
org_rmse = math.sqrt(mean_squared_error(Y_test, y_predict))

print(org_score, org_rmse, sep="\n")

0.842748598241573
6.982206715357427


## Perform F-Test

In [2]:
# Run the univariate F-test of every feature against the target.
# NOTE(review): the test is computed on the full x / y, not only on the
# training split created earlier.
from sklearn.feature_selection import f_regression as fr

# The result is a pair: F-scores first, p-values second
result = fr(x, y)
f_score, p_values = result

# Column names, in the same order as the scores
columns = list(x.columns)

# Three spacer lines ahead of the table
print(" ", " ", " ", sep="\n")

# Table header
print("    Features     ", "F-Score    ", "P-Values")
print("    -----------  ---------    ---------")

# One row per feature: name, F-score (2 decimals), p-value (6 decimals)
for name, f_val, p_val in zip(columns, f_score, p_values):
    f1 = "%4.2f" % f_val
    p1 = "%2.6f" % p_val
    print("    ", name.ljust(12), f1.rjust(8), "    ", p1.rjust(8))

 
 
 
    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086


## Perform linear regression with reduced features

In [3]:
# Refit the regression using only the 'Hours' and 'sHours' columns
reduced_cols = ['Hours', 'sHours']
X_train_n = X_train[reduced_cols]
X_test_n = X_test[reduced_cols]

# fit() returns the estimator itself, so construction and fitting can be chained
lr1 = LinearRegression().fit(X_train_n, Y_train)
y_predict_n = lr1.predict(X_test_n)

# Score and RMSE of the reduced model on the same test split
new_score = lr1.score(X_test_n, Y_test)
new_rmse = math.sqrt(mean_squared_error(Y_test, y_predict_n))

print(new_score, new_rmse, sep="\n")

0.9161939417709495
5.09721728108113


# Univariate Feature Selection

## Import libraries and split into features (X) and target (Y)

In [4]:
# ----------------------------------------------------------------
# Set-up for the univariate feature selection (Select*) transforms
# ----------------------------------------------------------------

import pandas as pd

# Selector transforms together with the f_regression scoring function
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    SelectKBest,
    SelectPercentile,
    f_regression,
)

# Load the data; every column but the last goes to X, the last column to Y
f = pd.read_csv('Students2.csv')
X, Y = f.iloc[:, :-1], f.iloc[:, -1]

## KBest and print columns

In [5]:
# SelectKBest: keep the k=3 columns ranked best by f_regression
selectorK = SelectKBest(score_func=f_regression, k=3)
x_k = selectorK.fit_transform(X, Y)

# The fitted selector exposes an F-score and a p-value for every input column
# (the table below lists all of them, not only the columns that were kept)
f_score = selectorK.scores_
p_values = selectorK.pvalues_

# Column names, in the same order as the scores
columns = list(X.columns)

# Three spacer lines ahead of the table
print(" ", " ", " ", sep="\n")

# Table header
print("    Features     ", "F-Score    ", "P-Values")
print("    -----------  ---------    ---------")

# One row per feature: name, F-score (2 decimals), p-value (6 decimals)
for name, f_val, p_val in zip(columns, f_score, p_values):
    f1 = "%4.2f" % f_val
    p1 = "%2.6f" % p_val
    print("    ", name.ljust(12), f1.rjust(8), "    ", p1.rjust(8))

# Translate the positions of the kept columns back into column names
cols = selectorK.get_support(indices=True)
selectedCols = X.columns[cols].tolist()

print(selectedCols)
display(x_k)

 
 
 
    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086
['Hours', 'sHours', 'calories']


array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]], dtype=int64)

## Implement SelectPercentile

In [6]:
# SelectPercentile: keep the top 50 percent of columns ranked by f_regression
selectorP = SelectPercentile(score_func=f_regression, percentile=50)

# Fit the selector on X / Y, then reduce X to the retained columns
x_p = selectorP.fit(X, Y).transform(X)

display(x_p)

array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]], dtype=int64)

## Implement GenericUnivariateSelect with k_best

In [7]:
# GenericUnivariateSelect configured in 'k_best' mode with param=3
selectorG1 = GenericUnivariateSelect(
    score_func=f_regression, mode='k_best', param=3
)

# Fit the selector on X / Y, then reduce X to the retained columns
x_g1 = selectorG1.fit(X, Y).transform(X)

display(x_g1)

array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]], dtype=int64)

In [8]:
# Implement GenericUnivariateSelect with percentile

In [9]:
# GenericUnivariateSelect configured in 'percentile' mode with param=50
selectorG2 = GenericUnivariateSelect(
    score_func=f_regression, mode='percentile', param=50
)

# Fit the selector on X / Y, then reduce X to the retained columns
x_g2 = selectorG2.fit(X, Y).transform(X)

display(x_g2)

array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]], dtype=int64)