# Feature Selection Code

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv("Datasets/train.csv")

In [3]:
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [5]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [6]:
X.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [7]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [8]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [9]:
X[:, :5]

array([[-0.90259726, -0.9900495 ,  0.83077942, -1.01918398, -0.76249466],
       [-0.49513857,  1.0100505 , -1.2530642 ,  0.98117712, -0.99289039],
       [-1.5376865 ,  1.0100505 , -1.2530642 ,  0.98117712, -0.53209893],
       ...,
       [ 1.53077336, -0.9900495 , -0.76274805,  0.98117712, -0.76249466],
       [ 0.62252745, -0.9900495 , -0.76274805, -1.01918398, -0.07130748],
       [-1.65833069,  1.0100505 ,  0.58562134,  0.98117712,  0.15908825]])

In [10]:
X = pd.DataFrame(X, columns=data.iloc[:, :-1].columns)
X.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,-0.902597,-0.99005,0.830779,-1.019184,-0.762495,-1.043966,-1.380644,0.34074,1.349249,-1.101971,-1.30575,-1.408949,-1.146784,0.391703,-0.784983,0.283103,1.462493,-1.786861,-1.006018,0.986097
1,-0.495139,1.010051,-1.253064,0.981177,-0.99289,0.957886,1.155024,0.687548,-0.120059,-0.664768,-0.645989,0.585778,1.704465,0.467317,1.114266,-0.635317,-0.734267,0.559641,0.994018,-1.014099
2,-1.537686,1.010051,-1.253064,0.981177,-0.532099,0.957886,0.493546,1.381165,0.134244,0.209639,-0.645989,1.392684,1.074968,0.441498,-0.310171,-0.864922,-0.36814,0.559641,0.994018,-1.014099
3,-1.419319,1.010051,1.198517,-1.019184,-0.99289,-1.043966,-1.215274,1.034357,-0.261339,0.646842,-0.151168,1.28675,1.236971,0.594569,0.876859,0.512708,-0.002014,0.559641,-1.006018,-1.014099
4,1.325906,1.010051,-0.395011,-1.019184,2.002254,0.957886,0.658915,0.34074,0.02122,-1.101971,0.673534,1.268718,-0.091452,-0.657666,-1.022389,-0.864922,0.73024,0.559641,0.994018,-1.014099


In [11]:
best_features = SelectKBest()

fitted = best_features.fit(X, y)
fitted.scores_

array([3.15981575e+01, 4.76767709e-01, 4.93707801e-01, 4.28239286e-01,
       7.72181960e-01, 1.05952453e+00, 2.92299608e+00, 1.50068244e+00,
       3.59431819e+00, 2.62541515e+00, 8.25446358e-01, 1.94848418e+01,
       2.26208825e+01, 3.52011082e+03, 2.22598374e+00, 1.67099983e+00,
       1.62881131e+00, 4.57319750e-01, 1.29330223e+00, 2.84940470e-01])

In [12]:
feature_scores = pd.DataFrame(fitted.scores_, index=data.iloc[:, :-1].columns).reset_index().rename(columns = {'index': 'features',
                                                                                                      0: 'score'})
feature_scores

Unnamed: 0,features,score
0,battery_power,31.598158
1,blue,0.476768
2,clock_speed,0.493708
3,dual_sim,0.428239
4,fc,0.772182
5,four_g,1.059525
6,int_memory,2.922996
7,m_dep,1.500682
8,mobile_wt,3.594318
9,n_cores,2.625415


In [13]:
feature_scores = feature_scores.sort_values(by='score', ascending=False)
feature_scores

Unnamed: 0,features,score
13,ram,3520.110824
0,battery_power,31.598158
12,px_width,22.620882
11,px_height,19.484842
8,mobile_wt,3.594318
6,int_memory,2.922996
9,n_cores,2.625415
14,sc_h,2.225984
15,sc_w,1.671
16,talk_time,1.628811


In [14]:
top_10 = feature_scores['features'].head(10).to_list()
print(top_10)

['ram', 'battery_power', 'px_width', 'px_height', 'mobile_wt', 'int_memory', 'n_cores', 'sc_h', 'sc_w', 'talk_time']


## Model Comparison

In [15]:
model = LogisticRegression()

In [16]:
model.fit(X,y)

In [17]:
scores = cross_val_score(model, X,y,cv = 10)

In [18]:
scores

array([0.955, 0.98 , 0.95 , 0.96 , 0.975, 0.95 , 0.965, 0.955, 0.97 ,
       0.96 ])

In [19]:
scores.mean()

0.9620000000000001

In [20]:
X[top_10]

Unnamed: 0,ram,battery_power,px_width,px_height,mobile_wt,int_memory,n_cores,sc_h,sc_w,talk_time
0,0.391703,-0.902597,-1.146784,-1.408949,1.349249,-1.380644,-1.101971,-0.784983,0.283103,1.462493
1,0.467317,-0.495139,1.704465,0.585778,-0.120059,1.155024,-0.664768,1.114266,-0.635317,-0.734267
2,0.441498,-1.537686,1.074968,1.392684,0.134244,0.493546,0.209639,-0.310171,-0.864922,-0.368140
3,0.594569,-1.419319,1.236971,1.286750,-0.261339,-1.215274,0.646842,0.876859,0.512708,-0.002014
4,-0.657666,1.325906,-0.091452,1.268718,0.021220,0.658915,-1.101971,-1.022389,-0.864922,0.730240
...,...,...,...,...,...,...,...,...,...,...
1995,-1.342799,-1.011860,1.477661,1.300273,-0.967737,-1.656260,0.646842,0.164641,-0.405712,1.462493
1996,-0.085031,1.653694,1.651235,0.608317,1.320993,0.383299,-0.227564,-0.310171,0.971917,0.913303
1997,0.860139,1.530773,0.880565,0.502383,-0.911225,0.217930,1.521249,-0.784983,-1.094526,-1.100394
1998,-1.157454,0.622527,-1.345816,-0.696707,0.134244,0.769162,0.209639,1.351672,0.971917,1.462493


In [21]:
scores = cross_val_score(model, X[top_10], y,cv = 10)

In [22]:
scores

array([0.97 , 0.98 , 0.965, 0.955, 0.995, 0.94 , 0.985, 0.975, 0.98 ,
       0.965])

In [23]:
scores.mean()

0.9710000000000001