# Feature selection

In [None]:
# Free-form note kept as a bare string literal: why feature selection is needed.
# The literal must be closed with exactly three quotes; a fourth quote starts a
# new, unterminated string and makes the cell fail with a SyntaxError.
"""
f1 f2 f3 f4 ..........f50
15 relevant features u have to select from total 50 features
Feature selection is used to do this task.
bcz it will select only relevant features
whereas irrelevant , noisy features will not be selected.
"""

In [None]:
# Feature selection
# With feature selection we keep only the most important features.
# For supervised learning algorithms the main approaches are:
    # filter methods
    # wrapper methods
    # feature importance methods

### https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection

In [None]:
# Why is feature selection important?
# Ans: https://towardsdatascience.com/feature-selection-with-pandas-e3690ad8504b

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the mobile-phone dataset into `df`.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab mount) — confirm it exists
# in the environment where the notebook is run.
df = pd.read_csv('/content/mobile_data.csv')

In [4]:
df.head(2)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2


In [5]:
df.shape

(2000, 21)

In [6]:
df.price_range.unique()

array([1, 2, 3, 0])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [8]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [9]:
# Input segment: every column except the last one (the last column is the target, price_range).
x = df.iloc[:, :-1]

In [10]:
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [11]:
# Output segment: the price class of each phone; preview the first three values.
y = df.price_range
y.head(3)

0    1
1    2
2    2
Name: price_range, dtype: int64

In [12]:
# check data is balanced or not
y.value_counts()

1    500
2    500
3    500
0    500
Name: price_range, dtype: int64

## Create an ML model to predict the price range of a mobile phone on the basis of the input features

# information gain

In [13]:
from sklearn.feature_selection import mutual_info_classif

In [14]:
# Estimate the mutual information between each input column and the target.
# The estimator is randomized, so the seed is fixed to get the same scores
# after every Restart & Run All.
score = mutual_info_classif(x, y, random_state=42)
# `score` is an array with one mutual-information value per column of x.

In [15]:
score

array([2.40999337e-02, 6.86603179e-05, 0.00000000e+00, 2.38241464e-02,
       0.00000000e+00, 0.00000000e+00, 6.17162918e-03, 0.00000000e+00,
       2.62290006e-03, 1.28011305e-03, 0.00000000e+00, 2.98376893e-02,
       3.01367604e-02, 8.47148807e-01, 1.82406027e-02, 0.00000000e+00,
       0.00000000e+00, 1.95031378e-02, 0.00000000e+00, 0.00000000e+00])

In [16]:
len(score) # Number of features means len of x.columns (inputs)

20

In [None]:
score # score is  a mutual_info_score for 20 Features

In [17]:
# Wrap the mutual-information scores in a one-column table named 'scores'.
imp_fea = pd.DataFrame({'scores': score})

In [18]:
imp_fea

Unnamed: 0,scores
0,0.0241
1,6.9e-05
2,0.0
3,0.023824
4,0.0
5,0.0
6,0.006172
7,0.0
8,0.002623
9,0.00128


In [19]:
# Feature names used to label the score tables.
# Take them from x.columns (the 20 inputs), not df.columns: df also holds the
# target `price_range`, which has no score, so pairing 21 names with 20 scores
# would add a spurious row whose score is NaN.
df_columns = pd.DataFrame(x.columns, columns=['features'])
df_columns

Unnamed: 0,features
0,battery_power
1,blue
2,clock_speed
3,dual_sim
4,fc
5,four_g
6,int_memory
7,m_dep
8,mobile_wt
9,n_cores


In [20]:
# Put the score column and the feature-name column side by side, matched on the row index.
new_feat = pd.concat([imp_fea, df_columns], axis='columns')

In [21]:
new_feat

Unnamed: 0,scores,features
0,0.0241,battery_power
1,6.9e-05,blue
2,0.0,clock_speed
3,0.023824,dual_sim
4,0.0,fc
5,0.0,four_g
6,0.006172,int_memory
7,0.0,m_dep
8,0.002623,mobile_wt
9,0.00128,n_cores


In [22]:
# Keep the 10 best of the 20 features: `nlargest` returns the rows with the
# highest values in the named column, in descending order.
new_feat = new_feat.nlargest(n=10, columns='scores')

In [23]:
new_feat

Unnamed: 0,scores,features
13,0.847149,ram
12,0.030137,px_width
11,0.029838,px_height
0,0.0241,battery_power
3,0.023824,dual_sim
17,0.019503,three_g
14,0.018241,sc_h
6,0.006172,int_memory
8,0.002623,mobile_wt
9,0.00128,n_cores


In [None]:
# Horizontal bar chart of the selected features against their scores.
new_feat.plot.barh(x='features', y='scores')

# select k best

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2

In [None]:
x.shape

In [None]:
# Score every input column against y with the chi-squared statistic and keep the best 10.
chi2_selector = SelectKBest(score_func=chi2, k=10)
X_new = chi2_selector.fit_transform(x, y)

In [None]:
X_new.shape

In [None]:
pd.DataFrame(X_new)

In [None]:
# Selector that keeps the 10 best columns, scored with f_classif.
feat_imp = SelectKBest(f_classif, k=10)

In [None]:
feat_imp.fit(x,y)

In [None]:
feat_imp.n_features_in_

In [None]:
feat_imp.scores_

In [None]:
# One-column table holding the scores computed by the fitted selector.
df_imp = pd.DataFrame({'scores': feat_imp.scores_})

In [None]:
# Attach the feature names to the f_classif scores, matched on the row index.
df1 = pd.concat([df_imp, df_columns], axis='columns')

In [None]:
df1

In [None]:
# The ten rows of df1 with the highest score.
final = df1.nlargest(n=10, columns='scores')
final

In [None]:
final.features.tolist()

In [None]:
# Restrict the inputs to the columns listed in `final` and preview two rows.
top_features = final.features.tolist()
x_new = x.loc[:, top_features]
x_new.head(2)

In [None]:
x.head(2)

In [None]:
x.shape

In [None]:
x_new.info()

# Correlation coefficient

In [None]:
df[:3]

In [None]:
df.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df,
# drawn on a large canvas so the 21x21 grid stays readable.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
# Scatter plot: ram (x axis) against price_range (y axis).
plt.scatter(df['ram'], df['price_range'])

In [None]:
# Scatter plot: battery_power (x axis) against int_memory (y axis).
plt.scatter(df['battery_power'], df['int_memory'])

# feature importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Extra-trees ensemble used only for its feature importances.
# The trees are built with randomness, so the seed is fixed to make the
# importances (and the ranking derived from them) reproducible between runs.
modl = ExtraTreesClassifier(random_state=42)

In [None]:
# Rebuild the inputs (everything but the target) and the target itself.
x = df.drop('price_range', axis=1)
y = df['price_range']

In [None]:
modl.fit(x,y)

In [None]:
score = modl.feature_importances_
score

In [None]:
# Label each importance value with the name of the input column it belongs to.
ranked_features = pd.Series(data=score, index=x.columns)
ranked_features

In [None]:
# Horizontal bar chart of the ten largest importances.
ranked_features.nlargest(10).plot.barh()

# RFE (Recursive Feature Elimination)

In [None]:
from sklearn.feature_selection import RFE

In [None]:
RFE?

In [None]:
from sklearn.tree import DecisionTreeClassifier
# dt is the estimator handed to RFE below.
# The seed is fixed so the fitted tree, and therefore the set of features RFE
# keeps, is the same on every run.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
x.head(2)

In [None]:
# Recursive feature elimination around the decision tree, stopping at 10 features.
rf_selector = RFE(dt, n_features_to_select=10)

In [None]:
rf_selector.fit(x,y)

In [None]:
out = rf_selector.support_
out

In [None]:
rf_selector.ranking_

In [None]:
# Keep only the columns flagged True in the RFE support mask `out`.
kept_columns = x.columns[out]
new_x = x[kept_columns]
new_x

In [None]:
# Let's try RFE with another estimator: a random forest.
from sklearn.ensemble import RandomForestClassifier
# The forest is built with randomness, so the seed is fixed to make the
# features RFE selects reproducible between runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Recursive feature elimination around the random forest, stopping at 8 features.
rf_selector_2 = RFE(rf, n_features_to_select=8)

In [None]:
rf_selector_2.fit(x,y)

In [None]:
sp = rf_selector_2.support_
sp

In [None]:
x.loc[:,sp]

In [None]:
cr = pd.read_csv('mtcars.csv')
cr.head()

In [None]:
# X,y separate
# Drop the first column as before, and additionally make sure the target `mpg`
# (used as y below) is not left among the predictors: keeping it would leak
# the answer into every selector fitted on X. `errors='ignore'` makes this a
# no-op when `mpg` was already the dropped first column.
X = cr.iloc[:, 1:].drop(columns='mpg', errors='ignore')
X[:2]

In [None]:
X.shape

In [None]:
# Regression target: the mpg column (this rebinds `y`, which held price_range before).
y = cr['mpg']
y.head(3)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr =  LinearRegression()

In [None]:
# Recursive feature elimination around the linear regression, stopping at 5 features.
rf_new = RFE(lr, n_features_to_select=5)
rf_new

## https://machinelearningmastery.com/rfe-feature-selection-in-python/

In [None]:
rf_new.fit(X,y)

In [None]:
rf_new.n_features_

In [None]:
rf_new.estimator_

In [None]:
rf_new.support_

In [None]:
rf_new.ranking_

In [None]:
rf_new.fit_transform(X,y)

In [None]:
X.loc[:,rf_new.support_]

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Selector that keeps the 5 best columns, scored with mutual_info_regression.
sk = SelectKBest(mutual_info_regression, k=5)
sk

In [None]:
sk.fit(X,y)

In [None]:
sk.scores_

In [None]:
# Fit the selector again and return only the selected columns of X.
# Note: this rebinds `new_feat`, which earlier held the classification score table.
new_feat = sk.fit_transform(X, y)
new_feat

In [None]:
# Table pairing each selector score with the name of the column of X it belongs to;
# preview the first three rows.
final = pd.DataFrame({'score': sk.scores_, 'features': X.columns})
final.head(3)

In [None]:
# The five rows of `final` with the highest score.
final.nlargest(n=5, columns='score')