In [15]:
import numpy as np
import pandas as pd
from pathlib import Path

# Project root 3 levels up
project_root = Path.cwd().parents[2]


### Step-1: Loading dataset

In [5]:
data = pd.read_csv(project_root / "datasets/02_dimensionality_reduction/mobile.csv")
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

### Step-2: Normalisation

In [14]:
mu = X.mean(axis=0)
standard_deviation = X.std(axis=0)
X = (X-mu)/standard_deviation
X

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,-0.902372,-0.989802,0.830572,-1.018929,-0.762304,-1.043705,-1.380298,0.340654,1.348911,-1.101696,-1.305424,-1.408596,-1.146497,0.391605,-0.784787,0.283032,1.462128,-1.786414,-1.005767,0.985850
1,-0.495015,1.009798,-1.252751,0.980932,-0.992642,0.957646,1.154735,0.687376,-0.120029,-0.664602,-0.645827,0.585631,1.704039,0.467200,1.113987,-0.635158,-0.734084,0.559501,0.993769,-1.013846
2,-1.537302,1.009798,-1.252751,0.980932,-0.531966,0.957646,0.493422,1.380820,0.134210,0.209587,-0.645827,1.392336,1.074699,0.441387,-0.310094,-0.864705,-0.368048,0.559501,0.993769,-1.013846
3,-1.418964,1.009798,1.198217,-1.018929,-0.992642,-1.043705,-1.214970,1.034098,-0.261274,0.646681,-0.151130,1.286428,1.236662,0.594421,0.876640,0.512579,-0.002013,0.559501,-1.005767,-1.013846
4,1.325574,1.009798,-0.394912,-1.018929,2.001753,0.957646,0.658751,0.340654,0.021215,-1.101696,0.673365,1.268401,-0.091429,-0.657502,-1.022134,-0.864705,0.730057,0.559501,0.993769,-1.013846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1.011607,1.009798,-1.252751,0.980932,-0.992642,0.957646,-1.655845,1.034098,-0.967495,0.646681,0.673365,1.299948,1.477291,-1.342463,0.164600,-0.405610,1.462128,0.559501,0.993769,-1.013846
1996,1.653280,1.009798,1.320765,0.980932,-0.992642,-1.043705,0.383203,-1.046233,1.320663,-0.227507,-1.140525,0.608165,1.650822,-0.085010,-0.310094,0.971674,0.913075,0.559501,0.993769,0.985850
1997,1.530391,-0.989802,-0.762557,0.980932,-0.762304,0.957646,0.217875,0.687376,-0.910998,1.520869,-1.140525,0.502257,0.880345,0.859924,-0.784787,-1.094253,-1.100119,0.559501,0.993769,-1.013846
1998,0.622372,-0.989802,-0.762557,-1.018929,-0.071290,0.957646,0.768969,-1.392955,0.134210,0.209587,-0.810726,-0.696533,-1.345480,-1.157164,1.351334,0.971674,1.462128,0.559501,0.993769,0.985850


### Step-3: Selecting and extracting top k features from X

In [16]:
from sklearn.feature_selection import SelectKBest

In [23]:
select_k_best = SelectKBest()
select_k_best.fit(X, y)
df_scores = pd.DataFrame(select_k_best.scores_)
df_columns = pd.DataFrame(X.columns)

In [28]:
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ["Features", "Scores"]

In [29]:
feature_scores

Unnamed: 0,Features,Scores
0,battery_power,31.598158
1,blue,0.476768
2,clock_speed,0.493708
3,dual_sim,0.428239
4,fc,0.772182
5,four_g,1.059525
6,int_memory,2.922996
7,m_dep,1.500682
8,mobile_wt,3.594318
9,n_cores,2.625415


In [30]:
feature_scores.sort_values(by="Scores", ascending=False, inplace=True)
feature_scores

Unnamed: 0,Features,Scores
13,ram,3520.110824
0,battery_power,31.598158
12,px_width,22.620882
11,px_height,19.484842
8,mobile_wt,3.594318
6,int_memory,2.922996
9,n_cores,2.625415
14,sc_h,2.225984
15,sc_w,1.671
16,talk_time,1.628811


In [33]:
top_10_features = list(feature_scores[:10]["Features"].values)
print(top_10_features)

['ram', 'battery_power', 'px_width', 'px_height', 'mobile_wt', 'int_memory', 'n_cores', 'sc_h', 'sc_w', 'talk_time']


### Step-4: Model comparison

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#### Training

In [37]:
model = LogisticRegression()
model.fit(X,y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


#### Cross validation score of all features

In [38]:
scores = cross_val_score(model, X, y, cv=10)
scores.mean()

np.float64(0.9620000000000001)

#### Cross validation score of top 10 

In [39]:
scores_top_10 = cross_val_score(model, X[top_10_features], y, cv=10)
scores_top_10.mean()

np.float64(0.9710000000000001)