## Feature selection

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
import numpy as np

# Number of decimals shown when NumPy arrays are printed (keeps printed
# score vectors compact).
PRINT_PRECISION = 2
np.set_printoptions(precision=PRINT_PRECISION)

In [4]:
# Location of the diabetes CSV; kept in one place so it is easy to swap
# for a local copy.
DATA_URL = 'https://raw.githubusercontent.com/digipodium/Datasets/main/diabetes.csv'

# Read the dataset straight from the URL and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Shorten the long column name for readability.
# The result of rename() is rebound instead of using inplace=True: inplace
# offers no performance benefit and mutates a frame that an earlier cell has
# already displayed.
df = df.rename(columns={'DiabetesPedigreeFunction': 'pedigree'})
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Split the frame into the feature matrix X and the target vector y.
# The target is removed by name rather than by position, so X can never
# silently include 'Outcome' (or lose a real feature) if the CSV's column
# order changes.
X = df.drop(columns='Outcome')
y = df['Outcome']
print(X.shape, y.shape)

(768, 8) (768,)


In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
# Filter method: score every column of X against y with the chi-squared
# statistic and keep the five highest-scoring columns.
# NOTE(review): the selector is fitted on all rows, i.e. before any
# train/test split is made -- confirm this is acceptable for the evaluation.
featSelector = SelectKBest(score_func=chi2, k=5)
featSelector.fit(X, y)

# First the per-column scores, then the names of the columns that were kept.
for report in (featSelector.scores_, featSelector.get_feature_names_out()):
    print(report)

[ 111.52 1411.89   17.61   53.11 2175.57  127.67    5.39  181.3 ]
['Pregnancies' 'Glucose' 'Insulin' 'BMI' 'Age']


In [9]:
# Evaluate a k-NN classifier on the columns kept by the chi-squared selector.
features = featSelector.transform(X)

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the held-out rows leak into the inputs
# the model is trained on and the test score is optimistic.
xtrain_raw, xtest_raw, ytrain, ytest = train_test_split(
    features, y, test_size=.2, random_state=1
)
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain_raw)  # fit on training rows only
xtest = scaler.transform(xtest_raw)        # reuse the training statistics

# Whole feature matrix scaled with the train-only statistics; the name is
# kept so any code that still reads scaledX continues to work.
scaledX = scaler.transform(features)

# k-NN with scikit-learn's default hyper-parameters.
m = KNeighborsClassifier()
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)

# Report performance on the held-out rows.
cm = confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest, ypred))

[[80 19]
 [21 34]]
              precision    recall  f1-score   support

           0       0.79      0.81      0.80        99
           1       0.64      0.62      0.63        55

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154



### Wrapper method (recursive feature elimination)

In [10]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [11]:
# Wrapper method: recursive feature elimination (RFE) driven by a
# logistic-regression base estimator. The number of features to keep is
# left at RFE's default.
clf = LogisticRegression(solver='liblinear')
rfe = RFE(estimator=clf)

# Fit on the full feature matrix; as the cell's last expression, the fitted
# selector is also displayed.
rfe.fit(X, y)

RFE(estimator=LogisticRegression(solver='liblinear'))

In [12]:
# Display the names of the columns that the fitted RFE selector kept.
rfe.get_feature_names_out()

array(['Pregnancies', 'Glucose', 'BMI', 'pedigree'], dtype=object)

In [13]:
# Evaluate a k-NN classifier on the columns kept by the RFE selector.
# Indexing X by name keeps `features` a DataFrame, so the scaler fitted
# below is given the column names.
features = X[rfe.get_feature_names_out()]

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the held-out rows leak into the inputs
# the model is trained on and the test score is optimistic.
xtrain_raw, xtest_raw, ytrain, ytest = train_test_split(
    features, y, test_size=.2, random_state=1
)
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain_raw)  # fit on training rows only
xtest = scaler.transform(xtest_raw)        # reuse the training statistics

# Whole feature matrix scaled with the train-only statistics; the name is
# kept so any code that still reads scaledX continues to work.
scaledX = scaler.transform(features)

# k-NN with scikit-learn's default hyper-parameters.
m = KNeighborsClassifier()
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)

# Report performance on the held-out rows.
cm = confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest, ypred))

[[85 14]
 [23 32]]
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        99
           1       0.70      0.58      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.75       154



In [14]:
from joblib import dump

In [15]:
# Bundle everything needed to serve predictions into a single artifact.
model_dict = {
    'title': 'Diabetes Prediction model',
    'classifier': m,
    'scaler': scaler,
    # Names of the RFE-selected columns, in order, so that a consumer of the
    # file knows which inputs to supply to the scaler and classifier.
    'features': rfe.get_feature_names_out().tolist(),
}

# Write the bundle to disk; as the cell's last expression, dump()'s return
# value (the list of written file names) is displayed.
dump(model_dict, "fs_ap.pkl")

['fs_ap.pkl']