In [8]:
import pandas as pd
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif

def logReg(data, target, features):
    """Fit a logistic regression on selected columns and print test metrics.

    The rows of ``data`` are split 75/25 into train and test sets with a
    fixed ``random_state`` of 123, a default ``LogisticRegression`` is fitted
    on the training part, and precision, recall and F1 of the predictions on
    the test part are printed.

    Args:
        data: DataFrame holding both the predictor columns and the target.
        target: Name of the column in ``data`` to predict.
        features: Column names to use as predictors.

    Returns:
        None. The metrics are only printed.

    Warns:
        UserWarning: If any name in ``features`` is not a column of ``data``.
            Such names are ignored by ``DataFrame.filter``, so the model is
            fitted on the remaining columns only.
    """
    import warnings

    # DataFrame.filter(items=...) drops unknown labels without complaint, so
    # a misspelt feature name would otherwise silently shrink the model.
    missing = [name for name in features if name not in data.columns]
    if missing:
        warnings.warn(
            f"logReg: ignoring features not present in data: {missing}",
            stacklevel=2,
        )

    x = data.filter(items=features)
    y = data[target].copy()
    xTrain, xTest, yTrain, yTest = train_test_split(
        x, y, test_size=0.25, random_state=123
    )

    model = LogisticRegression()
    model.fit(xTrain, yTrain)
    # The scorers compare the two sequences positionally, so the predictions
    # can be scored directly against yTest without re-indexing either side.
    yPred = model.predict(xTest)

    print("Precision:", precision_score(yTest, yPred))
    print("Recall:", recall_score(yTest, yPred))
    print("F1:", f1_score(yTest, yPred))

# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names=`.
df = pd.read_csv(
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
    names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
        "Outcome",
    ],
)
print(df)
print("--------")
print(df.shape)
print("--------")

# Convert to a NumPy array and separate it into x (the first eight columns)
# and y (the ninth column, Outcome).
vArray = df.values
x, y = vArray[:, 0:8], vArray[:, 8]

# Predictor names, in the same order as the columns of x.
features = pd.Series(df.columns.drop("Outcome"))

# Feature extraction with a univariate selection method: for each k, keep the
# k best-scoring predictors under f_classif and print their names.
for kVal in (3, 4, 5):
    select = SelectKBest(score_func=f_classif, k=kVal)
    z = select.fit_transform(x, y)
    cols = select.get_support()  # boolean mask over the columns of x
    print("K=", kVal, " Features: ", features[cols].values)
    print("--------")

# Features reported by the SelectKBest loop:
#k=3 --> ['Glucose' 'BMI' 'Age']
#k=4 --> ['Pregnancies' 'Glucose' 'BMI' 'Age']
#k=5 --> ['Pregnancies' 'Glucose' 'BMI' 'DiabetesPedigreeFunction' 'Age']

#Case 1: for each k, fit on two features taken from the selection listed above.
print("\nCase 1 k=3")
logReg(df, "Outcome", ['Glucose','BMI']) #k=3
print("--------")
print("Case 1 k=4")
logReg(df, "Outcome", ['Glucose','Pregnancies']) #k=4
print("--------")
print("Case 1 k=5")
logReg(df, "Outcome", ['DiabetesPedigreeFunction','Age']) #k=5
print("--------\n")

#Case 2: for each k, fit on two features that are NOT in the selection listed above.
print("Case 2 k=3")
# The column is named 'DiabetesPedigreeFunction'; the shorter 'DiabetesPedigree'
# does not exist in df and would be silently dropped by logReg's column
# filter, leaving a model fitted on 'Pregnancies' alone.
logReg(df, "Outcome", ['DiabetesPedigreeFunction','Pregnancies']) #k=3
print("--------")
print("Case 2 k=4")
logReg(df, "Outcome", ['Insulin','DiabetesPedigreeFunction']) #k=4
print("--------")
print("Case 2 k=5")
logReg(df, "Outcome", ['Insulin','SkinThickness']) #k=5

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  