# Univariate Selection 
## using the chi-squared (chi²) statistical test 
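The scikit-learn library provides the `SelectKBest` class, which keeps the k features that score highest under a chosen scoring function (here, the chi² test).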

chi-squared (chi²) : https://en.wikipedia.org/wiki/Chi-squared_test

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

## data dictionary
source: https://www.kaggle.com/uciml/pima-indians-diabetes-database  
* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Class variable (0 or 1); 268 of the 768 rows are 1, the others are 0

In [None]:
# Download the diabetes CSV and load it into a DataFrame.
url = "https://github.com/mathawanup/basic_dataset/raw/master/diabetes.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head()  # preview the first rows

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

## sklearn: SelectKBest

In [None]:
# List the column labels so they can be copied into the feature list below.
df.columns

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The eight predictor columns from the data dictionary; 'Outcome' is the target.
cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = df[cols]
y = df['Outcome']

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
test_size = .3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    random_state=7,
)


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Configure SelectKBest with the chi² score function.
# NOTE: k=8 equals the number of columns in `cols`, so no feature is dropped;
# in the cells shown here the selector is only used to read a chi² score
# per feature (via `scores_`), not to transform the data.
model = SelectKBest(score_func=chi2,k=8)

In [None]:
# Fit the selector on the training split only. Scoring the features on the
# full dataset would let the held-out test rows influence which features are
# chosen (data leakage) and make the later test-set accuracy optimistic.
# X_train / y_train come from the train_test_split cell earlier in the notebook.
model = model.fit(X_train, y_train)

In [None]:
# Wrap the feature names and their chi² scores in single-column DataFrames
# so they can be joined side by side in the next step.
columns = pd.DataFrame(data=X.columns)
scores = pd.DataFrame(data=model.scores_)

In [None]:
# Join the feature names and the scores column-wise into one table.
featureScores = pd.concat(objs=[columns, scores], axis='columns')

In [None]:
# Replace the default integer column labels with readable names.
featureScores.columns = ['feature','Score']  # name the dataframe columns


In [None]:
print(featureScores.nlargest(4,'Score'))  # print the 4 highest-scoring features

In [None]:
# Reduced feature set, hard-coded here.
# NOTE(review): presumably the four top-ranked features from the chi² table
# printed above -- verify against that output if the data or split changes.
cols_sel = ['Glucose', 'Insulin', 'BMI', 'Age']
X = df[cols_sel]
y = df['Outcome']

# Same split parameters as before (30% test, stratified, seed 7), so the
# selected-feature model sees the same rows as the all-feature model.
test_size = .3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    random_state=7,
)

In [None]:
# Preview the training rows to confirm only the selected columns are present.
X_train.head()

In [None]:
# Preview the held-out test rows for the selected columns.
X_test.head()

### run model with selected features

In [None]:
# Train a decision tree on the selected features only.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features and breaks ties between equally good splits at random;
# without a seed the test accuracy changes from run to run.
model_sel = DecisionTreeClassifier(random_state=7)
model_sel.fit(X_train, y_train)

In [None]:
# Score the selected-features tree on the held-out test split
# (for scikit-learn classifiers, `score` is the mean accuracy).
score_sel=model_sel.score(X_test, y_test)
score_sel

### run model with all features

In [None]:
# Switch back to the full predictor list defined earlier in `cols`.
X = df[cols]
y = df['Outcome']

# Identical split parameters (30% test, stratified, seed 7) so this model is
# evaluated on the same rows as the selected-feature model.
test_size = .3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    random_state=7,
)

In [None]:
# Preview the training rows to confirm all predictor columns are back.
X_train.head()

In [None]:
# Train a decision tree on all features as the baseline for comparison.
# NOTE: this rebinds `model`, which previously held the fitted SelectKBest.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features and breaks ties between equally good splits at random;
# without a seed the test accuracy changes from run to run.
model = DecisionTreeClassifier(random_state=7)
model.fit(X_train, y_train)

In [None]:
# Score the all-features tree on the held-out test split
# (for scikit-learn classifiers, `score` is the mean accuracy).
score=model.score(X_test, y_test)
score

---