In [None]:
from sklearn.datasets import make_classification, make_moons, load_iris, make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
import utils
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# Example: Arbitrary classification

In [None]:
# Generate a synthetic classification dataset with 2 informative features;
# random_state pins the draw so re-running the cell gives the same data.
X, y = make_classification(n_informative=2, random_state=11)
# Print the shape of the generated feature matrix.
print(X.shape)

## Training a simple logistic regression model

In [None]:
# Hold out a test split (fixed seed), fit a default logistic regression on the
# remainder, and report the F1 score of its predictions on the held-out part.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=2)
lr = LogisticRegression().fit(xtrain, ytrain)  # fit() returns the estimator itself
prediction = lr.predict(xtest)
f1_full = f1_score(ytest, prediction)
print("F1 score: %f" % f1_full)

## Reducing dimensionality with PCA

In [None]:
# Fit PCA on the full feature matrix and keep only the first two components.
pca = PCA(n_components=2)
x_red = pca.fit_transform(X)
# Print the shape of the reduced matrix.
print(x_red.shape)

## Training on reduced dimensions

In [None]:
# Repeat the split/fit/score procedure on the 2-D PCA projection, using the
# same split seed (random_state=2) so the score is comparable.
xtrain, xtest, ytrain, ytest = train_test_split(x_red, y, random_state=2)
lr = LogisticRegression().fit(xtrain, ytrain)  # fit() returns the estimator itself
prediction = lr.predict(xtest)
f1_reduced = f1_score(ytest, prediction)
print("F1 score: %f" % f1_reduced)

In [None]:
# Scatter the two principal components against each other, coloured by class
# label, so the separability of the classes in the reduced space is visible.
plt.figure(figsize=(8, 6))
plt.scatter(x_red[:, 0], x_red[:, 1], c=y)
# Label the figure so it can be read on its own when skimming the notebook.
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
# Assigning the last call to `_` keeps the returned matplotlib object out of
# the cell output, matching the other plotting cells in this notebook.
_ = plt.title('PCA projection coloured by class')

## Exercise:
## 1. Plot top 2 principal components of the iris dataset (already provided below)
## 2. Use LinearSVC to train on the full iris dataset and on its PCA projection, then compare the two F1 scores.

In [None]:
# Classifiers for the exercise: the instructions ask for LinearSVC, so it is
# imported here alongside the KNeighborsClassifier that was already provided.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
# Load the iris dataset; `iris` is reused by later cells in the notebook.
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # class labels

In [None]:
# enter code here

## How to select the number of principal components?
### The ratio of explained variance

In [None]:
# Synthetic dataset generated with n_redundant=10 and a fixed seed.
X, y = make_classification(n_redundant=10, random_state=11)
n_features = X.shape[1]
# Keep as many components as there are features so the whole variance
# spectrum is available.
pca = PCA(n_components=n_features).fit(X)  # fit() returns the estimator itself
# Running total of the per-component explained-variance ratios.
varRatio = pca.explained_variance_ratio_.cumsum()
# 1-based component numbers for the x axis.
component_numbers = np.arange(1, n_features + 1)
plt.figure(figsize=(8, 6))
plt.plot(component_numbers, varRatio)
_ = plt.xticks(component_numbers)

### ≈ "Rank" of the matrix

In [None]:
# Numerical rank of X, displayed as the cell output for comparison with the
# cumulative explained-variance curve plotted in the previous cell.
np.linalg.matrix_rank(X)

## Exercise: Find the rank of the following matrix, and find the number of informative principal components

In [None]:
# Load the exercise matrix from the local `utils` helper module.
# NOTE(review): the shape and structure of the returned data are not visible
# from this notebook — inspect X.shape before working with it.
X = utils.make_varratio_exercise()

In [None]:
# enter code here

# Univariate Feature Selection
## Removing features with Low Variance

In [None]:
# Draw the 1000 x 50 standard-normal matrix from a dedicated, seeded generator
# so the variance histogram and the set of selected features are identical on
# every run, without touching NumPy's global random state.
rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 50))

In [None]:
# Per-feature variance (computed down the rows), shown as a histogram.
feature_variances = X.var(axis=0)
plt.figure(figsize=(8, 6))
_ = plt.hist(feature_variances)

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Selector configured with a variance threshold of 1.0; low-variance features
# are dropped when it is applied to the data.
vt = VarianceThreshold(threshold=1.0)

In [None]:
# Apply the variance threshold, then plot the variance distribution of the
# columns that remain.
xThresh = vt.fit_transform(X)
kept_variances = xThresh.var(axis=0)
plt.figure(figsize=(8, 6))
_ = plt.hist(kept_variances)

## Which features were kept?

In [None]:
# Column indices of the features retained by the fitted selector. Asking the
# selector for integer indices directly replaces the manual
# `np.arange(n_features)[mask]` lookup and yields the same index values.
vt.get_support(indices=True)

## Selecting the `k` "best" features from a dataset

### Perform a test that measures the statistical significance of every feature with respect to the target, and select the `k` highest-scoring features
### Types of significance tests:
* <h4>$\chi^2$ test</h4>
* <h4>Mutual information</h4>
* <h4>ANOVA F-value</h4>
* <h4>$p$-value of correlation coefficients</h4>

### See the [`SelectKBest` documentation](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest) for more

In [None]:
# Baseline: logistic regression on all iris features, scored on the same data
# it was fitted on (no held-out split in this cell).
X, y = iris.data, iris.target
lr = LogisticRegression().fit(X, y)  # fit() returns the estimator itself
baseline_pred = lr.predict(X)
print("f1 score:", f1_score(y, baseline_pred, average='weighted'))

In [None]:
# Keep the two features that score highest under the chi-squared test.
selector = SelectKBest(score_func=chi2, k=2)
xNew = selector.fit_transform(X, y)
print(xNew.shape)
# Refit the existing classifier on the reduced matrix and score it on the
# same data, mirroring the baseline cell.
lr.fit(xNew, y)
selected_pred = lr.predict(xNew)
print("f1 score:", f1_score(y, selected_pred, average='weighted'))

## Exercise
* ### Fit a default Logistic regression on the following dataset and report the f1 score
* ### For each possible value of $k$, fit a logistic regression and find the f1 score.
* ### Plot the scores vs. number of dimensions selected
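* ### Hint: use the `f_classif` score function (import it from `sklearn.feature_selection`) instead of `chi2`, because `chi2` requires non-negative feature values, which `make_classification` does not guarantee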

In [None]:
# Exercise dataset: generated with 3 informative and 10 redundant features;
# random_state pins the draw so every run sees the same data.
X, y = make_classification(n_informative=3, n_redundant=10, random_state=5)
# Print the shape of the generated feature matrix.
print(X.shape)

In [None]:
# enter code here