## Dimensionality reduction & Outlier detection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, RocCurveDisplay, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Raise pandas' display limits so wide and long frames are shown without truncation.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', 200)

#### The data provided is a small sample of a real-world [dataset](./income.csv) that contains two classes of observations about individuals' income: `>50K` and `<=50K`.

In [None]:
# Load the income sample from 'income.csv' and display its (rows, columns) shape.
df = pd.read_csv('income.csv')
df.shape

In [None]:
# Count how many columns do / do not contain at least one missing value.
df.isna().any().value_counts()

In [None]:
# Peek at five randomly chosen rows.
df.sample(5)

In [None]:
# Encode the '<=50K' label as 0, in place, wherever it appears in the frame.
df.replace(to_replace='<=50K', value=0, inplace=True)

In [None]:
# Encode the '>50K' label as 1, in place, wherever it appears in the frame.
df.replace(to_replace='>50K', value=1, inplace=True)

In [None]:
# Check for class imbalance: number of rows per value of the 'class' column.
df['class'].value_counts()

#### Luckily enough, data is already scaled for you and all categorical features are transformed using binary dummies so you can focus on your single task for the day:

#### Compare the performance of three of your favorite classifiers on the original data. Then use PCA, TSNE and SVD to transform your data and find what performs better for you.

* **Recommendation 1**: When using PCA and SVD, reduce your data to the number of dimensions that explains more than 99% of the data's variance.

* **Recommendation 2**: Use grid search to optimize your classifiers. Remember that after changing the dimensions of your space you may also have to adjust your hyperparameters. Grid search can choose them for you each time you repeat the process.

* **Recommendation 3**: Because of the stochastic elements in the process, to compare algorithms fairly you should use the mean of more than one iteration.

In [None]:
# Column 0 is the target; every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
# Default-configured instances of the three classifiers being compared.
# NOTE(review): `clfs` is not referenced by any later cell visible here — confirm it is still needed.
clfs = [RandomForestClassifier(), KNeighborsClassifier(), SVC()]

In [None]:
# Single-step pipeline; the grid search fills the 'clf' slot with each candidate.
pipe = Pipeline([
    ('clf', 'passthrough'),
])

# One sub-grid per candidate classifier.
parameters = [
    {
        # probability=True so the refitted model exposes predict_proba.
        'clf': (SVC(probability=True),),
        'clf__C': [0.01, 8.3, 10],
        'clf__kernel': ('rbf', 'sigmoid'),
    },
    {
        'clf': (RandomForestClassifier(),),
        'clf__n_estimators': [100, 1000],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
        'clf__max_features': ['sqrt'],
        'clf__criterion': ['gini', 'entropy'],
    },
]

# 3-fold CV over every combination; several metrics are recorded and the best
# configuration by ROC AUC is refitted on the full training set.
grid_search = GridSearchCV(
    pipe,
    parameters,
    cv=3,
    scoring=['balanced_accuracy', 'f1_macro', 'roc_auc', 'accuracy'],
    refit='roc_auc',
    verbose=4,
    n_jobs=-1,
)

In [None]:
# Run the search, tabulate the cross-validation results, and show the
# top-ranked row per classifier when ordered by mean ROC AUC.
result = grid_search.fit(X_train, y_train)
report = pd.DataFrame(result.cv_results_)
report.sort_values('mean_test_roc_auc', ascending=False).drop_duplicates(subset='param_clf')

In [None]:
# Same table ordered by mean accuracy: top-ranked row per classifier.
report.sort_values('mean_test_accuracy', ascending=False).drop_duplicates(subset='param_clf')

#### It seems that the RF classifier gives the highest roc_auc score and also one of the highest accuracy scores. Let's plot the confusion matrix

In [None]:
# Predict hard labels and class probabilities for the held-out rows
# using the fitted grid search.
y_pred = result.predict(X_test)
y_pred_proba = result.predict_proba(X_test)

In [None]:
# Plot the confusion matrix of true vs. predicted labels on the test split.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

In [None]:
# ROC curve built from column 1 of the predicted probabilities.
RocCurveDisplay.from_predictions(y_test, y_pred_proba[:,1])

In [None]:
# Per-class precision / recall / F1 summary on the test split.
print(classification_report(y_test, y_pred))

### Principal Component Analysis

In [None]:
# Fit a PCA that keeps all components and plot the cumulative explained-variance ratio.
# NOTE(review): PCA is fitted on all of X, before the train/test split that follows,
# so the projection has seen the test rows — consider fitting on the training rows only.
pca = PCA(n_components=None, svd_solver='full')
pca.fit(X)
pca_cum = pca.explained_variance_ratio_.cumsum()
plt.plot(pca_cum)

# print(pca.singular_values_)

In [None]:
def find_nearest(array, value):
    """Return the index of the element of `array` closest to `value`.

    `array` is converted with `np.asarray`; the index is the flat position
    reported by `argmin`, so ties resolve to the first occurrence.
    """
    distances = np.abs(np.asarray(array) - value)
    return np.argmin(distances)

# Index of the component whose cumulative explained variance is closest to 99%.
find_nearest(pca_cum, 0.99)

#### So 99% of the variance is captured by the components up to index 110 of the transformed matrix. Let's keep only those columns.

In [None]:
# Keep the leading components up to and including the index whose cumulative
# explained variance is closest to 99%, rather than a hard-coded column count
# that silently goes stale when the data changes.
n_pca_components = find_nearest(pca_cum, 0.99) + 1
X_pca = pca.transform(X)[:, :n_pca_components]

In [None]:
# Compare the shape of the reduced matrix with the shape of the original frame.
X_pca.shape, df.shape

#### We went from 500 dimensions to 111 dimensions!

Now repeat the same pipeline on the PCA-transformed data.

In [None]:
# Same 80/20 split and seed as before, now on the PCA-reduced features.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
# Single-step pipeline; the grid search fills the 'clf' slot with each candidate.
pipe = Pipeline([
    ('clf', 'passthrough'),
])

# One sub-grid per candidate classifier.
parameters = [
    {
        'clf': (SVC(),),
        'clf__C': [0.01, 10, 100],
        'clf__kernel': ('rbf', 'sigmoid'),
    },
    {
        'clf': (RandomForestClassifier(),),
        'clf__n_estimators': [100, 1000],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
        'clf__max_features': ['sqrt'],
        'clf__criterion': ['gini', 'entropy'],
    },
]

# 3-fold CV over every combination; the best configuration by ROC AUC is refitted.
grid_search = GridSearchCV(
    pipe,
    parameters,
    cv=3,
    scoring=['balanced_accuracy', 'f1_macro', 'roc_auc', 'accuracy'],
    refit='roc_auc',
    verbose=4,
    n_jobs=-1,
)

In [None]:
# Run the search on the PCA features and show the top-ranked row per
# classifier when ordered by mean ROC AUC.
result = grid_search.fit(X_train, y_train)
report = pd.DataFrame(result.cv_results_)
report.sort_values('mean_test_roc_auc', ascending=False).drop_duplicates(subset='param_clf')

### Singular Value Decomposition

In [None]:
# Fit a 400-component truncated SVD on X and plot the cumulative explained-variance ratio.
svd = TruncatedSVD(n_components=400)
svd.fit(X)
svd_cum = svd.explained_variance_ratio_.cumsum()
plt.plot(svd_cum)

In [None]:
def find_nearest(array, value):
    """Locate the entry of `array` that lies closest to `value`.

    Returns the flat `argmin` index of the absolute differences, after
    converting `array` with `np.asarray`.
    """
    gaps = np.abs(np.asarray(array) - value)
    return np.argmin(gaps)

# Index of the SVD component whose cumulative explained variance is closest to 99%.
find_nearest(svd_cum, 0.99)

In [None]:
# Derive the cut-off from the SVD's own cumulative variance curve; the previous
# hard-coded 111 was the PCA cut-off and need not match the SVD one.
n_svd_components = find_nearest(svd_cum, 0.99) + 1
X_svd = svd.transform(X)[:, :n_svd_components]

In [None]:
# Same 80/20 split and seed as before, now on the SVD-reduced features.
X_train, X_test, y_train, y_test = train_test_split(
    X_svd,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
# Re-run the existing `grid_search` object on the SVD features and show the
# top-ranked row per classifier when ordered by mean ROC AUC.
result = grid_search.fit(X_train, y_train)
report = pd.DataFrame(result.cv_results_)
report.sort_values('mean_test_roc_auc', ascending=False).drop_duplicates(subset='param_clf')

#### Bonus: Compare how Isolation Forest and Local Outlier Factor methods impact your three classifiers.

In [None]:
# Flag outliers with Local Outlier Factor (fit_predict yields -1 for outliers, 1 for inliers).
# Score the feature columns only: the target in column 0 and any 'outlier'
# column left over from a previous run of this cell must not influence the detector.
lof_features = df.drop(columns='outlier', errors='ignore').iloc[:, 1:]
clf = LocalOutlierFactor(n_neighbors=5)
df['outlier'] = clf.fit_predict(lof_features)

In [None]:
# Display the per-row outlier flag that was just stored on the frame.
df.outlier

In [None]:
# Keep only the rows that were not flagged as outliers (-1).
inlier_mask = df['outlier'] != -1
df_out = df[inlier_mask]

In [None]:
# Column 0 is the target. The helper 'outlier' flag is removed from the
# features: it is bookkeeping from the detector, not a real predictor, and
# after filtering it holds the same value on every remaining row.
y = df_out.iloc[:, 0]
X = df_out.iloc[:, 1:].drop(columns='outlier')

In [None]:
# 80/20 split with the usual seed, on the outlier-filtered data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
# Single-step pipeline; the grid search fills the 'clf' slot with each candidate.
pipe = Pipeline([
    ('clf', 'passthrough'),
])

# One sub-grid per candidate classifier.
parameters = [
    {
        'clf': (SVC(),),
        'clf__C': [0.01, 8.3, 100],
        'clf__kernel': ('rbf', 'sigmoid'),
        'clf__class_weight': ('balanced', None),
    },
    {
        'clf': (RandomForestClassifier(),),
        'clf__n_estimators': [100, 1000],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
        'clf__max_features': ['sqrt'],
        'clf__criterion': ['gini', 'entropy'],
    },
    {
        'clf': (KNeighborsClassifier(),),
        'clf__n_neighbors': [3, 5],
        'clf__leaf_size': [30, 50],
    },
]

# 3-fold CV over every combination; the best configuration by ROC AUC is refitted.
grid_search = GridSearchCV(
    pipe,
    parameters,
    cv=3,
    scoring=['roc_auc', 'accuracy'],
    refit='roc_auc',
    verbose=4,
    n_jobs=-1,
)

In [None]:
# Run the search on the LOF-filtered data and show the top-ranked row per
# classifier when ordered by mean ROC AUC.
result = grid_search.fit(X_train, y_train)
report = pd.DataFrame(result.cv_results_)
report.sort_values('mean_test_roc_auc', ascending=False).drop_duplicates(subset='param_clf')

In [None]:
# Same table ordered by mean accuracy: top-ranked row per classifier.
report.sort_values('mean_test_accuracy', ascending=False).drop_duplicates(subset='param_clf')

In [None]:
# Flag outliers with an Isolation Forest (fit_predict yields -1 for outliers, 1 for inliers).
# Fit on the feature columns only: at this point `df` still carries the
# 'outlier' labels written by the LOF step, and column 0 is the target —
# neither should feed the detector.
iso_features = df.drop(columns='outlier', errors='ignore').iloc[:, 1:]
clf = IsolationForest(n_estimators=100)
df['outlier'] = clf.fit_predict(iso_features)

In [None]:
# Keep only the rows that were not flagged as outliers (-1).
inlier_mask = df['outlier'] != -1
df_out = df[inlier_mask]

In [None]:
# Column 0 is the target. The helper 'outlier' flag is removed from the
# features: it is bookkeeping from the detector, not a real predictor, and
# after filtering it holds the same value on every remaining row.
y = df_out.iloc[:, 0]
X = df_out.iloc[:, 1:].drop(columns='outlier')

In [None]:
# 80/20 split with the usual seed, on the outlier-filtered data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
# Single-step pipeline; the grid search fills the 'clf' slot with each candidate.
pipe = Pipeline([
    ('clf', 'passthrough'),
])

# One sub-grid per candidate classifier.
parameters = [
    {
        # probability=True: predict_proba is called on the refitted model
        # further down, which an SVC only supports when this flag is set.
        'clf': (SVC(probability=True),),
        'clf__C': [0.01, 8.3, 100],
        'clf__kernel': ('rbf', 'sigmoid'),
        'clf__class_weight': ('balanced', None),
    },
    {
        'clf': (RandomForestClassifier(),),
        'clf__n_estimators': [100, 1000],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
        'clf__max_features': ['sqrt'],
        'clf__criterion': ['gini', 'entropy'],
    },
    {
        'clf': (KNeighborsClassifier(),),
        'clf__n_neighbors': [3, 5],
        'clf__leaf_size': [30, 50],
    },
]

# 3-fold CV over every combination; the best configuration by ROC AUC is refitted.
grid_search = GridSearchCV(
    pipe,
    parameters,
    cv=3,
    scoring=['roc_auc', 'accuracy'],
    refit='roc_auc',
    verbose=4,
    n_jobs=-1,
)

In [None]:
# Run the search on the Isolation-Forest-filtered data and show the top-ranked
# row per classifier when ordered by mean ROC AUC.
result = grid_search.fit(X_train, y_train)
report = pd.DataFrame(result.cv_results_)
report.sort_values('mean_test_roc_auc', ascending=False).drop_duplicates(subset='param_clf')

In [None]:
# Same table ordered by mean accuracy: top-ranked row per classifier.
report.sort_values('mean_test_accuracy', ascending=False).drop_duplicates(subset='param_clf')

In [None]:
# Predict hard labels and class probabilities for the held-out rows
# using the fitted grid search.
y_pred = result.predict(X_test)
y_pred_proba = result.predict_proba(X_test)

In [None]:
# Plot the confusion matrix of true vs. predicted labels on the test split.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

In [None]:
# ROC curve built from column 1 of the predicted probabilities.
RocCurveDisplay.from_predictions(y_test, y_pred_proba[:,1])