# Anomaly Detection
Many applications require being able to decide whether a new observation belongs to the same distribution as existing observations (it is an inlier), or should be considered as different (it is an outlier). Often, this ability is used to clean real data sets.
Two important distinctions must be made:
### Novelty detection:
The training data is not polluted by outliers, and we are interested in detecting anomalies in new observations.

### Outlier detection:
The training data contains outliers, and we need to fit the central mode of the training data, ignoring the deviant observations.

In [None]:
import pandas
import numpy

# need this for using pandas built-in plotting facility
import matplotlib.pyplot as plt
%matplotlib inline

# please visit 'http://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py'
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier


# Keep DataFrame previews compact: show at most 10 rows and 10 columns.
pandas.options.display.max_rows = 10
pandas.options.display.max_columns = 10

# Seed NumPy's global pseudo-random generator so reruns are repeatable.
numpy.random.seed(100)

# Min-max scaler mapping each feature into [-1, 1]; it is fitted later on.
scaler = MinMaxScaler(feature_range=(-1, 1))

In [None]:
# Load the data set straight from the zipped CSV; the first row of the file
# supplies the column names.
data = pandas.read_csv(
    "./datasets/creditcardfraud.zip",
    sep=',',
    header=0,
    compression='zip',
)

# Per-column memory footprint in MiB (bytes / 2**20), shown together with the
# (rows, columns) shape of the frame.
data.memory_usage().div(2**20), data.shape

In [None]:
# let's peek into the data, as always
data

# data.columns

In [None]:
# show me a crude description (summary statistics) of the data
data.describe()

In [None]:
# Fresh figure for the label histogram.
plt.figure()

# Histogram of the values in the 'Class' column.
data['Class'].plot(kind='hist')

plt.show()

# Rows whose 'Class' equals 1; the size of the displayed frame tells how many
# of them there are.
data.loc[data['Class'] == 1]

In [None]:
# Stacked histogram of five min-max-scaled features (first 50 rows only).
plt.figure()

subset_features = ['V1', 'V2', 'V3', 'V4', 'V5']

# NOTE: this fits the shared `scaler` on these five columns only.
scaled_subset = pandas.DataFrame(
    scaler.fit_transform(data[subset_features]),
    columns=subset_features)

# Draw on the figure created above: without an explicit `ax`, DataFrame.plot
# opens a figure of its own and the one created above stays blank.
scaled_subset.iloc[0:50].plot.hist(ax=plt.gca(), stacked=True,
                                   bins=10, alpha=0.8)

plt.show()

In [None]:
# show the column labels of the loaded frame
data.columns

In [None]:
# Model inputs: the columns 'V1' ... 'V28' plus 'Amount'.
features = ['V{}'.format(i) for i in range(1, 29)] + ['Amount']

# Label column, kept as a list so that data[target] is a one-column DataFrame.
target = ['Class']

# Show the rows whose label is missing. NaN never compares equal to anything
# (not even to itself), so `== numpy.nan` always selects nothing; isna() is
# the correct test.
data[data['Class'].isna()]

In [None]:
# Drop every row whose label is missing. Calling dropna(inplace=True) on the
# `data.Class` column only affects that Series object and never removes rows
# from `data`, so filter the frame itself. The index is renumbered so row
# labels stay aligned with frames rebuilt from plain arrays (e.g. X_scaled).
data = data.dropna(subset=target).reset_index(drop=True)
X = data[features]
Y = data[target]

In [None]:
# Rescale every feature column with the shared min-max scaler and wrap the
# resulting array in a DataFrame that carries the feature names.
X_scaled = pandas.DataFrame(data=scaler.fit_transform(X), columns=features)

# Side-by-side look at 'V6' for row labels 0 through 100:
# the left column is the raw value, the right column is the scaled one.
pandas.concat([X['V6'].loc[0:100], X_scaled['V6'].loc[0:100]], axis=1)

In [None]:
# Rank every feature with recursive feature elimination (RFE) driven by a
# gradient-boosting classifier.
cls = GradientBoostingClassifier()

# `step` must be the integer 1 (drop one feature per iteration): a float is
# read as a fraction of the features and has to lie strictly inside (0, 1),
# so recent scikit-learn releases reject 1.0.
rfe = RFE(estimator=cls, n_features_to_select=3, step=1)

# Fit on the first `margin` rows only (presumably to keep the run short).
margin = 1000
# Y is a one-column DataFrame; scikit-learn expects a 1-D label array.
rfe.fit(X_scaled.iloc[0:margin], Y.iloc[0:margin].values.ravel())

# Other fitted attributes worth inspecting:
#rfe.n_features_
#rfe.estimator_
#rfe.get_support()

# Build a {'feature': rank} dictionary; lower is better (1 is the best).
rank = dict(zip(features, rfe.ranking_))
rank

#cls.feature_importances_

In [None]:
# Summary statistics of the 'Amount' column (not rendered: only the last
# expression of a notebook cell is displayed).
X['Amount'].describe()

# Rows with Amount above 10 whose Class is 1.
data.loc[X['Amount'].gt(10) & Y['Class'].eq(1)]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Hold out 10% of the rows for testing, using only three selected features.
selected_features = ['V12', 'V23', 'Amount']
X_train, X_test, y_train, y = train_test_split(
    X[selected_features], Y, test_size=0.1)

rfc = RandomForestClassifier()
# Y is a one-column DataFrame; flatten the labels to the 1-D array that
# scikit-learn classifiers expect (a column vector triggers a
# DataConversionWarning).
rfc.fit(X_train, y_train.values.ravel())

# Predicted labels for the held-out rows; `y` holds the matching true labels.
y_hat = rfc.predict(X_test)

In [None]:
# Per-class precision, recall, F1 and support for the held-out rows.
# classification_report expects (y_true, y_pred): with the arguments swapped,
# precision and recall are reported in each other's place.
print(metrics.classification_report(y, y_hat))

# Precision, Recall, F Score, Support,  Accuracy (In Binary Classification)

![Precision and Recall](https://upload.wikimedia.org/wikipedia/commons/2/26/Precisionrecall.svg)

![F_x Score](https://wikimedia.org/api/rest_v1/media/math/render/svg/49d1ff4917ee4c464f6efbee08735b4a8694e8c0)

<!-- ![TN, TP, FP and FN](https://upload.wikimedia.org/wikipedia/commons/6/65/Binary-classification-labeled.svg)
-->

![Accuracy](https://wikimedia.org/api/rest_v1/media/math/render/svg/e2e427ec6dcf2d7882c3bbdc659a8204cba59dcc)

In [None]:
# Fraction of held-out rows predicted correctly. Treat it with suspicion:
# accuracy alone says little when one class dominates the labels.
print(metrics.accuracy_score(y_true=y, y_pred=y_hat))

In [None]:
# The classes look heavily imbalanced: compute the share of rows labelled 1.
a = len(data.loc[data['Class'] == 1])
b = len(data)
a/b