In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for path in (os.path.join(root, name)
             for root, _subdirs, files in os.walk('/kaggle/input')
             for name in files):
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Anomaly Detection using Unsupervised Techniques

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#for data preprocessing
from sklearn.decomposition import PCA

#for modeling
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

#filter warnings
import warnings
warnings.filterwarnings("ignore")

# Remove pandas' display truncation so every row and column of a displayed frame is rendered.
# NOTE(review): with no row limit, displaying a large frame directly prints all of it;
# prefer .head() / .sample() when inspecting.
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [None]:
# Load the healthcare-providers CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("../input/healthcare-providers-data/Healthcare Providers.csv")
df.head()

In [None]:
# List all column names of the loaded frame.
df.columns

In [None]:
# Columns removed before modeling: the row index, the provider identifier, the name and
# street-address fields, the zip code and the HCPCS code.
DropCols = [
    'index',
    'National Provider Identifier',
    'Last Name/Organization Name of the Provider',
    'First Name of the Provider',
    'Middle Initial of the Provider',
    'Street Address 1 of the Provider',
    'Street Address 2 of the Provider',
    'Zip Code of the Provider',
    'HCPCS Code',
]

In [None]:
# Drop the columns listed in DropCols; the rest of the frame is kept as-is.
df = df.drop(columns=DropCols)

In [None]:
# Column dtypes and non-null counts of the remaining columns.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Preview the frame after the columns in DropCols were removed.
df.head()

In [None]:
# Frequency of each distinct value in the entity-type column.
df["Entity Type of the Provider"].value_counts()

In [None]:
# Check the dtype of the submitted-charge column before the cleaning step below.
# Presumably object (text with thousands separators) at this point — confirm from the output.
df["Average Submitted Charge Amount"].dtype

In [None]:
# NOTE(review): this cell repeats the dtype check from the previous cell.
df["Average Submitted Charge Amount"].dtype

In [None]:
#Cleaning

def RemoveComma(x):
    """Return ``x`` with every comma removed.

    Non-string values (for example a float NaN standing in for a missing
    entry) have no commas to strip, so they are returned unchanged instead
    of raising ``AttributeError``.
    """
    if not isinstance(x, str):
        return x
    return x.replace(",","")

# Amount columns whose values have their commas stripped by RemoveComma before being
# parsed as numbers.
amount_columns = [
    "Average Medicare Allowed Amount",
    "Average Submitted Charge Amount",
    "Average Medicare Payment Amount",
    "Average Medicare Standardized Amount",
]

for amount_col in amount_columns:
    # errors="coerce" replaces errors="ignore", which pandas has deprecated and which
    # returned the column unchanged (still text) on any parse failure. With "coerce" the
    # column is always numeric and unparseable entries become NaN.
    df[amount_col] = pd.to_numeric(df[amount_col].apply(RemoveComma), errors="coerce")


In [None]:
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# NOTE(review): a function with this name is also defined in an earlier cell; this
# definition replaces it once this cell runs.
def RemoveComma(x):
    """Return ``x`` with every comma removed.

    Non-string values (for example a float NaN standing in for a missing
    entry) have no commas to strip, so they are returned unchanged instead
    of raising ``AttributeError``.
    """
    if not isinstance(x, str):
        return x
    return x.replace(",","")

def Preprocessing(data):
    """Impute, binary-encode and standardize the provider frame.

    Steps, in order:
      1. Fill missing "Credentials of the Provider" and "Gender of the Provider"
         values with each column's most frequent value.
      2. Replace every object-dtype column with its ``category_encoders``
         binary encoding. The encoded columns are placed after the remaining
         columns, one block per encoded column, in the original column order.
      3. Standardize every resulting column with ``StandardScaler``.

    The input is copied first, so the caller's DataFrame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the two columns imputed in step 1. Columns that are not
        object dtype go to the scaler as they are.

    Returns
    -------
    pandas.DataFrame
        A new frame of standardized values with the post-encoding column
        names and a default integer index.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # imputed values back into the caller's frame.
    data = data.copy()

    # 1. Impute missing values with the column mode.
    for col in ("Credentials of the Provider", "Gender of the Provider"):
        data[col] = data[col].fillna(data[col].mode()[0])

    # 2. Binary encoding: one independently fitted encoder per object column.
    BEcols = [var for var in data.columns if data[var].dtype == "O"]
    encoded_parts = [
        ce.BinaryEncoder(cols = [col]).fit_transform(data[col]) for col in BEcols
    ]
    # A single concat replaces growing the frame once per column; the column
    # order (untouched columns first, then each encoded block) is the same.
    data = pd.concat([data.drop(columns = BEcols)] + encoded_parts, axis = 1)

    # 3. Standardization; the scaler returns an array, so the column names are reattached.
    data_columns = data.columns
    scaled = StandardScaler().fit_transform(data)
    return pd.DataFrame(scaled, columns = data_columns)


# NOTE(review): this rebinds `df` to the encoded, standardized frame, so re-running this
# cell without first re-running the load and cleaning cells feeds already-processed data
# back into Preprocessing.
df = Preprocessing(df)

In [None]:
# Preview the encoded, standardized feature frame returned by Preprocessing.
df.head()

In [None]:
from sklearn.ensemble import IsolationForest

# Isolation Forest with 300 trees and a fixed seed, fitted on the preprocessed frame.
# The `behaviour` argument was dropped: it was a deprecated no-op that scikit-learn has
# since removed, so passing it raises a TypeError on scikit-learn 0.24 and later.
model = IsolationForest(n_estimators=300, max_samples='auto',
                        contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None,
                        verbose=1, warm_start=False, random_state=2020)
model.fit(df)

In [None]:
# Score the same frame the model was fitted on. IsolationForest.predict returns
# +1 for inliers and -1 for outliers.
Y = model.predict(df)

In [None]:
# Relabel the predictions in place: 1 becomes 0 and -1 becomes 1.
# NOTE(review): not re-runnable on its own. Once Y holds 0/1, a second run turns
# every 1 into 0, so re-run the predict cell first.
is_inlier = Y == 1
is_outlier = Y == -1
Y[is_inlier] = 0
Y[is_outlier] = 1

In [None]:
# Project the preprocessed features onto their first two principal components.
# random_state keeps the projection reproducible when scikit-learn selects a
# randomized SVD solver.
pca = PCA(2, random_state=2020)
x_pca = pd.DataFrame(pca.fit_transform(df), columns=['pc1', 'pc2'])

# Scatter of the two components, coloured by the relabelled predictions in Y.
pca_fig, pca_ax = plt.subplots(figsize=(12,8))
pca_ax.set_title('Sample Distribution on First 2 PCAs by Class Color')
pca_ax.scatter(x_pca['pc1'], x_pca['pc2'], alpha = .6, c=Y)
pca_ax.set_xlabel('PC1')
pca_ax.set_ylabel('PC2')
plt.show()

In [None]:
# for interactive visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff

In [None]:
# Three-component projection for the 3-D plot. random_state keeps it reproducible
# when scikit-learn selects a randomized SVD solver.
pca = PCA(3, random_state=2020)
x_pca = pd.DataFrame(pca.fit_transform(df), columns=['pc1', 'pc2', 'pc3'])

In [None]:
# Store the relabelled predictions next to the components so the plot can colour by them.
x_pca["Label"] = Y

In [None]:
# 3-D scatter of the three principal components. Marker fill and marker outline
# are both coloured by x_pca['Label'].
trace1 = go.Scatter3d(
    x= x_pca['pc1'],
    y= x_pca['pc2'],
    z= x_pca['pc3'],
    mode='markers',
    marker=dict(
        color = x_pca['Label'],
        size= 10,
        line=dict(
            color= x_pca['Label'],
            width= 12
        ),
        opacity=0.8
    )
)
dt = [trace1]

layout = go.Layout(
    # The previous title ('Character vs Gender vs Alive or not') did not describe this plot.
    title = 'Isolation Forest Labels on First 3 Principal Components',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=40  # leave room at the top for the title
    ),
    scene = dict(
            xaxis = dict(title  = 'PC1'),
            yaxis = dict(title  = 'PC2'),
            zaxis = dict(title  = 'PC3')
        )
)

fig = go.Figure(data = dt, layout = layout)
py.iplot(fig)

# Auto Encoders

In [None]:
pip install pyod

In [None]:
from pyod.models.auto_encoder import AutoEncoder

In [None]:
# Symmetric autoencoder (15-10-6-2-2-6-10-15) trained for 26 epochs, with 0.2% of
# the rows assumed anomalous. A seed is passed so repeated runs are comparable.
# NOTE(review): backend-level nondeterminism may remain.
clf1 = AutoEncoder(hidden_neurons =[15, 10, 6, 2, 2, 6, 10, 15], epochs = 26, contamination = .002,
                   random_state = 2020)
clf1.fit(df)

In [None]:
# One anomaly score per row of df, wrapped in a Series for plotting.
# The earlier read of clf1.decision_scores_ was dropped because the next line
# overwrote it straight away.
y_scores = pd.Series(clf1.decision_function(df))

In [None]:
# Hand-picked score cut-off, drawn as the vertical line on the histogram.
score_threshold = 18

hist_fig, hist_ax = plt.subplots(figsize=(20,7))
hist_ax.hist(y_scores, bins=1000)
hist_ax.axvline(score_threshold, color = 'b', alpha = .9, label = f"threshold = {score_threshold}")
hist_ax.set_title("Histogram for Model Clf1 Anomaly Scores")
hist_ax.set_xlabel("Anomaly score")
hist_ax.set_ylabel("Number of samples")
hist_ax.legend()
plt.show()

Points with an anomaly score above 18 (to the right of the blue line in the histogram) are considered anomalies.

# Thanks for reading! Please share your own approach to anomaly detection.