In [1]:
import datetime
import pandas as pd
import numpy as np
import pandas.io.data as pio
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the births-and-deaths CSV (path relative to the working directory) into a DataFrame.
df = pd.read_csv("_births_and_deaths.csv")

In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Quarter,Male Live Births,Female Live Births,Male Deaths,Female Deaths
0,2000Q1,7639,7139,3346,3070
1,2000Q2,7365,6866,3372,3178
2,2000Q3,7174,6843,3675,3511
3,2000Q4,6979,6600,3357,3151
4,2001Q1,7496,7232,3231,3070


In [4]:
# (number of rows, number of columns) of the loaded table.
df.shape

(52, 5)

In [5]:
from sklearn.cluster import MeanShift, estimate_bandwidth

In [6]:
# Clustering input: every column of df except "Quarter".
X = df.drop(labels=["Quarter"], axis=1)

In [7]:
# Estimate the kernel bandwidth from X, then cluster the same matrix with MeanShift.
bandwidth = estimate_bandwidth(np.array(X), quantile=0.2, n_samples=500)

# fit() hands back the fitted estimator, so construction and fitting are chained.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(np.array(X))
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# Every distinct label value corresponds to one cluster.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size
print("number of estimated clusters : %d" % n_clusters_)

number of estimated clusters : 5


In [9]:
# Cluster label assigned to each row of X by the fitted MeanShift model.
labels

array([1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 0, 4, 2, 0, 1, 2,
       0, 1, 0, 4, 0, 3, 1, 3, 3, 1, 3, 3, 3, 1, 1, 3, 1, 3, 1, 3, 1, 3, 1,
       4, 2, 1, 1, 4, 1])

In [10]:
# Store each row's MeanShift cluster label on the table as the "label" column.
df["label"] = labels

In [14]:
# Group the rows by their MeanShift cluster label.
df_gb = df.groupby(by = "label")
# Per-cluster count of non-null entries in every column.
# count() takes no column argument: the previous stray "Male Deaths" positional
# had no effect on the result and raises TypeError on current pandas releases.
df_f = df_gb.count()

In [16]:
# Move the "label" group key out of the index and back into an ordinary column.
df_cluster = df_f.reset_index()

In [19]:
# Show the clusters ordered by their per-cluster count, largest first.
# DataFrame.sort() no longer exists in pandas; sort_values(by=...) is its replacement.
df_cluster.sort_values(by="Male Live Births", ascending=False)

Unnamed: 0,label,Quarter,Male Live Births,Female Live Births,Male Deaths,Female Deaths
0,0,16,16,16,16,16
1,1,15,15,15,15,15
3,3,10,10,10,10,10
2,2,7,7,7,7,7
4,4,4,4,4,4,4


In [21]:
# Keep only the remaining measurement columns for the outlier analysis below.
# Rebinding df (rather than inplace=True) together with errors="ignore" keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone, and the frame held by df_gb is left untouched.
df = df.drop(["Quarter", "label"], axis=1, errors="ignore")

In [23]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.patches as mPatch
from matplotlib.legend_handler import HandlerLine2D

#knn function gets the dataset and calculates K-Nearest neighbors and distances
def knn(df,k):
    """Fit a NearestNeighbors model on ``df`` and query it with the same points.

    Parameters
    ----------
    df : array-like or DataFrame
        The points to index and to query.
    k : int
        Number of neighbours requested per point. Previously this argument was
        ignored and the neighbour count was hard-coded to 3.

    Returns
    -------
    (distances, indices)
        The two arrays returned by ``NearestNeighbors.kneighbors(df)``.
    """
    nbrs = NearestNeighbors(n_neighbors=k)
    nbrs.fit(df)
    distances, indices = nbrs.kneighbors(df)
    return distances, indices

#reachDist calculates the reach distance of each point to MinPts around it
def reachDist(df,MinPts,knnDist):
    """Query the ``MinPts`` nearest neighbours of every point and flatten the
    leading distances to the row maximum.

    A NearestNeighbors model with ``n_neighbors=MinPts`` is fitted on ``df``
    and queried with ``df`` itself. Columns 0, 1 and 2 of the resulting
    distance matrix are then overwritten with each row's largest distance.

    ``knnDist`` is accepted for interface compatibility but is not used.

    Returns the modified distance matrix and the neighbour index matrix.

    NOTE(review): only the first three columns are replaced, so for
    MinPts > 3 the remaining columns keep their raw neighbour distances, and
    MinPts < 3 raises IndexError. Confirm whether every column was meant to
    be replaced.
    """
    model = NearestNeighbors(n_neighbors=MinPts)
    model.fit(df)
    distancesMinPts, indicesMinPts = model.kneighbors(df)
    # Writing the row maximum back into a row does not change that maximum,
    # so it is computed once and reused for all three columns.
    row_max = np.amax(distancesMinPts, axis=1)
    for col in range(3):
        distancesMinPts[:, col] = row_max
    return distancesMinPts, indicesMinPts

#lrd calculates the Local Reachability Density
def lrd(MinPts,knnDistMinPts):
    """Return ``MinPts`` divided by the row-wise sum of ``knnDistMinPts``.

    ``knnDistMinPts`` is summed along axis 1, giving one density value per row.
    """
    summed_distances = np.sum(knnDistMinPts, axis=1)
    return MinPts / summed_distances

#Finally lof calculates LOF outlier scores
def lof(Ird,MinPts,dsts):
    """Compute one outlier score per row of ``dsts``.

    For each index row, the ``Ird`` values at positions ``row[1:]`` are divided
    by the ``Ird`` value at ``row[0]``; the ratios are summed and divided by
    ``MinPts``. The scores are returned as a list in row order.
    """
    return [
        np.divide(Ird[row[1:]], Ird[row[0]]).sum() / MinPts
        for row in dsts
    ]

#We flag anything with an outlier score greater than 1.2 as an outlier
#This is just for charting purposes
def returnFlag(x):
    """Return 1 when ``x['Score']`` is greater than 1.2, otherwise 0."""
    return 1 if x['Score']>1.2 else 0

#Use the frame prepared in the cells above; data is another name for df, not a copy
data = df

#MinPts used for the reach-distance / LOF steps; change it to experiment
m=15

#Neighbour distances and indices for k=3
knndist, knnindices = knn(data, 3)
#Distances and neighbour indices for the m nearest neighbours
reachdist, reachindices = reachDist(data, m, knndist)
#One density value per row, then one outlier score per row
irdMatrix = lrd(m, reachdist)
lofScores = lof(irdMatrix, m, reachindices)

#Attach the scores to the rows by index position
scores = pd.DataFrame(lofScores, columns=['Score'])
mergedData = data.merge(scores, left_index=True, right_index=True)

#flag is 1 for rows scored above the returnFlag threshold, 0 otherwise
mergedData['flag'] = mergedData.apply(returnFlag, axis=1)
Outliers = mergedData[mergedData['flag'] == 1]
Normals = mergedData[mergedData['flag'] == 0]

In [24]:
Outliers

Unnamed: 0,Male Live Births,Female Live Births,Male Deaths,Female Deaths,Score,flag
6,6873,6783,3914,4000,1.3579,1
9,6713,6438,3523,3365,1.333963,1
10,7061,6634,3959,4003,1.295122,1
14,7125,6842,4073,4113,1.449755,1
28,8602,7934,3450,3436,1.325092,1
33,8597,8053,3617,3625,1.279122,1
34,8262,7854,4075,4198,1.377244,1
38,8264,8047,3999,3994,1.394383,1
40,8756,8212,3289,3211,2.000142,1
46,7961,7563,4079,4247,1.278596,1
