In [None]:
import datetime
import glob
import os
import re
import time

import matplotlib
import matplotlib.dates as md
import numpy as np
import pandas as pd
import seaborn
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA

# Return a Series with the distance between each point and the centroid of its own cluster
def getDistanceByPoint(data, model):
    """Compute the Euclidean distance from every sample to its assigned centroid.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        One sample per row. Rows are consumed by position, so row ``i`` is
        paired with ``model.labels_[i]``.
    model : fitted clustering model
        Must expose ``cluster_centers_`` and ``labels_``.

    Returns
    -------
    pandas.Series
        Float distances indexed ``0 .. len(data) - 1``.

    Raises
    ------
    IndexError
        If ``model.labels_`` holds fewer entries than ``data`` has rows.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim == 1:
        # A 1-D input is treated as n samples with a single feature.
        points = points[:, np.newaxis]
    n_points = len(points)
    if n_points == 0:
        return pd.Series([], dtype=float)

    labels = np.asarray(model.labels_)
    if len(labels) < n_points:
        raise IndexError(
            "model.labels_ has {} entries but data has {} rows".format(len(labels), n_points)
        )

    # labels_ are already 0-based row indices into cluster_centers_; the former
    # "- 1" shifted every point to a neighbouring centroid (label 0 wrapped
    # around to the last one).
    centroids = np.asarray(model.cluster_centers_)[labels[:n_points]]
    distances = np.linalg.norm(points - centroids, axis=1)
    return pd.Series(distances, index=range(n_points), dtype=float)

# Turn a raw asset dataframe into a feature matrix that scikit-learn can consume directly
def min_max_format(df):
    """Filter unusable rows and standardize the numeric feature columns.

    Despite the name, the scaling is done with ``StandardScaler`` rather than
    a min-max scaler.

    Rows are kept only when 'MDI_OBD_FUEL' is not NaN and 'meter_filled' is
    finite.

    Returns a two-element list ``[X, time]``:
    ``X`` is a DataFrame of the scaled features (integer column labels,
    index of the retained rows) and ``time`` is the 'recorded_at' column of
    those same rows.
    """
    feature_columns = ['MDI_OBD_FUEL','speed_filled','meter_filled','distance_cum','avrg_speed']

    fuel_present = ~np.isnan(df['MDI_OBD_FUEL'])
    meter_finite = np.isfinite(df['meter_filled'])
    kept = df[fuel_present & meter_finite]

    scaler = preprocessing.StandardScaler()
    scaled_values = scaler.fit_transform(kept[feature_columns])
    scaled_frame = pd.DataFrame(scaled_values, index=kept.index)

    return [scaled_frame, kept['recorded_at']]

# Location of the asset CSV. The directory component contains a '*', and
# pd.read_csv does not expand wildcards for local paths, so the pattern is
# resolved explicitly before reading.
file_pattern = "data/data_asset_choosed/487_None_old*/487_51.csv"
if os.path.exists(file_pattern):
    # A path that literally contains '*' exists: use it unchanged.
    file_name = file_pattern
else:
    matching_files = sorted(glob.glob(file_pattern))
    if not matching_files:
        raise FileNotFoundError("No file matches {!r}".format(file_pattern))
    if len(matching_files) > 1:
        raise ValueError(
            "Pattern {!r} is ambiguous, it matches: {}".format(file_pattern, matching_files)
        )
    file_name = matching_files[0]

# parse_dates=[1] parses the second CSV column as datetimes
# (presumably 'recorded_at' — TODO confirm against the file header).
df_asset = pd.read_csv(file_name, parse_dates=[1])
df_asset.head(10)

In [None]:
# Select the useful features and standardize them (first element returned by min_max_format)
features_std = min_max_format(df_asset)[0]

# Project the standardized features onto their two leading principal components
pca = PCA(n_components=2)
components = pca.fit_transform(features_std)

# Standardize the two principal components again so both have comparable scale
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(components)

# Final 2-column frame (columns 0 and 1, default integer index) used by the clustering cells
data = pd.DataFrame(np_scaled)

In [None]:
# Apply the same row filter as min_max_format so that, after resetting the
# index, row i of `df` corresponds to row i of `data`.
df = df_asset[~np.isnan(df_asset['MDI_OBD_FUEL'])]
df = df[np.isfinite(df['meter_filled'])]
df = df.reset_index(drop=True)
assert len(df) == len(data), "df and data must describe the same rows"

# The notebook previously read `kmeans[3]`, a name no cell defines, so this
# cell failed on a fresh kernel. Fit the model here instead. Four clusters are
# used because the following cells split the data on labels 0..3
# (NOTE(review): confirm this matches the model that was originally intended).
N_CLUSTERS = 4
kmeans_model = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0).fit(data)

df['cluster'] = kmeans_model.predict(data)
df['principal_feature1'] = data[0]
df['principal_feature2'] = data[1]
df['cluster'].value_counts()

In [None]:
# Fuel readings ('MDI_OBD_FUEL') of clusters 0..3, one Series per cluster
df_class0, df_class1, df_class2, df_class3 = [
    df.loc[df['cluster'] == cluster_id, 'MDI_OBD_FUEL']
    for cluster_id in range(4)
]

In [None]:
# Histogram of the fuel readings of each cluster, one panel per cluster.
# Panels are titled so they can be told apart when the notebook is skimmed.
fig, axs = plt.subplots(2,2)
fuel_by_cluster = (df_class0, df_class1, df_class2, df_class3)
# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for cluster_id, (ax, fuel) in enumerate(zip(axs.flat, fuel_by_cluster)):
    fuel.hist(ax=ax, bins=32)
    ax.set_title("cluster {}".format(cluster_id))
    ax.set_xlabel("MDI_OBD_FUEL")
    ax.set_ylabel("count")
fig.tight_layout()
plt.show()


In [None]:
# Fit an EllipticEnvelope (Gaussian assumption) to the fuel readings of each
# cluster separately, then attach the decision score and the predicted label.
# NOTE(review): `outliers_fraction` is not defined in any visible cell —
# confirm it is set before this cell runs.
scored_classes = []
for fuel in (df_class0, df_class1, df_class2, df_class3):
    # After a first run the df_classN names hold the scored frames produced
    # below. Pull the fuel column back out so that re-running this cell does
    # not feed the 'deviation'/'anomaly' columns into the fit.
    if isinstance(fuel, pd.DataFrame):
        fuel = fuel['MDI_OBD_FUEL']

    envelope = EllipticEnvelope(contamination=outliers_fraction)
    # scikit-learn expects a 2-D array: n_samples rows, a single feature column
    X_train = fuel.values.reshape(-1,1)
    envelope.fit(X_train)

    scored_classes.append(
        pd.DataFrame(fuel).assign(
            deviation=envelope.decision_function(X_train),
            # predict() returns 1 for inliers and -1 for outliers
            anomaly=envelope.predict(X_train),
        )
    )

df_class0, df_class1, df_class2, df_class3 = scored_classes

In [None]:
# Plot the fuel-level distribution of each cluster, split into normal points
# (anomaly == 1) and anomalies (anomaly == -1).
(a0, b0), (a1, b1), (a2, b2), (a3, b3) = [
    (
        scored.loc[scored['anomaly'] == 1, 'MDI_OBD_FUEL'],
        scored.loc[scored['anomaly'] == -1, 'MDI_OBD_FUEL'],
    )
    for scored in (df_class0, df_class1, df_class2, df_class3)
]

fig, axs = plt.subplots(2,2)
panels = zip(axs.flat, (a0, a1, a2, a3), (b0, b1, b2, b3))
for cluster_id, (ax, normal, anomalous) in enumerate(panels):
    ax.hist([normal, anomalous], bins=32, stacked=True, color=['blue', 'red'], label=['normal', 'anomaly'])
    ax.set_title("cluster {}".format(cluster_id))
    ax.set_xlabel("MDI_OBD_FUEL")
    # Legend per panel: a bare plt.legend() only annotates the last subplot.
    ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Bring the per-cluster anomaly labels back onto the main dataframe as a 0/1 flag
df_class = pd.concat([df_class0, df_class1, df_class2, df_class3])
# Align the labels on df's index; rows without a label compare unequal to -1 and become 0
is_outlier = df_class['anomaly'].reindex(df.index) == -1
df['anomaly22'] = np.array(is_outlier).astype(int)

