# Risk & reliability pipeline

This notebook encapsulates an end-to-end pipeline for the iDIME project using [toy data(1)](https://people.rit.edu/fa3019/MaintNet/data_aviation.html).

The data comprises two types of documents: `inspection` and `maintenance`. Each document is a text description of the fault and the maintenance action performed for that fault.

The pipeline has the following steps:

1. Extracting the maintenance data into a pandas dataframe with two columns. Each column represents `inspection` or `maintenance` actions.
2. Converting the text data into vector embeddings for each column using the `gensim` library.
3. Using manifold learning to reduce the dimensionality of the vector embeddings for each column.
4. Using clustering to identify common modes in the data columns.
5. Discretizing dataset. Representing each row of each column by the cluster it belongs to.
6. Learning a Bayes net from the discretized data (3).

![](https://imgur.com/EOjDofq.png)

Citations:

1. Akhbardeh, Farhad, Travis Desell, and Marcos Zampieri. "Maintnet: A collaborative open-source library for predictive maintenance language resources." arXiv preprint arXiv:2005.12443 (2020).
2. Řehůřek, Radim, and Petr Sojka. "Gensim—statistical semantics in python." Retrieved from gensim.org (2011).
3. Taskesen, E. (2020). Learning Bayesian Networks with the bnlearn Python Package. (Version 0.3.22) [Computer software]. https://erdogant.github.io/bnlearn



In [None]:
# see here for sentence transformers : https://www.sbert.net/
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2

In [None]:
import numpy as np
import pandas as pd
import gensim
from sklearn.pipeline import Pipeline

#TODO: try different manifold learning algorithms:
from sklearn.manifold import Isomap
from sklearn.preprocessing import Normalizer

#TODO: try out different clustering algorithms:
from sklearn.cluster import HDBSCAN, DBSCAN
from sklearn.mixture import BayesianGaussianMixture

import matplotlib.pyplot as plt
# import plotly
# import plotly.express as px
# import plotly.graph_objects as go

## Parsing

In [None]:
# Reading the toy dataset, and showing a sample of rows.
#   DOWNLOAD=True           -> fetch the public CSV, normalise its columns and cache it locally
#   DOWNLOAD=False, PROD=False -> read the local cache written by a previous download
#   DOWNLOAD=False, PROD=True  -> read the production file
DOWNLOAD = True
PROD = False
if DOWNLOAD:
    df = pd.read_csv('https://people.rit.edu/fa3019/technical/data/maintnet_aviation_dataset_deidentified.csv', index_col='IDENT')
    n = len(df)
    # The raw file carries its text in PROBLEM / ACTION; the rest of the notebook
    # works with Inspection / Maintenance, so rename instead of self-assigning
    # columns that do not exist yet.
    df = df.rename(columns={'PROBLEM': 'Inspection', 'ACTION': 'Maintenance'})
    # Placeholder cost column: uniform random values in [0, 10).
    df['TimeCost'] = np.random.rand(n) * 10
    df.to_csv('./data/maintnet.csv')
elif not PROD:
    # Restore the same IDENT index that the download branch uses, so both paths
    # yield identically shaped frames.
    df = pd.read_csv('./data/maintnet.csv', index_col='IDENT')
else:
    df = pd.read_csv('./data/acn.csv')

df.head()

In [None]:
# Build one gensim corpus per text column of the dataframe.
# The inspection and maintenance documents are processed independently so that
# each class can be analysed on its own later.
def read_documents(s: pd.Series, tokens_only=False):
    """Lazily tokenize every entry of `s`.

    Parameters
    ----------
    s : pd.Series
        Text entries; each one is lower-cased and passed through
        `gensim.utils.simple_preprocess`.
    tokens_only : bool, default False
        When True, yield the plain token lists. Otherwise yield
        `TaggedDocument` objects tagged with the entry's position in `s`.

    Yields
    ------
    list of str or gensim.models.doc2vec.TaggedDocument
    """
    for position, text in enumerate(s):
        words = gensim.utils.simple_preprocess(text.lower())
        if not tokens_only:
            # Training data needs a tag; the positional index serves as one.
            words = gensim.models.doc2vec.TaggedDocument(words, [position])
        yield words

documents_insp = list(read_documents(df.Inspection))
documents_main = list(read_documents(df.Maintenance))

## Embedding

In [None]:
# Using Doc2Vec, represent each document (each row in a column) as a vector.
# Two models are learned, one for inspection and one for maintenance documents.
VECTOR_SIZE = 32
def make_model(corpus, vector_size=VECTOR_SIZE, min_count=2, epochs=40):
    """Train a Doc2Vec model on a corpus of tagged documents.

    Parameters
    ----------
    corpus : list of gensim.models.doc2vec.TaggedDocument
        Training documents. The corpus is traversed more than once (vocabulary
        build, then training), so it must be re-iterable, e.g. a list.
    vector_size : int
        Dimensionality of the learned document vectors.
    min_count : int
        Passed to Doc2Vec as `min_count`.
    epochs : int
        Number of training passes over the corpus.

    Returns
    -------
    gensim.models.doc2vec.Doc2Vec
        The trained model.
    """
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model

# Train and persist one model per document class.
model_insp = make_model(documents_insp)
model_insp.save('./bin/doc2vec_insp')
model_main = make_model(documents_main)
model_main.save('./bin/doc2vec_main')

In [None]:
# Look up the inspection documents closest to a free-text query. The tag of a
# matching inspection document is its row position, so the maintenance action
# recorded for the same row can be shown next to it.
inspection_report = 'engine not starting'
query_tokens = gensim.utils.simple_preprocess(inspection_report.lower())
vector = model_insp.infer_vector(query_tokens)
sims = model_insp.dv.most_similar([vector], topn=10)
print('Most similar records:\n')
for idx, score in sims:
    record = df.iloc[idx]
    print('INSP:', record.Inspection)
    print('MAIN:', record.Maintenance)
    print()

## LSI

In [None]:
from gensim.models import LsiModel
from gensim.models import TfidfModel
from gensim import corpora

# Inspection documents: tokens -> bag-of-words -> TF-IDF -> LSI.
texts = [d.words for d in documents_insp]
dictionary = corpora.Dictionary(texts)
corpus_insp = [dictionary.doc2bow(t) for t in texts]
tfidf = TfidfModel(corpus=corpus_insp)
corpus_tfidf = tfidf[corpus_insp]

# Fit LSI on the TF-IDF corpus, i.e. on the same representation it is applied
# to on the next line. Fitting on raw counts and then transforming TF-IDF
# weights would project vectors the model was never trained on.
lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary)
corpus_lsi = lsi[corpus_tfidf]

In [None]:
# Peek at the LSI representation of the first inspection document.
next(iter(corpus_lsi))

## Clustering

In [None]:
# Use a manifold learning algorithm to reduce dimensionality for each document class.
# Each class gets its OWN estimator: refitting a single shared instance would
# leave `predict_embedding` bound to the maintenance manifold, while it is used
# downstream to project new *inspection* text.
DO_MANIFOLD = True
if VECTOR_SIZE > 3 and DO_MANIFOLD:
    embedding_insp = Isomap(n_neighbors=5, n_components=3, metric='cosine')
    embedding_main = Isomap(n_neighbors=5, n_components=3, metric='cosine')
    x_insp = embedding_insp.fit_transform(model_insp.dv.vectors)
    x_main = embedding_main.fit_transform(model_main.dv.vectors)
    # Projects unseen inspection vectors into the reduced inspection space.
    predict_embedding = embedding_insp.transform
else:
    # No reduction: work directly on copies of the Doc2Vec vectors.
    x_insp = model_insp.dv.vectors.copy()
    x_main = model_main.dv.vectors.copy()
    predict_embedding = lambda x: x

In [None]:
# Cluster the embeddings: L2-normalise every vector, then run HDBSCAN on the
# normalised points. Inspection and maintenance only differ in the minimum
# cluster size.
def _make_cluster_pipeline(min_cluster_size):
    """Return an unfitted Normalizer -> HDBSCAN (euclidean) pipeline."""
    steps = [
        ('normalization', Normalizer()),
        ('clustering', HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')),
    ]
    return Pipeline(steps)

pipe_insp = _make_cluster_pipeline(100)
y_insp = pipe_insp.fit_predict(x_insp)

pipe_main = _make_cluster_pipeline(50)
y_main = pipe_main.fit_predict(x_main)

In [None]:
# `go` is imported here so the cell runs on a fresh kernel (the only other
# import of it lives in a later cell).
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def _cluster_scatter(points, labels, text, name):
    """Build a 3-D marker trace of `points` coloured by cluster `labels`.

    `points` must have at least three columns (used as x, y, z); `text` is the
    per-point hover text and `name` the trace name shown in the legend.
    """
    return go.Scatter3d(
        x=points[:, 0], y=points[:, 1], z=points[:, 2], text=text,
        name=name,
        mode='markers',
        marker=dict(
            size=2,
            color=labels,
            colorscale='Viridis',
            opacity=0.8
        ),
    )

# Side-by-side 3-D views: inspection clusters on the left, maintenance clusters
# on the right.
fig = make_subplots(rows=1, cols=2, specs=[[dict(type='scene'), dict(type='scene')]])

fig.add_trace(_cluster_scatter(x_insp, y_insp, df.Inspection, 'Inspection'), row=1, col=1)

# The maintenance panel is drawn in the maintenance embedding (x_main), matching
# its labels (y_main) and hover text.
fig.add_trace(_cluster_scatter(x_main, y_main, df.Maintenance, 'Maintenance'), row=1, col=2)

fig.update_layout(height=600, width=1200, title_text="")
fig.show()

## Bayesian learning

In [None]:
# Discretize the data: every record is represented by the cluster id of its
# inspection text ('insp') and of its maintenance text ('main').
cluster_ids = dict(insp=y_insp, main=y_main)
ddf = pd.DataFrame(cluster_ids, index=df.index)
ddf.head()

In [None]:
# Learn a bayesian model, assuming a fixed node structure: a single edge whose
# endpoints are the columns of `ddf`, in column order.
import bnlearn as bn

edges = [list(ddf.columns)]
dag = bn.make_DAG(edges)

# Parameter learning on the fixed structure, followed by an independence test
# of its edges (nothing is pruned).
model = bn.independence_test(
    bn.parameter_learning.fit(dag, ddf),
    ddf,
    prune=False,
)
bn.plot(model, interactive=False, params_static=dict(width=4, height=4))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN classifier fitted on the reduced inspection vectors and their cluster
# labels; it assigns a cluster id to vectors that were not part of the fit.
kn = KNeighborsClassifier()
kn.fit(x_insp, y_insp)

def vectorize(x):
    """Infer a vector for raw text `x` with the inspection Doc2Vec model."""
    tokens = gensim.utils.simple_preprocess(x.lower())
    return model_insp.infer_vector(tokens)

def predict_insp_cluster(text, vectorize=vectorize, embed=predict_embedding, cluster=kn.predict):
    """Return the inspection cluster id predicted for a piece of free text.

    The text is vectorized, the vector is wrapped in a one-element batch and
    passed through `embed`, and `cluster` maps the result to a label. The
    defaults are bound when this function is defined, so re-run this cell after
    changing any of them.
    """
    batch = [vectorize(text)]
    predicted = cluster(embed(batch))
    return predicted[0]

In [None]:
# Example: inspection cluster id predicted for an unseen fault description.
predict_insp_cluster('gasket is leaking')

In [None]:
import plotly.graph_objects as go

# Contour plot of the conditional probability table of 'main'.
m = model['model']
# Select the CPD by node name rather than by its position in `m.cpds`, so the
# plot always matches its title regardless of the order CPDs were stored in.
t = m.get_cpds('main')
fig = go.Figure(data =
    go.Contour(
        z=t.values,
        x=sorted(set(ddf.insp)),
        y=sorted(set(ddf.main))
    ))
fig.update_layout(width=500, height=500, xaxis_title='Insp', yaxis_title='Main', title='Conditional Probability distribution P(main | insp)')
fig.show()

In [None]:
# Inference: fix the inspection node 'insp' to the cluster predicted for a
# fault description and query the model for the maintenance node 'main'.
evidence = dict(insp=predict_insp_cluster('gasket is leaking'))
res = bn.inference.fit(
    model,
    variables=['main'],
    evidence=evidence,
    verbose=0,
)

In [None]:
# plotly.express is imported here because no earlier cell brings `px` into scope.
import plotly.express as px

# Draw 10 samples from the inference result and plot how often each value occurs.
examples = res.sample(10).values.flatten()
fig = px.histogram(examples)
fig.update_layout(width=600, height=300, xaxis_title='Maintainance action cluster')

In [None]:
# For every maintenance cluster that appears in `examples`, print as many
# randomly chosen maintenance records from that cluster as the number of times
# the cluster was drawn.
# Cluster membership comes from the fitted *maintenance* pipeline; its labels
# are aligned with the rows of `df` by position.
labels = pipe_main.named_steps['clustering'].labels_
unique_labels, counts = np.unique(examples, return_counts=True)
for l, c in zip(unique_labels, counts):
    print('\nCluster %d, %d samples\n================' % (l,c))
    # Row positions of the dataset records assigned to cluster `l` (not
    # positions inside the small `examples` array).
    idx = np.flatnonzero(labels == l)
    if idx.size == 0:
        # No record carries this label; nothing to show.
        continue
    idx = np.random.choice(idx, size=c)
    for i in idx:
        print(df.Maintenance.iloc[i])