# Explanatory Anomaly Detection with ConceptNet and GloVe.

This notebook demonstrates the reasonableness monitor and explanation-based anomaly detection using ConceptNet and GloVe.

## Imports and Setups

In [1]:
#Imports Cell
from collections import defaultdict
import numpy as np
import gensim
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Location of the GloVe vectors on disk; edit this for your machine.
# NOTE(review): the file name suggests 50-dimensional GloVe 6B vectors already
# converted to word2vec format -- confirm against the actual file.
path = "~/Dropbox (MIT)/car-can-explain/glove.6B.50d.txt.w2v"

# Load the vectors into the module-level variable "glove" (may take some time).
# binary=False tells gensim to read the file as text rather than binary.
glove = KeyedVectors.load_word2vec_format(path, binary=False)

In [6]:
# For the reasonableness monitor
import commonsense.conceptnet as kb
import monitor.reasonableness_monitor as monitor
import synthesizer

## GloVe Embedding Helper Functions

In [None]:
def convert_to_embeddings(domain:set) -> list:
    """
    Look up the GloVe vector of every term in the domain.
    
    Parameters
    -------------
    domain: set
    Iterable of terms (strings) to look up in the global ``glove`` model.
    Note that a bare string is iterated character by character.
    
    Returns
    ---------
    list with one ``glove[term]`` entry per term, in iteration order
    """
    vectors = []
    for term in domain:
        vectors.append(glove[term])
    return vectors

In [None]:
# Function based on all the computations above
def calcuate_distances(label_set_a:list, 
                       label_set_b:list) -> np.ndarray:
    """
    Compute nearest-neighbour Euclidean distances between two sets of vectors.
    
    All pairwise distances between the vectors of the two sets are computed,
    then the resulting matrix is reduced with ``min`` along its shorter axis.
    The result therefore holds, for every vector of the larger set, the
    distance to its closest vector in the smaller set. When both sets have the
    same size the reduction runs over ``label_set_a``.
    
    Parameters
    -------------    
    label_set_a : list 
            the first set of embedding vectors (one vector per entry)
    label_set_b : list
            the second set of embedding vectors (one vector per entry)
    
    Returns
    ---------
    numpy.ndarray
        1-D array of distances, length = max(len(label_set_a),len(label_set_b))
    """
    
    first = np.array(label_set_a)
    second = np.array(label_set_b)
    
    # Squared norms; the first set becomes a column so it broadcasts against
    # the row of squared norms of the second set.
    sq_norms_first = (first ** 2).sum(axis=1)[:, np.newaxis]
    sq_norms_second = (second ** 2).sum(axis=1)
    
    # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x.y for every pair (x, y)
    sq_dists = sq_norms_first + sq_norms_second - 2 * (first @ second.T)
    
    # Tiny (possibly negative, from rounding) values are snapped to zero so the
    # square root stays well defined.
    too_small = sq_dists < 1e-6
    sq_dists[too_small] = float(0.0)
    dists = np.sqrt(sq_dists)
    
    # Collapse the shorter axis, keeping one minimum per entry of the longer one.
    shorter_axis = np.argmin(dists.shape)
    return dists.min(axis=shorter_axis)

In [None]:
def gen_domain(list_domain:list, depth:int) -> set:
    """
    Expand a list of seed terms into the set of terms that represents a domain.
    
    The expansion is a breadth-first search over GloVe neighbours: layer 1 is
    the seed terms themselves, layer 2 adds the terms returned by
    ``glove.similar_by_vector`` for each seed, and so on. Each term is expanded
    at most once, and the neighbours of the final layer are never queried
    because they would not be part of the result.
    
    
    Parameters
    -------------
    list_domain: list
    Iterable of string terms that seed the domain
    
    depth: int
    How many layers should be used to generate the domain. A depth of zero
    or less yields an empty set.
    
    Returns
    ---------
    Set of strings of the terms that we should get the glove embeddings for
    
    Raises
    ---------
    KeyError (from the ``glove`` lookup) if a term that has to be expanded is
    not in the model's vocabulary
    """
    
    domain_terms = set() #Every term collected so far
    frontier = set(list_domain) #Terms reached at the current layer
    
    #range() is empty for depth <= 0, so a negative depth cannot loop forever
    for layer in range(depth):
        
        #Only terms we have not seen before need to be added and expanded
        new_terms = frontier - domain_terms
        if not new_terms:
            break
        domain_terms |= new_terms
        
        #The last layer is only collected; its neighbours would be discarded
        if layer == depth - 1:
            break
        
        #Gather the most similar words of every new term for the next layer
        frontier = set()
        for term in new_terms:
            frontier.update(neighbour for neighbour, _ in glove.similar_by_vector(glove[term]))
    
    return domain_terms

In [None]:
def to_df(concepts, domains):
    """
    Takes in a list of concepts and a list of domains and outputs a pandas dataframe
    of concept-domain glove score.
    
    The domain terms are expanded with ``gen_domain(domains, 2)`` and every
    concept is scored against each resulting domain word with
    ``calcuate_distances``.
    
    Parameters
    -------------
    concepts: list
    Concepts to score. Each entry is either a single term (string) or a
    hashable collection of terms such as a tuple.
    
    domains: list
    Seed terms that define the domain
    
    Returns
    ---------
    pandas.DataFrame with columns 'Word1' (the concept), 'Relation' (always
    'glove'), 'Word2' (a domain word) and 'Score' (the distance). The frame is
    empty when ``concepts`` is empty.
    """
    
    domain = list(gen_domain(domains, 2))
    domain_embeddings = convert_to_embeddings(domain)
    
    concept_dom_score = {}
    for concept in concepts:
        # A bare string is ONE term; handing it over unwrapped would embed its
        # individual characters instead of the word itself.
        terms = [concept] if isinstance(concept, str) else concept
        distances = calcuate_distances(convert_to_embeddings(terms), domain_embeddings)
        
        # NOTE(review): pairing by position assumes one distance per domain
        # word, which holds while the concept has no more terms than the domain.
        concept_dom_score[concept] = [[domain[j], score] for j, score in enumerate(distances)]
    
    d = {'Word1':[], 'Relation':[], 'Word2':[], 'Score':[]}
    
    for concept, scored_words in concept_dom_score.items():
        for domain_word, score in scored_words:
            d['Word1'].append(concept)
            d['Relation'].append('glove')
            d['Word2'].append(domain_word)
            d['Score'].append(score)
    
    return pd.DataFrame(data=d)

## Demo Integration

In [8]:
cn = kb.ConceptNet()

# Scenario to explain. Keep exactly one description/concepts pair active; swap
# which pair is commented out to try the other scenario.
#description = 'penguin eats food'
#concepts = ['penguin', 'food']
description = 'penguin crossing the street'
concepts = ['penguin', 'street']

# Build the ConceptNet fact table for the concepts; the bare name on the last
# line makes the notebook display it.
commonsense_facts = cn.build_df(concepts)
commonsense_facts

NameError: name 'find_anchor_with_score' is not defined

In [None]:
# Ask the ConceptNet helper module for the domain of the facts built above; the
# notebook displays whatever the call returns.
# NOTE(review): the return type is not visible from this notebook -- confirm in commonsense.conceptnet.
kb.get_domain(commonsense_facts)

In [None]:
# filtered = commonsense_facts[commonsense_facts['Word1']=='a penguin']

# kb.get_domain(filtered)

In [None]:
# Score the concepts against one GloVe domain. Only one call should be active:
# each call expands the whole domain, and a second assignment would simply
# overwrite the first result.
#glove_facts = to_df(concepts, ['zoo'])
glove_facts = to_df(concepts, ['street'])
glove_facts

In [None]:
# Argumentation module of the project's synthesizer package.
import synthesizer.argue as debate

# Hand both fact tables (ConceptNet and GloVe) plus the scenario description to
# the challenge routine.
# NOTE(review): what challenge() returns or prints is not visible here -- confirm in synthesizer.argue.
debate.challenge(commonsense_facts, glove_facts, description)