## Load Libraries

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import pandas as pd

from nltk.corpus import stopwords

from many_stop_words import get_stop_words

from nltk.stem import PorterStemmer #Stemmer
from textblob import Word #Lemmatize

import re
import nltk

# Define some functions
import srex


## Initialize some objects / variables

In [2]:
# Porter stemmer instance shared by text_transformations()
st = PorterStemmer()
# English stop words from NLTK (a small collection)
stop_words_list = stopwords.words('english')
# English stop words from many_stop_words (a bigger collection)
newStopWords = get_stop_words('en')
# Merge both sources into one list.
# NOTE(review): extend() does not de-duplicate, so words present in both
# sources presumably appear twice -- harmless for membership tests.
stop_words_list.extend(newStopWords)

## Define functions

def text_transformations(parragraph, stopwords=True, lema=True, stem=True):
    """Normalize a text fragment and return it as a space-separated string.

    The text is lower-cased, tokenized with ``nltk.word_tokenize`` and every
    token that is not purely alphanumeric is discarded (this is what removes
    the punctuation).  The optional steps are then applied in this order:

    stopwords -- drop the tokens found in the module-level ``stop_words_list``.
    lema      -- lemmatize each token with ``Word(token).lemmatize()``.
    stem      -- stem each token with the module-level stemmer ``st``.

    Inside this function the ``stopwords`` flag shadows the
    ``nltk.corpus.stopwords`` import; only the flag is used here.
    """
    words = [
        token
        for token in nltk.word_tokenize(parragraph.lower())
        if token.isalnum()
    ]

    if stopwords:
        words = [token for token in words if token not in stop_words_list]

    if lema:
        words = [Word(token).lemmatize() for token in words]

    if stem:
        words = [st.stem(token) for token in words]

    # Every token goes through str() before being joined with single spaces.
    return ' '.join(str(token) for token in words)

# Build a dictionary that maps each term of the document to the list of its positions
def get_term_positions_dict(document):
    """Map every term of ``document`` to the list of positions where it occurs.

    The document is split with the tokenizer built by a default
    ``CountVectorizer``; positions are 0-based indexes into the resulting
    token sequence.  The result is a ``defaultdict(list)``, so indexing it
    with an absent term returns (and stores) an empty list.
    """
    tokenize = CountVectorizer().build_tokenizer()
    positions_by_term = defaultdict(list)
    for position, term in enumerate(tokenize(document)):
        positions_by_term[term].append(position)
    return positions_by_term

# Quick check on a single document: 'bar' is expected at positions 0, 1 and 3
document = "bar bar baz bar guu hee"
doc_term_positions = get_term_positions_dict(document)
print(doc_term_positions)

# Calculate a matrix containing the term positions for a group (list) of documents
def get_documents_positions_matrix(documents):
    """Return one term-positions dictionary per document, in input order.

    Each element is the result of ``get_term_positions_dict`` for the
    document found at the same index of ``documents``.
    """
    return [get_term_positions_dict(text) for text in documents]

# Small toy collection used to try out the positions matrix
document1 = "foo bar baz"
document2 = "bar bar baz dee"
document3 = "bar bar baz dee guu"
document4 = "bar bar baz bar guu hee"
documents = [document1, document2, document3, document4]
# One term -> positions dictionary per document, in the same order as `documents`
term_positions_matrix = get_documents_positions_matrix(documents)
print(term_positions_matrix)

# Compare the positions vectors of two terms, and return the list of distances of the terms that are inside limit_distance
def calculate_TermPositions_distances(term_positions1, term_positions2, limit_distance):
    """Return the absolute distances between close occurrences of two terms.

    Every position of ``term_positions1`` is compared with every position of
    ``term_positions2``; the absolute difference is kept when it is less than
    or equal to ``limit_distance``.  Distances are listed in iteration order
    (outer: ``term_positions1``, inner: ``term_positions2``).
    """
    all_gaps = (
        abs(first - second)
        for first in term_positions1
        for second in term_positions2
    )
    return [gap for gap in all_gaps if gap <= limit_distance]
# Pairs within distance 2: (1,2)=1, (2,2)=0, (10,8)=2, (20,22)=2 -> [1, 0, 2, 2]
print(calculate_TermPositions_distances([1,2,10,20,30], [2,8,22,27,33], 2))
# Pairs within distance 3: (1,2)=1, (10,8)=2, (20,22)=2, (30,27)=3, (30,33)=3 -> [1, 2, 2, 3, 3]
print(calculate_TermPositions_distances([1,10,20,30], [2,8,22,27,33], 3))

# Calculates the vicinity of a term in a document
# where:
# document_positions_dict : is a dictionary with the positions of all terms in a document
# reference_term : is the term used as reference for calculating which terms are in its vicinity
# limit_distance : is the maximal distance between terms used to calculate the vicinity
# sumarize : selects the function ('mean' or 'median') used to summarize the distances of the terms in the vicinity

def get_document_term_vecinity_dict(document_positions_dict, reference_term, limit_distance, sumarize='none', include_reference_term=True):
    """Return the terms found in the vicinity of ``reference_term`` in a document.

    Parameters
    ----------
    document_positions_dict
        Mapping of each term of the document to the list of its positions.
        It is only read, never modified.
    reference_term
        Term whose neighborhood is computed.
    limit_distance
        Maximum distance (inclusive) between an occurrence of the reference
        term and an occurrence of another term for them to be neighbors.
    sumarize
        ``'mean'`` or ``'median'`` stores the mean/median of the distances
        of each neighbor; any other value stores the raw list of distances.
    include_reference_term
        When false, the reference term itself is left out of the result.

    Returns
    -------
    dict
        Neighbor term -> summarized distance(s).  Empty when the reference
        term does not occur in the document.
    """
    # .get() instead of indexing: indexing a defaultdict with an absent term
    # would insert an empty entry into the caller's dictionary, and indexing
    # a plain dict would raise KeyError.
    reference_term_positions = document_positions_dict.get(reference_term, [])
    if len(reference_term_positions) == 0:
        # Without occurrences of the reference term nothing can be near it.
        return {}

    vecinity_dict = {}
    for term, term_positions in document_positions_dict.items():
        # Skip comparing the reference term with itself unless requested.
        if term == reference_term and not include_reference_term:
            continue

        # NOTE(review): when the reference term occurs more than once, every
        # occurrence is compared with every position of `term`, so the same
        # neighbor can contribute several distances (and the reference term
        # compared with itself contributes the distances between its own
        # occurrences besides the zeros).  Original author flagged this as
        # a known problem still to be reviewed.
        neighborhood_positions = calculate_TermPositions_distances(reference_term_positions, term_positions, limit_distance)
        if len(neighborhood_positions) == 0:
            continue

        if sumarize == 'mean':
            vecinity_dict[term] = np.mean(neighborhood_positions)
        elif sumarize == 'median':
            vecinity_dict[term] = np.median(neighborhood_positions)
        else:
            vecinity_dict[term] = neighborhood_positions

    return vecinity_dict

def get_vecinity_matrix(document_positions_matrix, reference_term, limit_distance, sumarize, include_reference_term):
    """Return the vicinity dictionary of ``reference_term`` for every document.

    ``document_positions_matrix`` is a sequence of term-positions
    dictionaries; each one is passed, together with the remaining arguments,
    to ``get_document_term_vecinity_dict``.  Results keep the input order.
    """
    return [
        get_document_term_vecinity_dict(
            positions_dict,
            reference_term,
            limit_distance,
            sumarize,
            include_reference_term,
        )
        for positions_dict in document_positions_matrix
    ]

def get_collection_vecinity(document_list, reference_term, limit_distance, sumarize_positions, sumarize_vecinity, include_reference_term):
    """Summarize the vicinity of ``reference_term`` over a document collection.

    Builds the positions matrix of ``document_list``, computes the vicinity
    of ``reference_term`` in every document, prints the resulting DataFrame
    (one row per document, one column per neighbor term) and prints the mean
    of every column sorted in ascending order.

    Parameters
    ----------
    document_list
        Documents (strings) of the collection.
    reference_term, limit_distance, include_reference_term
        Forwarded to ``get_vecinity_matrix``.
    sumarize_positions
        Per-document summary forwarded to ``get_vecinity_matrix``.
        NOTE(review): the column mean presumably needs numeric cells, i.e.
        'mean' or 'median' here -- confirm before passing another value.
    sumarize_vecinity
        Currently unused: the collection summary is always the mean.

    Returns
    -------
    pandas.Series
        Mean distance per term, sorted ascending (previously the function
        only printed this value and returned ``None``).
    """
    document_positions_matrix = get_documents_positions_matrix(document_list)
    vecinity_matrix = get_vecinity_matrix(document_positions_matrix, reference_term, limit_distance, sumarize_positions, include_reference_term)
    df = pd.DataFrame.from_dict(vecinity_matrix)
    print(df)
    # Non in-place sort so the result can be handed back to the caller.
    df_summarized = df.mean().sort_values(ascending=True, kind='quicksort')
    print(df_summarized)
    return df_summarized

# Testing

## Define some documents

In [3]:
# Toy collection of four short documents used by the tests below
document1 = "foo bar baz"
document2 = "bar bar baz dee"
document3 = "bar bar baz dee guu"
document4 = "bar bar baz bar guu hee"
documents = [document1, document2, document3, document4]

## Get document positions matrix

In [4]:
# Build the term positions of every document and print the matrix just computed
documents_positions_matrix = get_documents_positions_matrix(documents)
print(documents_positions_matrix)

NameError: name 'get_documents_positions_matrix' is not defined

## Calculate the vecinity matrix for the term 'bar'

In [None]:
# Mean distance (limit 2) from 'bar' to each nearby term, per document;
# 'bar' itself is included in the result
vecinity_matrix = get_vecinity_matrix(documents_positions_matrix, 'bar', 2, 'mean', True)
print(vecinity_matrix)

## Calculates the vicinity DataFrame for the term 'bar'

In [None]:
# One row per document, one column per term found near 'bar'
print(pd.DataFrame.from_dict(vecinity_matrix))

## Calculates the mean distance (graph) of the terms vecinity considering all documents

Here all documents have the same weight.  
For the next version I have to build mean distances considering the document ranking.  
That means the first documents of the ranking have more weight than the last ones.

In [None]:
# Mean of each term's column over all documents (every document weighs the same)
print(pd.DataFrame.from_dict(vecinity_matrix).mean())

# COMENTARIOS

## 1. Cómo calcular grafos de distancias a partir de los términos en un documento

La idea es calcular una vecindad de los términos de búsqueda dentro de un documento. Es decir, qué términos se encuentran cerca de los términos de búsqueda en un documento determinado.

### Texto Ejemplo
Supongamos el siguiente documento ejemplo donde calcularemos el grafo asociado al término de búsqueda *'languages'*. 

In [None]:
# Sample document; adjacent string literals are concatenated exactly as written
# (some sentences deliberately keep no space after their final period).
doc = (
    "The European languages are members of the same family."
    "Their separate existence is a myth. For science, music, sport, etc, Europe uses the same vocabulary. "
    "The languages only differ in their grammar, their pronunciation and their most common words."
    "Everyone realizes why a new common language would be desirable: one could refuse to pay expensive translators."
    "To achieve this, it would be necessary to have uniform grammar, pronunciation and more common words."
    "If several languages coalesce, the grammar of the resulting language is more simple and regular than that of the individual languages."
    "The new common language will be more simple and regular than the existing European languages."
    "It will be as simple as Occidental; in fact, it will be Occidental."
    "To an English person, it will seem like simplified English, as a skeptical Cambridge friend of mine told me what Occidental is."
    "The European languages are members of the same family. "
    "Their separate existence is a myth. For science, music, sport, etc, Europe uses the same vocabulary. "
    "The languages only differ in their grammar, their pronunciation and their most common words. "
    "Everyone realizes why a new common language would be desirable: one could refuse to pay expensive translators."
)

###  **Segmentamos** el documento en un arreglo de párrafos, según su puntuación.
En general existen algunas dificultades para el cálculo de los grafos cuando los documentos tienen distintos tamaños. Esto puede ser enfrentado considerando subconjuntos de documentos correspondientes a los párrafos, generados por puntos seguidos y puntos aparte.

In [None]:
# Every '.' ends a fragment; since doc ends with a period, the last
# element of the list is an empty string
parragraphs_list = doc.split('.')
print(parragraphs_list)

### Aplicamos algunas funciones de **depuración de textos** (stop word, stemming, lemma, etc.), utilizando la función **text_transformations()**

In [None]:
# Clean every paragraph: stop words removed, lemmatized, no stemming
processed_parragraphs_list = [
    text_transformations(fragment, stopwords=True, lema=True, stem=False)
    for fragment in parragraphs_list
]
print(processed_parragraphs_list)

### Calculamos la **matriz de posiciones**

In [None]:
# One term -> positions dictionary per processed paragraph
doc_pos_matrix = get_documents_positions_matrix(processed_parragraphs_list)
print(doc_pos_matrix)

**Matriz de posiciones del documento 6**

In [None]:
# Term positions of the paragraph stored at index 6 (the seventh fragment)
print(doc_pos_matrix[6])

### Calculamos la **vecindad** para el término _'language'_

In [None]:
# Mean distance (limit 5) from 'language' to each nearby term, per paragraph;
# 'language' itself is excluded from the result
vecin_matrix = get_vecinity_matrix(doc_pos_matrix, 'language', 5, 'mean', False)
print(vecin_matrix)

**Print Vecinity Matrix as DataFrame**

In [None]:
# One row per paragraph, one column per term found near 'language'
df_vecin_matrix = pd.DataFrame.from_dict(vecin_matrix)
print(df_vecin_matrix)

### Calculates the **mean distance** (graph) of the terms vecinity considering all documents

In [None]:
# Mean of each term's column over all paragraphs
print(pd.DataFrame.from_dict(vecin_matrix).mean())

### Distancia Límite de Cálculo
Para calcular la vecindad de un término de búsqueda definimos una distancia límite de cálculo (*limit_distance*). Por ejemplo, si *limit_distance*=5, el cálculo de la vecindad considerará sólo hasta 5 términos de distancia desde la posición del término de búsqueda.  
Con esto se evita que se realicen *comparaciones cruzadas* entre términos de búsqueda que se encuentran distribuidos a lo largo del documento.

## 2. Matrices de Distancias entre Términos

* La matriz de vecindad que define la distancia entre un término de referencia (búsqueda) y los demás términos del documento tiene las siguientes características:
  * Los términos no presentes en el documento, que aparecen en la matriz con el valor *NaN*, pueden ser considerados con distancia _infinita_ al término de referencia.
  * Los términos de referencia (búsqueda) aparecen en la matriz con distancia *cero*. También se puede modificar el algoritmo para que no aparezcan en la matriz.


In [None]:
# This is the distance matrix for the term 'bar'
print(pd.DataFrame.from_dict(vecinity_matrix))

## 3. Dónde aplicar los Grafos

Existen varias alternativas donde aplicar los grafos:
1. Definir un grafo por el ranking
  * Un problema con esta alternativa es como asociar cambios en el grafo a un nuevo ranking
2. Definir un grafo por documento
3. Definir un grafo por párrafo
  * Esto elimina en parte el problema que generan los documentos de tamaño distinto
  
Probaremos la alternativa (3) de definir un grafo por párrafo, para luego explorar una forma de integrar la información de los grafos que componen el documento en un solo grafo.

## 4. ¿Cómo operar sobre Grafos?

### Sumar grafos

Para sumar dos grafos asociados a un documento hay que considerar 
1. Qué hacer con los términos iguales
2. Qué hacer con los términos distintos
3. etc

### Calcular el valor medio

Para calcular el valor medio de un grupo de grafos hay que considerar:
1. Cada grafo puede tener una ponderación distinta
2. Los grafos pueden tener palabras distintas
3. etc.

### Comparar la similitud 

Tenemos que definir un indicador de similitud entre grafos

In [None]:
# Sample string used to try out a position lookup
s="XDBABDCACXC"

In [None]:
# 0-based positions of every "A" character in s
indices = [i for i, x in enumerate(s) if x == "A"]

In [None]:
# Bare expression: the notebook displays the list of positions
indices