<a href="https://colab.research.google.com/github/jhermosillo/keyword_extraction/blob/main/API_TextRank_GFractal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Módulos necesarios

In [1]:
!pip install deplacy
!pip install spacy

Collecting deplacy
  Downloading deplacy-2.0.2-py3-none-any.whl (22 kB)
Installing collected packages: deplacy
Successfully installed deplacy-2.0.2


In [2]:
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting es_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2 MB)
[K     |████████████████████████████████| 16.2 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


In [1]:
import importlib

import pkg_resources

# Re-import pkg_resources so packages installed by the cells above become
# visible without restarting the kernel. `imp` is deprecated (and removed in
# Python 3.12); `importlib.reload` is its supported replacement.
importlib.reload(pkg_resources)

<module 'pkg_resources' from '/usr/local/lib/python3.7/dist-packages/pkg_resources/__init__.py'>

### Librerías necesarias para los algoritmos

In [3]:
from math import *
from math import sqrt
import string
import operator
import random
import pandas as pd
#librerias necesarias para text rank
from collections import OrderedDict
import numpy as np
import spacy

#Listado de STOPWORDS dependiendo del lenguaje
from spacy.lang.en.stop_words import STOP_WORDS
#from spacy.lang.es.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load('es_core_news_sm')

# Algoritmo TextRank

In [4]:
class TextRank4Keyword():
    """Extract keywords from text with a TextRank (PageRank-style) ranking.

    The class relies on two module-level names defined elsewhere in the
    notebook: the spaCy pipeline ``nlp`` and the ``STOP_WORDS`` set.
    Call :meth:`analyze` first and then :meth:`get_keywords`.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # maximum number of iterations
        self.node_weight = None  # word -> weight mapping, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag STOP_WORDS plus the given extra words as stop words in ``nlp.vocab``."""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Keep, per sentence, only the non-stop-word tokens whose POS tag is in ``candidate_pos``.

        Returns a list with one list of words per sentence of ``doc``; words
        are lower-cased only when ``lower`` is exactly ``True``.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store only words with a candidate POS tag that are not stop words
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the ordered, de-duplicated list of (word, later_word) pairs.

        Each word is paired with the following ``window_size - 1`` words of
        the same sentence.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list keeps first-seen order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, min(i + window_size, len(sentence))):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the symmetric co-occurrence matrix and normalize it by column."""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be supplied together with
        # `where`: otherwise the columns whose sum is 0 (words that never
        # co-occur with another word) are left as uninitialized memory.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Return the ``number`` highest-weighted keywords as a dict.

        The dict maps word -> weight and is ordered from highest to lowest
        weight. If fewer than ``number`` words were ranked, all of them are
        returned. :meth:`analyze` must have been called beforehand.
        """
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        # Comparing before converting keeps float('inf') usable as "no limit"
        if number >= len(ranked):
            limit = len(ranked)
        else:
            limit = max(int(number), 0)
        return dict(ranked[:limit])

    def analyze(self, text,
                candidate_pos=['NOUN', 'PROPN'],
                window_size=4, lower=False, stopwords=list()):
        """Rank the candidate words of ``text`` and store the result in ``self.node_weight``.

        text          -- raw text, parsed with the module-level ``nlp`` pipeline
        candidate_pos -- POS tags of the tokens kept as keyword candidates
        window_size   -- co-occurrence window (see get_token_pairs)
        lower         -- lower-case the candidates when exactly True
        stopwords     -- extra stop words added to STOP_WORDS

        The default arguments are never mutated here.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text with spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization of the weights (pagerank values)
        pr = np.ones(len(vocab))

        # Iterate until the total weight changes by less than min_diff
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = sum(pr)
            if abs(previous_pr - total) < self.min_diff:
                break
            previous_pr = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}
        

# Algoritmo Grado de Fractalidad

In [224]:
# The degree of fractality is only computed for words whose frequency is greater than one
def fractalidad(palabras,vocabulario,frec,dist):
    """Compute the degree of fractality of the repeated words of a text.

    palabras    -- every token of the text, in order (only its length is used)
    vocabulario -- candidate words
    frec        -- word -> number of occurrences
    dist        -- word -> positions of its occurrences

    Only words with frequency > 1, longer than one character and absent from
    STOP_WORDS are scored. For each of them and for every box size ``s`` from
    1 to the text length, the number of distinct boxes ``ceil(position / s)``
    holding an occurrence is compared with a reference count, and
    ``|log(reference / occupied)|`` is accumulated.

    Returns a dict word -> accumulated value.
    """
    total_tokens = len(palabras)

    # Filtering here (rather than in the caller) avoids scoring words that
    # do not qualify.
    candidatas = []
    for termino in vocabulario:
        if (termino not in candidatas
                and frec[termino] > 1
                and termino not in STOP_WORDS
                and len(termino) > 1):
            candidatas.append(termino)

    grados = {}
    for termino in candidatas:
        posiciones = dist[termino]
        ocurrencias = frec[termino]
        desviacion = 0.0
        for escala in range(1, total_tokens + 1):
            # Number of distinct boxes of size `escala` containing the word
            cajas_ocupadas = len({ceil(int(pos) / escala) for pos in posiciones})
            # Reference box count -- presumably the expectation for a
            # shuffled text; confirm against the method's publication
            if ocurrencias <= total_tokens / escala:
                referencia = ocurrencias
            else:
                referencia = ocurrencias / (1 + (ocurrencias - 1) / (total_tokens - 1) * (escala - 1))
            desviacion = desviacion + fabs(log(referencia / cajas_ocupadas))
        grados[termino] = desviacion
    return grados

In [225]:
def distribucion(palabras,vocabulario):
    """Compute the frequency and the occurrence positions of each vocabulary word.

    palabras    -- every token of the text, in order (tokens must be hashable)
    vocabulario -- words to report on

    Returns ``(frecuencias, cajas)``:
      frecuencias -- word -> number of occurrences in ``palabras``
      cajas       -- word -> list of 1-based positions of those occurrences
    Words of ``vocabulario`` missing from ``palabras`` get 0 and ``[]``.
    """
    # Single pass over the text instead of one count() plus one full scan
    # per vocabulary word.
    posiciones = {}
    for indice, token in enumerate(palabras, start=1):
        posiciones.setdefault(token, []).append(indice)

    frecuencias = {}
    cajas = {}
    for p in vocabulario:
        ocurrencias = posiciones.get(p, [])
        frecuencias[p] = len(ocurrencias)
        cajas[p] = list(ocurrencias)  # independent copy per entry
    return frecuencias, cajas

# Lectura de archivo de entrada

In [226]:
# Read the input file used to build the vocabulary
def cargar_datos(filename):
    """Read a text file and return its normalized content.

    The text is lower-cased, ASCII punctuation, digits and a few typographic
    characters are removed, and whitespace runs are collapsed to one space.

    Returns the words joined by single spaces, followed by one trailing
    space (an empty string when the file contains no words).
    """
    # `with` guarantees the file handle is closed, even if read() fails
    with open(filename, "r") as f:
        texto = f.read()
    # Convert to lower case
    texto = texto.lower()
    # Remove punctuation
    texto = texto.translate(str.maketrans('', '', string.punctuation))
    texto = texto.translate(str.maketrans('', '', '¿¡—“”0123456789’'))
    palabras = texto.split()
    # Rebuild the text so only single spaces separate the words; every word
    # is followed by a space, so the result keeps its trailing space
    return ''.join(w + ' ' for w in palabras)

DEFINICIÓN DEL NOMBRE DEL ARCHIVO A PROCESAR

Lectura de documentos

In [227]:
def lee_documento(filename='NULL',texto=''):
  """Tokenize a document and compute its word statistics.

  filename -- path of the file to load with cargar_datos; the sentinel
              'NULL' means "use ``texto`` as given"
  texto    -- text to process when no file is given

  Returns ``(frec, dist, tokens, vocabulario, texto)``: the two dicts
  produced by distribucion, the whitespace-split tokens, the distinct tokens
  in order of first appearance, and the text that was processed.
  """
  if filename != 'NULL':
    texto = cargar_datos(filename)
  tokens = texto.split()
  # dict.fromkeys de-duplicates in linear time and keeps first-seen order
  vocabulario = list(dict.fromkeys(tokens))
  frec, dist = distribucion(tokens, vocabulario)
  return frec, dist, tokens, vocabulario, texto

# Ejecución de algoritmos y generación de archivos de salida

In [228]:
# Read the test document 'data.txt' and keep its frequencies, positions,
# tokens, vocabulary and normalized text for the cells below
frec,dist,tokens,vocabulario,texto = lee_documento('data.txt')

## Grado de Fractalidad

In [229]:
# Run the Degree of Fractality algorithm
def grado_de_fractalidad(tokens,vocabulario,frec,dist,regresa_kw=False,regresa_df=True,top_n=np.inf,escribe_arch=False):
  """Rank words by degree of fractality and by a combined measure.

  tokens, vocabulario, frec, dist -- forwarded unchanged to fractalidad
  regresa_kw   -- False: return only the words; otherwise also return
                  frequency, degree of fractality and combined measure
  regresa_df   -- truthy: return a pandas DataFrame; otherwise a list
  top_n        -- keep only the top_n words by degree of fractality
                  (np.inf keeps all of them)
  escribe_arch -- truthy: write the DataFrame to 'GF.csv'; only a notice is
                  printed when a list is returned

  The kept words are finally ordered by the combined measure
  ``degree_of_fractality * log10(frequency)``, highest first.
  """
  grados = fractalidad(tokens,vocabulario,frec,dist)
  ranking = sorted(grados.items(), key=operator.itemgetter(1), reverse=True)

  # The cut to top_n happens on the fractality ranking, before re-sorting
  if top_n != np.inf:
    ranking = ranking[:top_n]
  filas = [[palabra, frec[palabra], grado, grado*log10(frec[palabra])] for palabra, grado in ranking]
  # Order the results by the combined measure
  filas.sort(key=lambda fila: fila[3], reverse=True)

  if regresa_df:
    if regresa_kw==False:
      by_MC = pd.DataFrame([fila[0] for fila in filas], columns=['word'])
    else:
      by_MC = pd.DataFrame(filas, columns=['word','frecuency','Degree_of_fractality','Combined_measure'])
    if escribe_arch:
      by_MC.to_csv('GF.csv')
  else:
    if regresa_kw==False:
      by_MC = [fila[0] for fila in filas]
    else:
      by_MC = filas
    if escribe_arch:
      print('\nNo se tiene implementada la escritura de archivo cuando regresa_df==False\n')
  return by_MC

def use_gf(texto,regresa_kw=False,regresa_df=False,top_n=np.inf,escribe_arch=False):
  """Run the degree-of-fractality keyword extraction on a raw text.

  texto -- text to analyze; it is split on whitespace
  The remaining arguments are forwarded unchanged to grado_de_fractalidad,
  whose result is returned.
  """
  tokens = texto.split()
  # dict.fromkeys de-duplicates in linear time and keeps first-seen order
  vocabulario = list(dict.fromkeys(tokens))
  frec, dist = distribucion(tokens, vocabulario)
  return grado_de_fractalidad(tokens,vocabulario,frec,dist,regresa_kw,regresa_df,top_n,escribe_arch)

# Top 10 words of the test document, returned as a DataFrame with all columns
df1 = use_gf(texto,regresa_kw=True,regresa_df=True,top_n=10)
df1

Unnamed: 0,word,frecuency,Degree_of_fractality,Combined_measure
0,latin,4,167.127394,100.620717
1,content,3,205.903714,98.241038
2,humour,3,163.880039,78.19065
3,injected,3,163.20027,77.866318
4,dummy,2,191.726724,57.715495
5,type,2,191.726724,57.715495
6,line,2,179.412131,54.008433
7,internet,2,175.612973,52.864773
8,reproduced,2,169.604962,51.056181
9,typesetting,2,166.080279,49.995146


## TextRank

In [230]:
# Run the TextRank algorithm
def use_TextRank(texto,regresa_kw=False,regresa_df=False,top_n=np.inf,escribe_arch=False):
  """Run TextRank keyword extraction on a raw text.

  texto        -- text to analyze
  regresa_kw   -- exactly True: return [word, weight] pairs (columns 'word'
                  and 'Index'); otherwise only the words
  regresa_df   -- truthy: return a pandas DataFrame; otherwise a list
  top_n        -- maximum number of keywords to return (np.inf: no extra cut
                  beyond what get_keywords(100) yields)
  escribe_arch -- truthy: write the DataFrame to 'TextRank.csv'; only a
                  notice is printed when a list is returned

  Keywords are ordered from highest to lowest TextRank weight.
  """
  tr4w = TextRank4Keyword()
  tr4w.analyze(texto, candidate_pos = ['NOUN','PROPN'], window_size=4, lower=False)
  kwTR = tr4w.get_keywords(100)

  # Build the rows once; the output container is chosen afterwards
  if regresa_kw==True:
    salida = [[key, peso] for key, peso in kwTR.items()]
    columnas = ['word', 'Index']
  else:
    salida = list(kwTR.keys())
    columnas = ['word']
  if top_n != np.inf:
    salida = salida[:top_n]

  if regresa_df:
    dftr = pd.DataFrame(salida, columns=columnas)
    if escribe_arch:
      dftr.to_csv('TextRank.csv')
  else:
    dftr = salida
    if escribe_arch:
      # A list has no to_csv(); report it instead of raising AttributeError,
      # as grado_de_fractalidad does
      print('\nNo se tiene implementada la escritura de archivo cuando regresa_df==False\n')
  return dftr

# Top 10 TextRank keywords of the test document, returned as a plain list of words
dftr = use_TextRank(texto,top_n=10)
dftr

['ipsum',
 'lorem',
 'versions',
 'words',
 'text',
 's',
 'content',
 'page',
 'model',
 'form']

# 20 Newsgroups

In [12]:
# Load the training subset of the 20 Newsgroups corpus
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

# Show the names of the categories in the corpus
from pprint import pprint
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [231]:
def clean_text(r):
  """Normalize the documents stored in the 'cadena' column of a DataFrame.

  r -- DataFrame with a 'cadena' column of strings

  Digits, ASCII punctuation and the characters « » are removed, newlines
  become spaces, and words that are in STOP_WORDS or have two characters or
  fewer are dropped.

  Returns a new single-column DataFrame named 'docs' (same index as ``r``)
  with the cleaned words of each document joined by single spaces. Any
  other column of ``r`` is not carried over.
  """
  def check_STW(palabras):
    """Keep the words that are not stop words and are longer than two characters."""
    return [w for w in palabras if w not in STOP_WORDS and len(w) > 2]

  # One translation table removes digits and punctuation in a single pass
  borrar = str.maketrans('', '', string.digits + string.punctuation)
  limpio = r.cadena.str.translate(borrar)
  # regex=False: these are literal characters; '(' and ')' are not valid
  # regular expressions and the default of `regex` differs across pandas versions
  for caracter in ('«', '»', '(', ')'):
    limpio = limpio.str.replace(caracter, '', regex=False)
  limpio = limpio.str.replace('\n', ' ', regex=False).str.strip()

  # Series.map replaces the chained DataFrame.applymap calls (deprecated in pandas)
  docs = limpio.map(lambda doc: ' '.join(check_STW(doc.split())))
  return docs.to_frame(name='docs')

# reg=newsgroups_train.data[0]

# frec,dist,tokens,vocabulario,texto = lee_documento('NULL',doc)
# df_frac = grado_de_fractalidad(tokens,vocabulario,frec,dist).reset_index()
# df_tr = use_TextRank(texto).reset_index()
# df_tr.head()

In [239]:
# Reload the training subset restricted to two categories, asking the loader
# to remove headers, footers and quotes
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=cats)

# One row per document: 'cadena' holds the text, 'target' the dataset's target value
data = pd.DataFrame(newsgroups_train.data, columns=['cadena'])
data['target'] = pd.Series(newsgroups_train.target)
# Only the first rows (DataFrame.head()) are cleaned; clean_text returns just
# the 'docs' column, so 'target' is no longer part of `data` afterwards
data = clean_text(data.head())
# Up to 5 keywords per document from each algorithm, as lists of words
dfTR = data.docs.apply(lambda x : use_TextRank(x,top_n=5)).to_frame()
dfFT = data.docs.apply(lambda x : use_gf(x,top_n=5)).to_frame()
dfFT

  


Unnamed: 0,docs
0,"[nature, rules, god, omnipotence, definition]"
1,"[funding, data, Magellan]"
2,[LEO]
3,"[Gehrels, spectroscopy, ray, gamma, observatory]"
4,"[baby, pregnant]"
