<a href="https://colab.research.google.com/github/jhermosillo/keyword_extraction/blob/main/API_TextRank_GFractal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Módulos necesarios

In [1]:
!pip install deplacy
!pip install spacy



In [2]:
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting es-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.2.0/es_core_news_sm-3.2.0-py3-none-any.whl (14.0 MB)
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('es_core_news_sm')


In [3]:
import pkg_resources,imp
imp.reload(pkg_resources)

<module 'pkg_resources' from 'C:\\Users\\Jorge Hermosillo\\anaconda3\\lib\\site-packages\\pkg_resources\\__init__.py'>

### Librerías necesarias para los algoritmos

In [4]:
from math import *
from math import sqrt
import string
import operator
import random
import pandas as pd
#librerias necesarias para text rank
from collections import OrderedDict
import numpy as np
import spacy

#Listado de STOPWORDS dependiendo del lenguaje
from spacy.lang.en.stop_words import STOP_WORDS
#from spacy.lang.es.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load('es_core_news_sm')

In [5]:
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
doc_l= ' '.join([token.lemma_ for token in doc])
print(doc_l)

apple and orange be similar . boot and hippo be not .


# Algoritmo TextRank

In [6]:
class TextRank4Keyword():
    """Extract keywords from text"""

    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 100 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        keysw={}
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            keysw[key] =value
            if i > number:
                break
        return keysw
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight
        

# Algoritmo Grado de Fractalidad

In [7]:
#solamente se calcula el grado de fractalidad de las palabras que tengan mas de uno de frecuencia
def fractalidad(palabras,vocabulario,frec,dist):
    N=len(palabras)                                     #El número de tokens de todo el texto
    gf={}
    cajas_index=set()
    voc=[]                                             #la variable voc contendra cada sintagma con frecuencia mayor que 1, por que las otras palabras tendrán 0 de grado de fractaldiad
    for p in vocabulario:                              #Esto se puede hacer fuera del algoritmo, pero se incluye para evitar ese calculo innecesario 
        if(p not in voc):
            if(frec[p]>1):
                if(p not in STOP_WORDS):
                    if(len(p)>1):
                        voc.append(p)
    # print("Text size: ",N)
    # print("Vocabulary: ",len(voc))
    for p in voc:                                  
        rcajas=dist[p]
        M=frec[p]                                  
        dfw=0.0
        nsh=0.0
        for s in range(1,N+1):  
            noc=0                                       
            for e in rcajas:                       
                cajas_index.add(ceil(int(e)/s))    
            noc=len(cajas_index)                    
            cajas_index.clear()    
            ns=N/s
            if(M<=ns):
                nsh=M
            else:
                nsh=M/(1+(M-1)/(N-1)*(s-1)) 
            dfw=dfw+fabs(log(nsh/noc))
        gf[p]=dfw
    return gf    #regresamos un diccionario

In [8]:
def distribucion(palabras,vocabulario):
    N=len(palabras)
    ncajas=[]
    cajas={}
    frecuencias={}
    for p in vocabulario:
        ncajas.clear()
        i=0
        M=palabras.count(p)
        while(i<N):
            if(p == palabras[i]):
                ncajas.append(i+1)
            i=i+1
        frecuencias[p]=M
        cajas[p]=ncajas[:]
    return frecuencias,cajas

# Lectura de archivo de entrada

In [9]:
#Lectura de archivo para generación de vocabulario
def cargar_datos(filename):
    f=open(filename, "r") #tenemos que crear un directorio llamado InputData
    texto=f.read()
    #Pasar a minusculas
    texto=texto.lower()
    #Eliminar puntuación
    texto=texto.translate(str.maketrans('', '', string.punctuation))
    texto=texto.translate(str.maketrans('', '', '¿¡—“”0123456789’'))
    palabras=texto.split()
    textop=""
    #rearmamos el texto debido a ue existen carácteres especiales
    for w in palabras:
        textop=textop+w+' '
    return textop

DEFINICIÓN DEL NOMBRE DEL ARCHIVO A PROCESAR

Lectura de documentos

In [10]:
def lee_documento(filename='NULL',texto=''):
    if filename != 'NULL':
        texto=cargar_datos(filename)
    #obtenemos el vocabulario
    tokens=texto.split()
    vocabulario=[]
    for t in tokens:
        if(t not in vocabulario):
            vocabulario.append(t)
    #variables de procesamiento
    dist={}
    frec={}
    frec,dist=distribucion(tokens,vocabulario)
    return frec,dist,tokens,vocabulario,texto

# Ejecución de algoritmos y generación de archivos de salida

In [11]:
# lectura de documento de prueba
frec,dist,tokens,vocabulario,texto = lee_documento('data.txt')

## Grado de Fractalidad

In [12]:
#ejecución de algoritmo Grado de Fractalidad
def grado_de_fractalidad(tokens,vocabulario,frec,dist,regresa_kw=False,regresa_df=True,top_n=np.inf,escribe_arch=False):
    frac_x=fractalidad(tokens,vocabulario,frec,dist) 
    sorted_x = sorted(frac_x.items(), key=operator.itemgetter(1), reverse=True)
    # print('Time GF: '+str(elapsed_time))

    #Imprimir y guardar resultados de GF
    if regresa_df:
        if top_n != np.inf:
            df=[[t[0],frec[t[0]], t[1], t[1]*log10(frec[t[0]])] for t in sorted_x[:top_n]]
        else:
            df=[[t[0],frec[t[0]], t[1], t[1]*log10(frec[t[0]])] for t in sorted_x]
        #Ordenar resultados por medida combinada
        df.sort(key=lambda x: x[3],reverse=True)
        if regresa_kw==False:
            df = [dato[0] for dato in df]
            by_MC=pd.DataFrame(df, columns=['word'])
        else:   
            by_MC=pd.DataFrame(df, columns=['word','frecuency','Degree_of_fractality','Combined_measure'])
        if escribe_arch:
            by_MC.to_csv('GF.csv')
    else:
        if top_n != np.inf:
            df=[[t[0],frec[t[0]], t[1], t[1]*log10(frec[t[0]])] for t in sorted_x[:top_n]]
        else:
            df=[[t[0],frec[t[0]], t[1], t[1]*log10(frec[t[0]])] for t in sorted_x]
        #Ordenar resultados por medida combinada
        df.sort(key=lambda x: x[3],reverse=True)
        if regresa_kw==False:
            by_MC = [dato[0] for dato in df]
        else:
            by_MC = df
        if escribe_arch:
            print('\nNo se tiene implementada la escritura de archivo cuando regresa_df==False\n')
    return by_MC

def use_gf(texto,regresa_kw=False,regresa_df=False,top_n=np.inf,escribe_arch=False):
    tokens=texto.split()
    vocabulario=[]
    for t in tokens:
        if(t not in vocabulario):
            vocabulario.append(t)
    #variables de procesamiento
    dist={}
    frec={}
    frec,dist=distribucion(tokens,vocabulario)
    df = grado_de_fractalidad(tokens,vocabulario,frec,dist,regresa_kw,regresa_df,top_n,escribe_arch)
    return df

df1 = use_gf(texto,regresa_kw=True,regresa_df=True,top_n=10)
df1

Unnamed: 0,word,frecuency,Degree_of_fractality,Combined_measure
0,latin,4,167.127394,100.620717
1,content,3,205.903714,98.241038
2,humour,3,163.880039,78.19065
3,injected,3,163.20027,77.866318
4,dummy,2,191.726724,57.715495
5,type,2,191.726724,57.715495
6,line,2,179.412131,54.008433
7,internet,2,175.612973,52.864773
8,reproduced,2,169.604962,51.056181
9,typesetting,2,166.080279,49.995146


## TextRank

In [13]:
#ejecución de algoritmo de TextRank
# start_time = time()
def use_TextRank(texto,regresa_kw=False,regresa_df=False,top_n=np.inf,escribe_arch=False):
    tr4w = TextRank4Keyword()
    tr4w.analyze(texto, candidate_pos = ['NOUN','PROPN'], window_size=4, lower=False)
    kwTR=tr4w.get_keywords(100)

    #Guardar resultados de TextRank
    if regresa_df:
        if top_n!=np.inf:
            if regresa_kw==True:
                salida = [[key, kwTR[key]] for key in kwTR.keys()][:top_n]
                dftr=pd.DataFrame(salida, columns=['word', 'Index'])
            else:
                salida = list(kwTR.keys())[:top_n]
                dftr=pd.DataFrame(salida, columns=['word'])
        else:
            if regresa_kw==True:
                salida = [[key, kwTR[key]] for key in kwTR.keys()]
                dftr=pd.DataFrame(salida, columns=['word', 'Index'])
            else:
                salida = list(kwTR.keys())
                dftr=pd.DataFrame(salida, columns=['word'])
    else:
        if top_n!=np.inf:
            if regresa_kw==True:
                dftr = [[key, kwTR[key]] for key in kwTR.keys()][:top_n]
            else:
                dftr = list(kwTR.keys())[:top_n]
        else:
            if regresa_kw==True:
                dftr = [[key, kwTR[key]] for key in kwTR.keys()]
            else:
                dftr = list(kwTR.keys())
        # elapsed_time = time() - start_time
        # print('Time TextRank: '+str(elapsed_time))
        if escribe_arch:
            dftr.to_csv('TextRank.csv')
    return dftr

dftr = use_TextRank(texto,top_n=10)
dftr

['ipsum',
 'lorem',
 'text',
 's',
 'versions',
 'words',
 'book',
 'bc',
 'latin',
 'humour']

# 20 Newsgroups

In [14]:
def preprocess_text(r,remove_STW=True,lemmatize=False):
    def check_STW(palabras):
        texto = [w for w in palabras \
                 if not w in STOP_WORDS and len(w)>2]
        return texto
    r1=r.cadena.str.translate(\
          str.maketrans('','',string.digits))\
          .str.translate(\
          str.maketrans('','',string.punctuation))\
          .str.replace('«','', regex=True)\
          .str.replace('»','', regex=True).str.replace('(','', regex=True)\
          .str.replace('\n',' ', regex=True)\
          .str.replace(')','', regex=True).str.strip().str.lower()
    if remove_STW:
        r1=r1.to_frame().applymap(lambda x : x.split()).applymap(check_STW).applymap(lambda x: ' '.join(x))
    if lemmatize:
        if isinstance(r1, pd.DataFrame):
            r1=r1.applymap(lambda x : nlp(x)).applymap(lambda x: ' '.join([token.lemma_ for token in x]))        
        else:
            r1=r1.to_frame().applymap(lambda x : nlp(x)).applymap(lambda x: ' '.join([token.lemma_ for token in x]))        
    r.cadena = r1
    r=r.rename(columns={'cadena':'docs'})
    return r

In [15]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'))

from pprint import pprint
# pprint(list(newsgroups_train.target_names))

cats=list(newsgroups_train.target_names)
n = len(cats)//2
cats1=cats[:n]
cats2=cats[n:]
# print(cats)
print(cats1)
print(cats2)
categories = [cats1,cats2]

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball']
['rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [16]:
def process_20NG(categories,n=20,th=5,remove_STW=True,lemmatize=False):
    data=[]
    for i,cats in enumerate(categories):
        newsgroups_train = fetch_20newsgroups(subset='train', 
                                        remove=('headers', 'footers', 'quotes'),
                                        categories=cats)
        df = pd.DataFrame(newsgroups_train.data, columns=['cadena'])
        df['target'] = pd.Series(newsgroups_train.target+len(cats)*i)
        df = preprocess_text(df,remove_STW=remove_STW,lemmatize=lemmatize)
        df['TR'] = df.docs.apply(lambda x : use_TextRank(x,top_n=n))
        df['GF'] = df.docs.apply(lambda x : use_gf(x,top_n=n))
        df=df.drop(df[df.TR.apply(lambda x: len(x) < th)].index).reset_index(drop=True)
        df=df.drop(df[df.GF.apply(lambda x: len(x) < th)].index).reset_index(drop=True)
        data.append(df)
    lista=range(len(categories))
    newsgroups_data = data[0]
    if len(lista)>1:
        for j in lista[1:]:
            newsgroups_data = newsgroups_data.append(data[j],ignore_index=True)
    newsgroups_data = newsgroups_data.sample(frac=1).reset_index(drop=True)
    # newsgroups_data.to_csv('newsgroups_train_data.csv')
    return newsgroups_data

In [17]:
newsgroups_train_data = process_20NG(categories,remove_STW=True,lemmatize=True)
newsgroups_train_data

Unnamed: 0,docs,target,TR,GF
0,koc respond article aprurartusdpaorg dbdurartu...,17,"[trebizon, time, soldier, fact, massacre, pers...","[persian, time, fact, trebizon, armenians, sol..."
1,clone count poor try clone go regular modifica...,2,"[adobe, level, book, implementation, file, ven...","[poor, clone, red, book, adobe, use, file, fol..."
2,nhl playoff result conference semifinal good s...,10,"[lead, det, period, van, tor, wing, canuck, ad...","[shot, save, det, van, period, canuck, red, wi..."
3,jsfrom staffordvaxwinonamsusedu john stafford ...,8,"[ride, stafford, dod, live, lucas, slmr, discl...","[bad, ride, dod, stafford, live, like, temporary]"
4,fact matter poverty imperfectly relate social ...,18,"[revolution, welfare, poverty, constitution, s...","[welfare, general, revolution, include, social..."
...,...,...,...,...
4929,original poster misquote reference tim author ...,15,"[god, jesus, bible, spirit, author, revelation...","[god, spirit, saul, jesus, bible, claim, revel..."
4930,need help zxa supertrapp slipon carb rejette m...,8,"[carb, tune, match, carbs, rejette, gear, bike...","[carb, tune, flow, cbr, fix, match, lean, new,..."
4931,source code macintosh pgp available anonymous ...,11,"[ftp, source, site, documentation, cipher, sig...","[public, source, key, executable, site, cipher..."
4932,yeah right let guy write piece title imply cas...,0,"[propaganda, title, piece, respond, discuss, e...","[respond, piece, propaganda, real, title, want]"


In [18]:
print(np.unique(newsgroups_train_data.target))
print(newsgroups_train_data.GF.values[:5])
print(newsgroups_train_data.TR.values[:5])
df = newsgroups_train_data.copy()
df

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[list(['persian', 'time', 'fact', 'trebizon', 'armenians', 'soldier', 'koc', 'armenian', 'leave', 'shirak', 'van', 'number', 'alive', 'army', 'massacre', 'story', 'destroy', 'slaughter', 'strike'])
 list(['poor', 'clone', 'red', 'book', 'adobe', 'use', 'file', 'follow', 'level', 'vendor', 'implementation', 'big', 'test', 'source', 'bug', 'pscriptdrv', 'design', 'printer', 'number', 'issue'])
 list(['shot', 'save', 'det', 'van', 'period', 'canuck', 'red', 'win', 'maple', 'courtnall', 'bure', 'coffey', 'linden', 'drake', 'att', 'powerplay', 'goal', 'winnipeg', 'vancouver', 'jet'])
 list(['bad', 'ride', 'dod', 'stafford', 'live', 'like', 'temporary'])
 list(['welfare', 'general', 'revolution', 'include', 'social', 'poverty', 'relate', 'typical', 'guess', 'section', 'constitution', 'article', 'inequality', 'instability', 'fact'])]
[list(['trebizon', 'time', 'soldier', 'fact', 'massacre', 'persian', 'story', 'armenians', 'slaught

Unnamed: 0,docs,target,TR,GF
0,koc respond article aprurartusdpaorg dbdurartu...,17,"[trebizon, time, soldier, fact, massacre, pers...","[persian, time, fact, trebizon, armenians, sol..."
1,clone count poor try clone go regular modifica...,2,"[adobe, level, book, implementation, file, ven...","[poor, clone, red, book, adobe, use, file, fol..."
2,nhl playoff result conference semifinal good s...,10,"[lead, det, period, van, tor, wing, canuck, ad...","[shot, save, det, van, period, canuck, red, wi..."
3,jsfrom staffordvaxwinonamsusedu john stafford ...,8,"[ride, stafford, dod, live, lucas, slmr, discl...","[bad, ride, dod, stafford, live, like, temporary]"
4,fact matter poverty imperfectly relate social ...,18,"[revolution, welfare, poverty, constitution, s...","[welfare, general, revolution, include, social..."
...,...,...,...,...
4929,original poster misquote reference tim author ...,15,"[god, jesus, bible, spirit, author, revelation...","[god, spirit, saul, jesus, bible, claim, revel..."
4930,need help zxa supertrapp slipon carb rejette m...,8,"[carb, tune, match, carbs, rejette, gear, bike...","[carb, tune, flow, cbr, fix, match, lean, new,..."
4931,source code macintosh pgp available anonymous ...,11,"[ftp, source, site, documentation, cipher, sig...","[public, source, key, executable, site, cipher..."
4932,yeah right let guy write piece title imply cas...,0,"[propaganda, title, piece, respond, discuss, e...","[respond, piece, propaganda, real, title, want]"


In [19]:
newsgroups_train_data.to_csv('newsgroups_train_data_sinSW_lemma.csv')

# Word2vec

In [20]:
# import modules & set up logging
from gensim.models import Word2Vec as w2v
import gensim.downloader as api

def vocdf(df,vec_size=100):
    # obtain vocabulary word types 
    types=df['docs'].str.split(' ', expand=True).stack().unique()
    # Data Frame of vocabulary and word embeddings
    typesDF=pd.Series(types).to_frame()
    typesDF.rename(index=int,columns={0:'Palabra'},inplace=True)

    #Add Emebddings placeholders
    #Se necesita convertir el DF a diccionario
    #luego se agregan vectores de dimension N,
    #como registros nuevos del diccionario
    #para reconvertirlo en un DF de vuelta
    dico=typesDF.to_dict('records',into=OrderedDict)
    #Add real-valued embedding vectors
    for reg in dico:
        reg['W2V']=np.zeros(vec_size)
    typesDF=pd.DataFrame.from_dict(dico)
    typesDF.set_index('Palabra',inplace=True)
    return typesDF

def compute_embeddings(df,typesdf,vec_dim=100):
    #Nos quedamos con la columna que nos importa, la que contiene las cadenas
    docs = df["docs"].values.tolist()
    docs = [s.split() for s in docs]
    model = w2v(docs, min_count=1, vector_size=vec_dim)
    words = typesdf.index.values.tolist()
    for w in words:
        typesdf.at[w,'W2V'] = model.wv[w]
    return

typesDF = vocdf(df)
print(typesDF.iloc[0].W2V.shape)

compute_embeddings(df,typesDF)
print(typesDF.loc['sound'])
typesDF.head()
# model.wv['sound']
# model.wv.most_similar('good', topn=10) 

(100,)
W2V    [0.63286227, 1.7717185, -0.47656956, 0.7558266...
Name: sound, dtype: object


Unnamed: 0_level_0,W2V
Palabra,Unnamed: 1_level_1
koc,"[0.007138354, 0.002010935, -0.03214333, 0.0123..."
respond,"[-0.016314097, 0.905852, -0.80287933, 0.102970..."
article,"[-0.21221071, 0.28731912, -1.1636721, -0.44954..."
aprurartusdpaorg,"[0.0031581398, 0.0013548204, -0.018494118, -0...."
dbdurartu,"[-0.008125715, 0.005395709, -0.009346178, 0.00..."


# Construye los arreglos de features

In [21]:
df['VTR'] = df.TR.apply(lambda x: np.array([vec for word in x for vec in typesDF.loc[word]]))
df.head()

Unnamed: 0,docs,target,TR,GF,VTR
0,koc respond article aprurartusdpaorg dbdurartu...,17,"[trebizon, time, soldier, fact, massacre, pers...","[persian, time, fact, trebizon, armenians, sol...","[[0.0017461785, 0.051693786, -0.016429123, 0.0..."
1,clone count poor try clone go regular modifica...,2,"[adobe, level, book, implementation, file, ven...","[poor, clone, red, book, adobe, use, file, fol...","[[0.16018964, 0.12162198, -0.10424078, -0.0826..."
2,nhl playoff result conference semifinal good s...,10,"[lead, det, period, van, tor, wing, canuck, ad...","[shot, save, det, van, period, canuck, red, wi...","[[0.587381, 1.2823002, -0.26724386, 0.3588745,..."
3,jsfrom staffordvaxwinonamsusedu john stafford ...,8,"[ride, stafford, dod, live, lucas, slmr, discl...","[bad, ride, dod, stafford, live, like, temporary]","[[0.66758555, 1.4934801, -0.28840137, 0.179831..."
4,fact matter poverty imperfectly relate social ...,18,"[revolution, welfare, poverty, constitution, s...","[welfare, general, revolution, include, social...","[[0.26836053, 0.33547762, -0.1683432, -0.01250..."


In [22]:
df['VGF'] = df.GF.apply(lambda x: np.array([vec for word in x for vec in typesDF.loc[word]]))
df.head()

Unnamed: 0,docs,target,TR,GF,VTR,VGF
0,koc respond article aprurartusdpaorg dbdurartu...,17,"[trebizon, time, soldier, fact, massacre, pers...","[persian, time, fact, trebizon, armenians, sol...","[[0.0017461785, 0.051693786, -0.016429123, 0.0...","[[0.20330037, 0.26017433, -0.11627525, -0.0083..."
1,clone count poor try clone go regular modifica...,2,"[adobe, level, book, implementation, file, ven...","[poor, clone, red, book, adobe, use, file, fol...","[[0.16018964, 0.12162198, -0.10424078, -0.0826...","[[0.5351642, 1.1958764, -0.3077469, 0.2469886,..."
2,nhl playoff result conference semifinal good s...,10,"[lead, det, period, van, tor, wing, canuck, ad...","[shot, save, det, van, period, canuck, red, wi...","[[0.587381, 1.2823002, -0.26724386, 0.3588745,...","[[0.7719148, 1.3434646, -0.37406495, 0.0462163..."
3,jsfrom staffordvaxwinonamsusedu john stafford ...,8,"[ride, stafford, dod, live, lucas, slmr, discl...","[bad, ride, dod, stafford, live, like, temporary]","[[0.66758555, 1.4934801, -0.28840137, 0.179831...","[[0.6132214, 2.5334766, -0.33142954, 0.8282831..."
4,fact matter poverty imperfectly relate social ...,18,"[revolution, welfare, poverty, constitution, s...","[welfare, general, revolution, include, social...","[[0.26836053, 0.33547762, -0.1683432, -0.01250...","[[0.18528216, 0.31097674, -0.17614472, 0.04937..."


## Padding de arreglos

In [156]:
def padding(X,n,vec_dim):
    pad_vec = np.zeros(vec_dim)
    for i,x in enumerate(X):
        if x[0].shape[0] != n:
            fn = n - x[0].shape[0]
            padx = x[0].tolist()
            for j in range(fn):
                padx.append(pad_vec)
            X[i][0]=np.array(padx)
        else:
            X[i][0]=x[0]
    return

def repadding(X,n,vec_dim):
    pad_vec = np.zeros(vec_dim)
    newX = []
    for x in X:
        if x.shape[0] != n:
            fn = n - x.shape[0]
            padx = x.tolist()
            for j in range(fn):
                padx.append(pad_vec)
            newX.append(padx)
        else:
            newX.append(x)
    return np.array(newX)

In [157]:
X_VTR = df.VTR.values
X_VTR = X_VTR.reshape(X_VTR.shape[0],1)
y_VTR = df.target.values
y_VTR = y_VTR.reshape(y_VTR.shape[0],1)
VTR_train = np.hstack((X_VTR,y_VTR))
padding(VTR_train,20,100)
VTR_train = pd.DataFrame(VTR_train,columns=['vec','target'])
VTR_train

Unnamed: 0,vec,target
0,"[[0.0017461785, 0.051693786, -0.016429123, 0.0...",17
1,"[[0.16018964, 0.12162198, -0.10424078, -0.0826...",2
2,"[[0.587381, 1.2823002, -0.26724386, 0.3588745,...",10
3,"[[0.66758555, 1.4934801, -0.28840137, 0.179831...",8
4,"[[0.26836053, 0.33547762, -0.1683432, -0.01250...",18
...,...,...
4929,"[[0.37792504, 2.9643931, -0.88236153, 1.737902...",15
4930,"[[0.20676619, 0.26823014, -0.16241468, 0.01207...",8
4931,"[[0.77825636, 0.29552558, -1.9968874, -0.85939...",11
4932,"[[0.32636356, 0.515005, -0.31521046, 0.0769180...",0


In [158]:
X_VGF = df.VTR.values
X_VGF = X_VGF.reshape(X_VGF.shape[0],1)
y_VGF = df.target.values
y_VGF = y_VGF.reshape(y_VGF.shape[0],1)
VGF_train = np.hstack((X_VGF,y_VGF))
padding(VGF_train,20,100)
VGF_train = pd.DataFrame(VGF_train,columns=['vec','target'])
VGF_train

Unnamed: 0,vec,target
0,"[[0.0017461785, 0.051693786, -0.016429123, 0.0...",17
1,"[[0.16018964, 0.12162198, -0.10424078, -0.0826...",2
2,"[[0.587381, 1.2823002, -0.26724386, 0.3588745,...",10
3,"[[0.66758555, 1.4934801, -0.28840137, 0.179831...",8
4,"[[0.26836053, 0.33547762, -0.1683432, -0.01250...",18
...,...,...
4929,"[[0.37792504, 2.9643931, -0.88236153, 1.737902...",15
4930,"[[0.20676619, 0.26823014, -0.16241468, 0.01207...",8
4931,"[[0.77825636, 0.29552558, -1.9968874, -0.85939...",11
4932,"[[0.32636356, 0.515005, -0.31521046, 0.0769180...",0


# Setup de datos para clasificación

In [159]:
VTR_val = VTR_train.sample(frac=0.1)
VTR_train = VTR_train.drop(labels=VTR_val.index)

In [160]:
X_VTR_train = VTR_train.vec.values
X_VTR_train = repadding(X_VTR_train,20,100)
print(X_VTR_train.shape)
y_VTR_train = VTR_train.target.values
print(y_VTR_train.shape)

X_VTR_val = VTR_val.vec.values
X_VTR_val = repadding(X_VTR_val,20,100)
print(X_VTR_val.shape)
y_VTR_val = VTR_val.target.values
print(y_VTR_val.shape)

(4441, 20, 100)
(4441,)
(493, 20, 100)
(493,)


In [161]:
VGF_val = VGF_train.sample(frac=0.1)
VGF_train = VGF_train.drop(labels=VGF_val.index)

In [162]:
X_VGF_train = VGF_train.vec.values
X_VGF_train = unpack(X_VGF_train,20,100)
print(X_VGF_train.shape)
y_VGF_train = VGF_train.target.values
print(y_VGF_train.shape)

X_VGF_val = VGF_val.vec.values
X_VGF_val = unpack(X_VGF_val,20,100)
print(X_VGF_val.shape)
y_VGF_val = VGF_val.target.values
print(y_VGF_val.shape)

(4441, 20, 100)
(4441,)
(493, 20, 100)
(493,)


In [163]:
np.save('X_VTR_train.npy', X_VTR_train)
np.save('y_VTR_train.npy', y_VTR_train)
np.save('X_VTR_val.npy', X_VTR_val)
np.save('y_VTR_val.npy', y_VTR_val)

np.save('X_VGF_train.npy', X_VGF_train)
np.save('y_VGF_train.npy', y_VGF_train)
np.save('X_VGF_val.npy', X_VGF_val)
np.save('y_VGF_val.npy', y_VGF_val)

In [166]:
X_VTR_train = np.load('X_VTR_train.npy', allow_pickle=True)
y_VTR_train = np.load('y_VTR_train.npy', allow_pickle=True)
X_VTR_val = np.load('X_VTR_val.npy', allow_pickle=True)
y_VTR_val = np.load('y_VTR_val.npy', allow_pickle=True)

X_VGF_train = np.load('X_VGF_train.npy', allow_pickle=True)
y_VGF_train = np.load('y_VGF_train.npy', allow_pickle=True)
X_VGF_val = np.load('X_VGF_val.npy', allow_pickle=True)
y_VGF_val = np.load('y_VGF_val.npy', allow_pickle=True)

In [168]:
print(X_VTR_train.shape, y_VTR_train.shape)
print(X_VGF_train.shape, y_VGF_train.shape)

print(X_VTR_val.shape, y_VTR_val.shape)
print(X_VGF_val.shape, y_VGF_val.shape)

(4441, 20, 100) (4441,)
(4441, 20, 100) (4441,)
(493, 20, 100) (493,)
(493, 20, 100) (493,)


In [30]:

X_VTR_train = X_VTR_train.reshape(X_VTR_train.shape[0], X_VTR_train[0].shape[0], X_VTR_train[0].shape[1], 1)
X_VTR_train = X_VTR_train.astype('float32')

X_VGF_train = X_VGF_train.reshape(X_VGF_train.shape[0], X_VGF_train[0].shape[0], X_VGF_train[0].shape[1], 1)
X_VGF_train = X_VGF_train.astype('float32')

X_VTR_val = X_VTR_val.reshape(X_VTR_val.shape[0], X_VTR_val[0].shape[0], X_VTR_val[0].shape[1], 1)
X_VTR_val = X_VTR_val.astype('float32')

X_VGF_val = X_VGF_val.reshape(X_VGF_val.shape[0], X_VGF_val[0].shape[0], X_VGF_val[0].shape[1], 1)
X_VGF_val = X_VGF_val.astype('float32')

print(X_VTR_train.shape)
print(y_VTR_train.shape)

print(X_VGF_train.shape)
print(y_VGF_train.shape)

ValueError: cannot reshape array of size 4441 into shape (4441,20,100,1)

# Clasificación usando CNN

In [23]:
# "images" sizes
n1 = X_train.shape[1]
n2 = X_train.shape[2]

print("Las imágenes son de ",n1,"x",n2)

X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], X_val.shape[2], 1)
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
X_train = X_train.astype('float32')
# X_test = X_test.astype('float32')

# normalizing the data to help with the training
# X_train /= 255
# X_test /= 255

# one-hot encoding using keras' numpy-related utilities
n_classes = 20
print("Shape before one-hot encoding: ", y_train.shape)
Y_train = np_utils.to_categorical(y_train, n_classes)
Y_val = np_utils.to_categorical(y_val, n_classes)
# Y_test = np_utils.to_categorical(y_test, n_classes)
print("Shape after one-hot encoding: ", Y_train.shape)


model = Sequential()
# convolutional layer
model.add(Conv2D(25, kernel_size=(3,3), strides=(1,1), padding='valid', activation='relu', input_shape=(n1,n2,1)))
model.add(MaxPool2D(pool_size=(1,1)))
# flatten output of conv
model.add(Flatten())
# hidden layer
model.add(Dense(100, activation='relu'))
# output layer
model.add(Dense(20, activation='softmax'))

# compiling the sequential model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# training the model 

early_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=3)
history = model.fit(X_train, Y_train, batch_size=32, epochs=100, validation_data=(X_val, Y_val)
            ,callbacks=[early_callback])

# score = model.evaluate(X_test, Y_test)

