In [65]:
from keras.preprocessing.text import Tokenizer
import unidecode

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
stop_words=set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')

import re, string
url = re.compile(r'https?://[\w./\-?=&+]+')
mentions = re.compile(r'((?<=\W)|^)(@\w+)(\s*@\w+)*')
email = re.compile(r'[\w.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+')
punctuation = re.compile('[%s]' % re.escape(string.punctuation + '…'))
length = re.compile(r"(.)\1{2,}")


def construir_corpus(tweets, limit=10000):
    corpus=[]
    i=0
    for t in tweets:
        if i>limit:
            break
        for w in clean_sentence(t):
            stemmed = stemmer.stem(w)
            if ok_word(w) and stemmed not in corpus:
                corpus.append(stemmed)
                i+=1
    return corpus

def ok_word(we):
    # Has at least 1 letter (not alone numbers nor punctuanction symbols)
    if re.match(r"(?=[a-z])", we) and we not in stop_words:
        return True
    return False

def clean_sentence(sentence):
    clean_sentence = []
    for word in sentence.lower().split():
        unaccented_word = unidecode.unidecode(word)  # sacar tildes
        word = url.sub('url', word)                  # cambiar link por URL
        word = mentions.sub('user', word)            # cambiar mencion por USER
        word = email.sub('mail', word)               # cambiar mail por MAIL
        word = punctuation.sub('', word)             # sacar signos de puntuacion (., !, :)
        word = length.sub(r"\1\1", word)             # sacar letras repetidas 2 o mas veces
        text = text.strip()

        clean_sentence.append(word)
    return clean_sentence

In [66]:
import pandas as pd
df=pd.read_csv(r'tweets_worldwide_hamburguesa.csv')
df=df[df.langs == 'es']
df.shape[0]
sentences = df['texts']


In [69]:
corpus = construir_corpus(sentences)
len(corpus)

6227

In [72]:
t = Tokenizer()
t.fit_on_texts(corpus)
encoded_corpus=t.texts_to_matrix(corpus, mode='count')
print(len(encoded_corpus))
print(encoded_corpus[0])

6227
[0. 1. 0. ... 0. 0. 0.]
