In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

In [2]:
import time 
t0 = time.time()

In [3]:
import pandas as pd

In [4]:
spark = sparknlp.start(gpu=True)

In [5]:
spark

# Part of Speech

In [6]:
df = pd.read_csv('datasets/CorpusDemo.csv')
df = df[['CONTENT']].drop_duplicates().reset_index(drop=True)

In [7]:
import re
def clean_text(text):
    text = re.sub('[#@][^\t\n\r\f\v\s]*', " ", text)
    text = re.sub('[^\w\d\:\/\.\-\_\,\(\)]', " ", text)
    text = re.sub('(http|www)[^\t\n\r\f\v\s]*', " ", text)
    text = re.sub('\s+', ' ', text)
    return text

In [8]:
text_df = spark.createDataFrame(df)
text_df.show(10)

+--------------------+
|             CONTENT|
+--------------------+
|Los proyectos #Fr...|
|Un peque침o pero c...|
|Colombia ha debid...|
|Es una Mierda #Fr...|
|"El planeta sufre...|
|游댰游댲Descargue aqu...|
|Pilotos de #frack...|
|#IMPORTANTE  Los ...|
|Los que creen que...|
|Interesante art칤c...|
+--------------------+
only showing top 10 rows



In [9]:
from pyspark.sql import functions as F

udf_clean_text = F.udf(lambda x: clean_text(x))

In [10]:
text_df = text_df.select(udf_clean_text(F.col('CONTENT')).alias('CONTENT'))

### Custom lemmatizer

In [11]:
from es_lemmatizer import lemmatize
import spacy

nlp = spacy.load("es_core_news_sm")
nlp.add_pipe(lemmatize, after="tagger")


try:
    doc = nlp(''.join(text_df.select('CONTENT').rdd.flatMap(lambda x: x).collect()))
except:
    doc = nlp(text)

custom_lemm = {}
for token in doc:
    if token.lemma_ not in custom_lemm:
        custom_lemm[token.lemma_] = [str(token)]
    else:
        if str(token) not in custom_lemm[token.lemma_]:
            custom_lemm[token.lemma_].append(str(token))
        
        
keys = list(custom_lemm.keys())
vals = ['\t'.join(entry) for entry in list(custom_lemm.values())]

In [12]:
keys[keys.index('q')] = 'que'
keys[keys.index('d')] = 'de'

In [13]:
custom_lemm_list = [f'{key}->{val}\n' for key, val in zip(keys, vals)]

In [14]:
with open('customLemmanoSpellCheck.txt', 'w') as file:
    file.writelines(custom_lemm_list)

### Pipeline 1-grams

In [15]:
documentAssembler = DocumentAssembler()\
    .setInputCol("CONTENT")\
    .setOutputCol('document')

In [16]:
tokenizer = Tokenizer()\
    .setInputCols('document')\
    .setOutputCol('token')

In [17]:
lemmatizer = Lemmatizer()\
    .setInputCols('token')\
    .setOutputCol("lemma")\
    .setDictionary('customLemmanoSpellCheck.txt', '->', '\t')

In [18]:
# import nltk
# nltk.download('stopwords')

from nltk.corpus import stopwords

es_stopwords = stopwords.words('spanish')
es_stopwords.extend(['de+el', 'a+el'])

In [19]:
stopwordsCleaner = StopWordsCleaner()\
    .setInputCols('lemma')\
    .setOutputCol('1-gram')\
    .setStopWords(es_stopwords)

In [20]:
nGrammer = NGramGenerator()\
    .setInputCols('1-gram')\
    .setOutputCol('n-grams')\
    .setN(3)\
    .setEnableCumulative(True)\
    .setDelimiter('_')

In [21]:
posTagger = PerceptronModel.pretrained("pos_ud_gsd", "es")\
    .setInputCols(['document', '1-gram'])\
    .setOutputCol('posTagger')

pos_ud_gsd download started this may take some time.
Approximate size to download 5.2 MB
[OK!]


In [22]:
finisher = Finisher()\
    .setInputCols(['1-gram', 'n-grams', 'posTagger'])

In [23]:
from pyspark.ml import Pipeline
pipeline = Pipeline() \
    .setStages([documentAssembler,
#                 sentenceDetector,
                tokenizer,
                lemmatizer,
                stopwordsCleaner,
                nGrammer,
                posTagger,
                finisher])

In [24]:
processed_texts = pipeline.fit(text_df).transform(text_df)

In [25]:
processed_texts.show()

+--------------------+--------------------+--------------------+--------------------+
|             CONTENT|     finished_1-gram|    finished_n-grams|  finished_posTagger|
+--------------------+--------------------+--------------------+--------------------+
|Los proyectos se ...|[proyecto, ubicar...|[proyecto, ubicar...|[NOUN, VERB, NOUN...|
|Un peque침o pero c...|[peque침o, complet...|[peque침o, complet...|[ADJ, ADJ, NOUN, ...|
|Colombia ha debid...|[colombia, haber,...|[colombia, haber,...|[PROPN, AUX, VERB...|
|      Es una Mierda |            [mierda]|            [mierda]|             [PROPN]|
| El planeta sufre...|[planeta, sufrir,...|[planeta, sufrir,...|[NOUN, VERB, NOUN...|
| Descargue aqu칤 e...|[descargue, aqu칤,...|[descargue, aqu칤,...|[VERB, ADV, NOUN,...|
|Pilotos de de Eco...|[piloto, ecopetro...|[piloto, ecopetro...|[NOUN, PROPN, AUX...|
| Los pilotos de d...|[piloto, ecopetro...|[piloto, ecopetro...|[NOUN, NOUN, AUX,...|
|Los que creen que...|[crear, va, quita...|[crear, va,

### Handling n-grams

In [26]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [27]:
# User defined function to join the list
udf_join_arr = F.udf(lambda x: ' '.join(x), T.StringType())

In [28]:
processed_texts = processed_texts.withColumn('finished_posTagger',  udf_join_arr(F.col('finished_posTagger')))

In [29]:
processed_texts.show()

+--------------------+--------------------+--------------------+--------------------+
|             CONTENT|     finished_1-gram|    finished_n-grams|  finished_posTagger|
+--------------------+--------------------+--------------------+--------------------+
|Los proyectos se ...|[proyecto, ubicar...|[proyecto, ubicar...|NOUN VERB NOUN NO...|
|Un peque침o pero c...|[peque침o, complet...|[peque침o, complet...|ADJ ADJ NOUN NOUN...|
|Colombia ha debid...|[colombia, haber,...|[colombia, haber,...|PROPN AUX VERB VE...|
|      Es una Mierda |            [mierda]|            [mierda]|               PROPN|
| El planeta sufre...|[planeta, sufrir,...|[planeta, sufrir,...|NOUN VERB NOUN AD...|
| Descargue aqu칤 e...|[descargue, aqu칤,...|[descargue, aqu칤,...|VERB ADV NOUN NOU...|
|Pilotos de de Eco...|[piloto, ecopetro...|[piloto, ecopetro...|NOUN PROPN AUX VE...|
| Los pilotos de d...|[piloto, ecopetro...|[piloto, ecopetro...|NOUN NOUN AUX VER...|
|Los que creen que...|[crear, va, quita...|[crear, va,

In [30]:
posDocumentAssembler = DocumentAssembler()\
    .setInputCol('finished_posTagger')\
    .setOutputCol('pos_document')

In [31]:
posTokenizer = Tokenizer()\
    .setInputCols('pos_document')\
    .setOutputCol('pos')

In [32]:
posNGrammer = NGramGenerator()\
    .setInputCols('pos')\
    .setOutputCol('pos_ngrams')\
    .setN(3)\
    .setEnableCumulative(True)\
    .setDelimiter('_')

In [33]:
posFinisher = Finisher()\
    .setInputCols(['pos', 'pos_ngrams'])

In [34]:
posPipeline = Pipeline()\
    .setStages([posDocumentAssembler,
                posTokenizer,
                posNGrammer,
                posFinisher])

In [35]:
processed_texts = posPipeline.fit(processed_texts).transform(processed_texts)

In [36]:
processed_texts.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             CONTENT|     finished_1-gram|    finished_n-grams|  finished_posTagger|        finished_pos| finished_pos_ngrams|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Los proyectos se ...|[proyecto, ubicar...|[proyecto, ubicar...|NOUN VERB NOUN NO...|[NOUN, VERB, NOUN...|[NOUN, VERB, NOUN...|
|Un peque침o pero c...|[peque침o, complet...|[peque침o, complet...|ADJ ADJ NOUN NOUN...|[ADJ, ADJ, NOUN, ...|[ADJ, ADJ, NOUN, ...|
|Colombia ha debid...|[colombia, haber,...|[colombia, haber,...|PROPN AUX VERB VE...|[PROPN, AUX, VERB...|[PROPN, AUX, VERB...|
|      Es una Mierda |            [mierda]|            [mierda]|               PROPN|             [PROPN]|             [PROPN]|
| El planeta sufre...|[planeta, sufrir,...|[planeta, sufrir,...|NOUN VERB NOUN AD...|[NOUN, VERB, NOUN..

### Filtering POSTags

These tags mark the core part-of-speech categories.

__Alphabetical listing__:
- ADJ: adjective (noun modifiers)
- ADP: adposition (preposiciones y postposiciones, e.g., in, to, during)
- ADV: adverb (verb -sometines also adjective- modifiers)
- AUX: auxiliary 
- CCONJ: coorinating conjuction (links words without subordination)
- DET: determiner
- INTJ: interjection
- NOUN: noun
- NUM: numeral
- PART: particle
- PRON: pronoun
- PROPN: proper noun
- PUNCT: punctuation
- SCONJ: subordinating conjunction
- SYM: symbol
- VERB: verb
- X: other

Reference: https://universaldependencies.org/u/pos/

#### Filtering 1-grams

In [37]:
allowed_cats = ['NUM', 'ADJ', 'NOUN', 'PROPN', 'VERB', 'ADV', 'X']

def filter_pos(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags) if pos in allowed_cats]

udf_filter_pos = F.udf(filter_pos, T.ArrayType(T.StringType()))

In [38]:
processed_texts = processed_texts.withColumn('filtered_1-gram', udf_filter_pos(F.col('finished_1-gram'), F.col('finished_pos')))

In [39]:
processed_texts.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             CONTENT|     finished_1-gram|    finished_n-grams|  finished_posTagger|        finished_pos| finished_pos_ngrams|     filtered_1-gram|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Los proyectos se ...|[proyecto, ubicar...|[proyecto, ubicar...|NOUN VERB NOUN NO...|[NOUN, VERB, NOUN...|[NOUN, VERB, NOUN...|[proyecto, ubicar...|
|Un peque침o pero c...|[peque침o, complet...|[peque침o, complet...|ADJ ADJ NOUN NOUN...|[ADJ, ADJ, NOUN, ...|[ADJ, ADJ, NOUN, ...|[peque침o, complet...|
|Colombia ha debid...|[colombia, haber,...|[colombia, haber,...|PROPN AUX VERB VE...|[PROPN, AUX, VERB...|[PROPN, AUX, VERB...|[colombia, debido...|
|      Es una Mierda |            [mierda]|            [mierda]|               PROPN|             [PROPN]|

#### Filtering n-grams

In [40]:
# Add punctuation

filter_3 = ['NUM_ADV_VERB', 'NUM_ADV_ADJ', 'NUM_ADJ_NOUN', 'NUM_NOUN_VERB', 'NUM_NOUN_ADJ',\
            'PROPN_PROPN_PROPN', 'PROPN_VERB_PROPN', 'PROPN_VERB_NOUN', 'PROPN_VERB_ADV', 'PROPN_ADJ_VERB', 'PROPN_ADV_ADJ', 'PROPN_ADV_VERB',\
            'NOUN_PROPN_PROPN', 'NOUM_VERB_NOUN', 'NOUN_VERB_PROPN', 'NOUN_PROPN_VERB', 'NOUM_VERB_NUM', 'NOUN_NUM_VERB', 'NOUN_VERB_ADV',\
            'VERB_ADJ_PROPN', 'VERB_ADJ_NOUN', 'VERB_PROPN_PROPN', 'VERB_NOUN_NOUN', 'VERB_NOUN_PROPN', 'VERB_NOUN_ADJ', 'VERB_PROPN_NOUN', 'VERB_NUM_NOUN', 'VERB_ADV_ADV',\
            'ADJ_NOUN_VERB', 'ADJ_PROPN_VERB', 'ADJ_PROPN_PROPN', 'ADJ_NOUN_PROPN', 'ADJ_PROPN_NOUN', 'ADJ_NOUN_NOUN', 'ADJ_NOUN_ADJ', 'ADJ_PROPN_ADJ', 'ADJ_VERB_NOUN',\
            'ADV_VERB_PROPN', 'ADV_VERB_NOUN']
filter_2 = ['PROPN_PROPN', 'PROPN_NOUN', 'PROPN_ADJ', 'PROPN_VERB',\
            'NOUN_NOUN', 'NOUN_PROPN', 'NOUN_VERB', 'NOUN_ADJ',\
            'NUM_NOUN',\
            'ADJ_NOUN', 'ADJ_PROPN',\
            'ADV_VERB', 'ADV_ADJ',\
            'VERB_NOUN', 'VERB_PROPN', 'VERB_ADV', 'VERB_ADJ']

def filter_pos_ngrams(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags)\
            if (len(pos.split('_')) == 3\
                and\
               pos in filter_3)\
           or (len(pos.split('_')) == 2\
              and\
              pos in filter_2)]

udf_filter_pos_combs = F.udf(filter_pos_ngrams, T.ArrayType(T.StringType()))

In [41]:
processed_texts = processed_texts.withColumn('filtered_ngrams',udf_filter_pos_combs(F.col('finished_n-grams'), F.col('finished_pos_ngrams')))
processed_texts.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             CONTENT|     finished_1-gram|    finished_n-grams|  finished_posTagger|        finished_pos| finished_pos_ngrams|     filtered_1-gram|     filtered_ngrams|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Los proyectos se ...|[proyecto, ubicar...|[proyecto, ubicar...|NOUN VERB NOUN NO...|[NOUN, VERB, NOUN...|[NOUN, VERB, NOUN...|[proyecto, ubicar...|[proyecto_ubicar,...|
|Un peque침o pero c...|[peque침o, complet...|[peque침o, complet...|ADJ ADJ NOUN NOUN...|[ADJ, ADJ, NOUN, ...|[ADJ, ADJ, NOUN, ...|[peque침o, complet...|[completo_an치lisi...|
|Colombia ha debid...|[colombia, haber,...|[colombia, haber,...|PROPN AUX VERB VE...|[PROPN, AUX, VERB...|[PROPN, AUX, VERB...|[colombia, debido...|[7

In [42]:
list([(x, y) for x,y in zip(processed_texts.select('finished_1-gram').toPandas()['finished_1-gram'].tolist()[0], processed_texts.select('finished_posTagger').toPandas()['finished_posTagger'].str.split(' ').tolist()[0])])

[('proyecto', 'NOUN'),
 ('ubicar', 'VERB'),
 ('desembocadura', 'NOUN'),
 ('r칤o', 'NOUN'),
 ('sogamoso', 'PROPN'),
 (',', 'PUNCT'),
 ('2', 'NUM'),
 ('recurso', 'NOUN'),
 ('importante', 'ADJ'),
 ('tener', 'VERB'),
 ('zona', 'NOUN'),
 ('humedal', 'ADJ'),
 ('ca침o', 'ADJ'),
 ('interconectar', 'VERB'),
 ('r칤os', 'PROPN'),
 ('ci칠naga', 'PROPN'),
 ('aun', 'SCONJ'),
 ('habitar', 'VERB'),
 ('manat칤', 'ADV')]

In [43]:
# processed_texts.select('filtered_ngrams').toPandas().filtered_ngrams.tolist()

# Joiner and Vectorizer

In [44]:
from pyspark.sql.functions import concat

processed_texts = processed_texts.withColumn('final', concat(F.col('filtered_1-gram'), F.col('filtered_ngrams')))

In [45]:
# TF: Term Frequency

from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol='final',\
                         outputCol='tf_features')
tf_model = tfizer.fit(processed_texts)
tf_result = tf_model.transform(processed_texts)

In [46]:
# tf_result.select('tf_features').toPandas().tf_features.tolist()

In [47]:
# IDF: Inverse Document Frequency

from pyspark.ml.feature import IDF

idfizer = IDF(inputCol='tf_features',\
              outputCol='tf_idf_features')
tfidf_result = idfizer.fit(tf_result).transform(tf_result)

In [48]:
# tfidf_result.select('tf_idf_features').toPandas().tf_idf_features.tolist()

# LDA

In [49]:
from pyspark.ml.clustering import LDA

num_topics = 6
max_iter = 10

lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features', seed=24)
lda_model = lda.fit(tfidf_result)

In [50]:
vocab = tf_model.vocabulary

def get_words(token_list):
    return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [51]:
num_top_words = 150


topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show()

+-----+--------------------+
|topic|          topicWords|
+-----+--------------------+
|    0|[hacer, agua, pet...|
|    1|[recurso, favor, ...|
|    2|[social, licencia...|
|    3|[ley, convenciona...|
|    4|[fracking, ecopet...|
|    5|[piloto, presiden...|
+-----+--------------------+



In [52]:
lda_model.describeTopics().show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[3, 5, 15, 1, 2, ...|[0.00201077417876...|
|    1|[43, 11, 136, 123...|[0.00172384546465...|
|    2|[41, 233, 105, 52...|[0.00152803965447...|
|    3|[20, 110, 17, 39,...|[0.00141001902481...|
|    4|[2, 103, 142, 4, ...|[0.00130768684606...|
|    5|[1, 31, 21, 48, 0...|[0.00178757147594...|
+-----+--------------------+--------------------+



In [53]:
topics.toPandas().iloc[1]['topicWords']

['recurso',
 'favor',
 'territorios',
 '2',
 'gobierno',
 'menos',
 'buscar',
 'vida',
 'pa칤s',
 'hacer',
 'venir',
 'proteger',
 'defender',
 'colombiano',
 'solo',
 'pueblo',
 'votar',
 'pasar',
 'vivir',
 'dar',
 'decir',
 'recurso_natural',
 'querer',
 'ecosistemas',
 'natural',
 'justificar',
 'cosa',
 'audiencia',
 'art칤culo',
 'representante',
 'dejar',
 'matar',
 'tal',
 'territorio',
 'campa침a',
 'verdad',
 'aprobar',
 'cambio',
 '1',
 'mas',
 'medio',
 'magdalena',
 'senador',
 'ciudadan칤a',
 'proyecto',
 'mensaje',
 'c치mara',
 'empresa',
 'hoy',
 'fuente',
 'p치ramo',
 'ley',
 'negocio',
 'econom칤a',
 'votar_favor',
 'crear',
 'contar',
 'nefasto',
 'multinacional',
 'megaminer칤a',
 'audiencia_p칰blico',
 'colombia',
 'apoyar',
 'amenazar',
 'informaci칩n',
 'tributario',
 'voto',
 'aprobaci칩n',
 'contrato',
 'onu',
 '3',
 'cesar',
 'departamento',
 'gobernar',
 'impacto',
 'apoyo',
 'promesa',
 'encima',
 'fracking',
 'nuevo',
 'foto',
 't칠cnico',
 'nacional',
 'piloto',
 'imp

# Stop Spark Context

In [54]:
spark.stop()

In [55]:
time.time()- t0

39.14262104034424