In [70]:
import pandas as pd
import numpy as np
import findspark
import glob
import os
import re
import nltk
import pickle
import spacy
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# Inicialización de SPARK

In [3]:
import os
import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine's layout; fall back to the original install location.
localizacion_spark = os.environ.get('SPARK_HOME', '/opt/spark-2.4.5')
findspark.init(localizacion_spark)

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Local mode with 4 cores; the Spark driver is allowed to use up to 7 GB of RAM.
spark_configurations = SparkConf()\
    .setMaster('local[4]')\
    .setAppName('Tarea_1')\
    .set("spark.driver.memory", "7g")

# getOrCreate() keeps this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises ValueError.
sc = pyspark.SparkContext.getOrCreate(conf=spark_configurations)

# Build the session from the same configuration instead of repeating the
# master / application name by hand.
spark = SparkSession\
    .builder\
    .config(conf=spark_configurations)\
    .getOrCreate()

# Creación del índice

In [80]:
def documentReaderSpark(data_path, sparkContext):
    """
    Reads every file under ``data_path`` into an RDD of ``(document id, text)`` pairs.

    Each file is loaded whole with ``wholeTextFiles``, so each record of the RDD is a
    single document. The document id is taken from the ``publicId`` attribute of the
    ``<public publicId="..." uri="..." />`` tag (the letter ``d`` is removed and the rest
    is cast to ``int``). The text is the content of ``<raw><![CDATA[...]]></raw>`` after
    newlines have been removed and non-breaking spaces replaced by regular spaces.

    Note: if a file lacks either tag, ``re.search`` returns ``None`` and the job fails
    when the RDD is evaluated.

    :param data_path: path of the folder where all the documents are located
    :param sparkContext: initialized SparkContext() used to read the files
    :return: RDD of ``(int document id, str raw text)`` tuples
    """
    # Raw strings: the original literals relied on invalid escape sequences such as "\[".
    public_id_pattern = r'<public publicId="(.*?)" uri="(.*?)" />'
    raw_text_pattern = r'<raw><!\[CDATA\[(.*?)\]\]></raw>'

    # Use the context that was passed in rather than a notebook-level global.
    documents = sparkContext\
        .wholeTextFiles(data_path,
                        minPartitions=None,
                        use_unicode=True)\
        .map(lambda s: (re.search(public_id_pattern, s[1]).group(1),
                        # Newlines are dropped so that "." in the CDATA pattern can span
                        # what used to be several lines.
                        s[1].replace("\n", "")
                            .replace("\xa0", " ")))\
        .map(lambda s: (int(s[0].replace('d', '')),
                        re.search(raw_text_pattern, s[1]).group(1)))
    return documents

In [81]:
def tokenizationSpark(documents_rdd, use_spacy=False):
    """
    Tokenizes, removes stop words, normalizes and lemmatizes the documents.

    With ``use_spacy=True`` every document is run through spaCy's ``en_core_web_sm``
    model and the token lemmas are kept unless the vocabulary marks them as stop words
    or punctuation. Otherwise NLTK is used: the text is word-tokenized, non-alphanumeric
    tokens are discarded, tokens are lower-cased, English stop words are removed and the
    remaining tokens are lemmatized with WordNet.

    :param documents_rdd: RDD of ``(document id, text)`` pairs
    :param use_spacy: Boolean used to specify the use of the package Spacy. By default uses
    nltk
    :return: RDD of ``((term, document id), 1)`` records, one per kept token occurrence
    """
    if use_spacy:
        nlp_spacy_en = spacy.load('en_core_web_sm')
        term_occurrences = documents_rdd\
            .map(lambda s: (s[0], nlp_spacy_en(s[1])))\
            .flatMap(lambda s: [(lemma, s[0]) for lemma in [token.lemma_ for token in s[1]]
                                if not nlp_spacy_en.vocab[lemma].is_stop
                                and not nlp_spacy_en.vocab[lemma].is_punct])\
            .map(lambda t: ((t[0], t[1]), 1))
    else:
        nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
        wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

        term_occurrences = documents_rdd\
            .map(lambda s: (s[0], [word.lower() for word in nltk.word_tokenize(s[1])
                                   # Lower-casing happens before the stop-word filter:
                                   # the stop-word list is lower-case, so "The" would
                                   # otherwise be indexed as a term.
                                   if word.isalnum()]))\
            .flatMap(lambda s: [(token, s[0]) for token in s[1]
                                if token not in nltk_stop_words_en])\
            .map(lambda s: ((wordnet_lemmatizer.lemmatize(s[0]), s[1]), 1))

    return term_occurrences


In [82]:
def makeInvertedIndexSpark(tokenized_documents_rdd, output_path=None):
    """
    Creates the inverted index and saves it in pickle format.

    The result maps every term to ``{'freq': ..., 'posting': ...}`` where ``posting`` is a
    list of ``[document id, term frequency in that document]`` pairs sorted by document
    id, and ``freq`` is the length of that list (the number of documents containing the
    term). Terms are inserted in sorted order.

    :param tokenized_documents_rdd: RDD of ``((term, document id), 1)`` records
    :param output_path: file where the pickled index is written. Defaults to
        ``docs/inverted_index.pkl``; missing parent directories are created.
    :return: Dictionary of the inverted index
    """
    if output_path is None:
        output_path = os.path.join('docs', 'inverted_index.pkl')

    index_entries = tokenized_documents_rdd\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda s: (s[0][0], [[s[0][1], s[1]]]))\
        .reduceByKey(lambda a, b: sorted(a + b))\
        .sortBy(lambda s: s[0])\
        .map(lambda s: (s[0], {'freq': len(s[1]), 'posting': s[1]}))\
        .collect()
    # Terms are unique after the second reduceByKey, so dict() cannot drop entries.
    respuesta = dict(index_entries)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'wb') as handle:
        pickle.dump(respuesta, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return respuesta

In [None]:
# Build the inverted index: read the raw documents, tokenize them, then aggregate.
# Folder containing the raw document files.
documents_path = os.path.join('docs', 'docs-raw-texts')
# RDD of (document id, text) pairs read through the SparkContext `sc`.
documents = documentReaderSpark(documents_path, sc)
# use_spacy is left at its default (False), so the NLTK pipeline is used.
tokenized_docs = tokenizationSpark(documents)
# Returns the index as a dictionary and also pickles it to disk.
inverted_index = makeInvertedIndexSpark(tokenized_docs)

In [78]:
# Stop the SparkContext to end the Spark session.
sc.stop()