In [1]:
import pandas as pd
import numpy as np
import findspark
import glob
import os
import re
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [2]:
import spacy

# Inicialización de SPARK

In [3]:
import findspark
localizacion_spark = '/opt/spark-2.4.5'
findspark.init(localizacion_spark)

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession

# numero de cores: 4, memoria ram que se le permite a spark usar: 7GB
spark_configurations = SparkConf()\
    .setMaster('local[4]')\
    .setAppName('Tarea_1')\
    .set("spark.driver.memory", "7g")

sc = pyspark.SparkContext(conf = spark_configurations)


spark = SparkSession\
    .builder\
    .master('local[4]')\
    .appName("Tarea_1") \
    .getOrCreate()

# Lectura de datos

In [5]:
def documentReaderSpark(data_path:str, sparkContext):
    """
    DocString
    :return: Nothing
    """
    documents = sc\
        .wholeTextFiles(data_path,
                        minPartitions=None, 
                        use_unicode=True)\
        .map(lambda s: (re.search('<public publicId="(.*?)" uri="(.*?)" />',s[1]).group(1),
                        s[1].replace("\n","")\
                            .replace("\xa0"," "))
            )\
        .map(lambda s: (s[0],re.search('<raw><!\[CDATA\[(.*?)\]\]></raw>',s[1]).group(1)))
    print(documents.collect()[0])
    return documents
documents_path = os.path.join('docs', 'docs-raw-texts')
documents = documentReaderSpark(documents_path, sc)

('d223', 'Well, I Didn’t Know it was Hard – Happy Birthday Ivan Sutherland.Ivan Sutherland’s Sketchpad (1963) Happy Birthday 74th Ivan Sutherland! The American computer scientist and Internet pioneer has received the Turing Award from the Association for Computing Machinery in 1988 for his invention of Sketchpad, an early predecessor to the sort of graphical user interface that has become ubiquitous in personal computers today. Sketchpad could accept constraints and specified relationships among segments and arcs, including the diameter of arcs. It could draw both horizontal and vertical lines and combine them into figures and shapes. Figures could be copied, moved, rotated, or resized, retaining their basic properties. Sketchpad also had the first window-drawing program and clipping algorithm, which allowed zooming. When asked, “How could you possibly have done the first interactive graphics program, the first non-procedural programming language, the first object oriented software sys

In [6]:
def tokenizationSpark(documents_rdd, use_spacy=False):
    """
    :param documentos:
    :return:
    """
    if use_spacy:
        nlp_spacy_en = spacy.load('en_core_web_sm')
        nltk_lemmaList = documents_rdd\
            .map(lambda s : (s[0], nlp_spacy_en(s[1])))\
            .flatMap(lambda s : [(lemma,s[0]) for lemma in [token.lemma_ for token in s[1]]
                                 if nlp_spacy_en.vocab[lemma].is_stop == False
                                 and nlp_spacy_en.vocab[lemma].is_punct == False])\
            .map(lambda t : ((t[0], t[1]),1))
    else:
        nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
        p_stemmer = nltk.stem.porter.PorterStemmer()
        wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

        nltk_lemmaList = documents_rdd\
            .map(lambda s : (s[0], [word for word in nltk.word_tokenize(s[1]) 
                                    if word.isalnum()]))\
            .flatMap(lambda s : [(token,s[0]) for token in s[1] 
                                 if token not in nltk_stop_words_en])\
            .map(lambda s : ((wordnet_lemmatizer.lemmatize(s[0]), s[1]),1))
    
    print(nltk_lemmaList.filter(lambda x : 'd223'==x[0][1]).collect())
    return nltk_lemmaList
tokenized_docs = tokenizationSpark(documents)


[(('Well', 'd223'), 1), (('I', 'd223'), 1), (('Didn', 'd223'), 1), (('Know', 'd223'), 1), (('Hard', 'd223'), 1), (('Happy', 'd223'), 1), (('Birthday', 'd223'), 1), (('Ivan', 'd223'), 1), (('Sutherland', 'd223'), 1), (('Sketchpad', 'd223'), 1), (('1963', 'd223'), 1), (('Happy', 'd223'), 1), (('Birthday', 'd223'), 1), (('74th', 'd223'), 1), (('Ivan', 'd223'), 1), (('Sutherland', 'd223'), 1), (('The', 'd223'), 1), (('American', 'd223'), 1), (('computer', 'd223'), 1), (('scientist', 'd223'), 1), (('Internet', 'd223'), 1), (('pioneer', 'd223'), 1), (('received', 'd223'), 1), (('Turing', 'd223'), 1), (('Award', 'd223'), 1), (('Association', 'd223'), 1), (('Computing', 'd223'), 1), (('Machinery', 'd223'), 1), (('1988', 'd223'), 1), (('invention', 'd223'), 1), (('Sketchpad', 'd223'), 1), (('early', 'd223'), 1), (('predecessor', 'd223'), 1), (('sort', 'd223'), 1), (('graphical', 'd223'), 1), (('user', 'd223'), 1), (('interface', 'd223'), 1), (('become', 'd223'), 1), (('ubiquitous', 'd223'), 1),

In [7]:
def makeInvertedIndexSpark(tokenized_documents_rdd):
    """
    :param documentos:
    :return:
    """
    inverted_index = tokenized_documents_rdd\
        .reduceByKey(lambda a, b : a+b )\
        .map(lambda s : (s[0][0], [[s[0][1],s[1]]]))\
        .reduceByKey(lambda a, b : sorted(a+b) )
    print(inverted_index.filter(lambda x : 'program'==x[0]).collect())
    return inverted_index
inverted_index = makeInvertedIndexSpark(tokenized_docs)

[('program', [['d020', 1], ['d048', 1], ['d071', 1], ['d089', 1], ['d110', 1], ['d123', 1], ['d124', 1], ['d142', 1], ['d182', 2], ['d193', 1], ['d194', 3], ['d195', 4], ['d198', 1], ['d199', 1], ['d200', 1], ['d201', 12], ['d203', 1], ['d206', 1], ['d207', 2], ['d208', 4], ['d209', 1], ['d210', 4], ['d214', 1], ['d215', 1], ['d219', 1], ['d223', 2], ['d241', 1], ['d249', 1], ['d251', 1], ['d253', 1], ['d266', 1], ['d276', 1], ['d280', 1], ['d301', 1], ['d303', 1], ['d304', 1], ['d315', 1], ['d319', 1], ['d322', 1]])]


In [8]:
# se termina la sesion de spark
sc.stop()