In [1]:
#!wget https://kdd.ics.uci.edu/databases/20newsgroups/20_newsgroups.tar.gz -o 20_newsgroups.tar.gz

#!tar -xvf 20_newsgroups.tar.gz.1

#!pip install nltk

#!pip install gensim

#!pip install scikit-learn

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, udf,rand
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import IntegerType
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.linalg.distributed import RowMatrix
import numpy as np
import pandas as pd
import findspark
import os
import re

from nltk.stem import WordNetLemmatizer
from gensim.utils import tokenize 
import nltk

findspark.init()



In [3]:
spark = SparkSession.builder.appName("lda").master("local[*]").config("spark.driver.maxResultSize", "4g").getOrCreate()
sc= spark.sparkContext
wnl = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('punkt')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/21 14:17:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Procesamiento de los datos

In [4]:

# Definir funciones de Transformacion

def preprocess_file(file_path):
    """
    Preprocesa un archivo de texto eliminando caracteres no alfanuméricos,
    saltos de línea, espacios dobles y tabulaciones. Extrae el contenido
    a partir de la línea siguiente a la primera ocurrencia de "Lines:" hasta
    el final del archivo.
    """
    with open(file_path, encoding="latin-1") as f:
        lines = f.readlines()
        s = 0
        for i, line in enumerate(lines):
            if "Lines:" in line:
                if s == 0:
                    s = i
        content = "".join([re.sub('[^a-zA-Z0-9.,\s]+', "", x.replace("\n", ".").replace("  ", ".").replace("\t","").replace("..","")) for x in lines[s + 4:]])
        return content

def feature_transformation(x):
    """
    Realiza la transformación de características en un texto dado.
    Tokeniza el texto, convierte las palabras a minúsculas, elimina las palabras
    con una longitud menor o igual a 3, lematiza las palabras y elimina aquellas
    que están en la lista de palabras vacías (Stopwords).
    """
    words = list(tokenize(str(x)))
    words = [word.lower() for word in words if len(word) > 3]
    words = [wnl.lemmatize(word) for word in words if word not in Stopwords]
    return words

# Cargar Stopwords

with open("Stopwords") as file:
    Stopwords = file.read().split("\n")


directory_path = "./20_newsgroups/"
#directory_path = "./20_newsgroups/20_newsgroups"
folders = os.listdir(directory_path)

dct_rdd = sc \
    .parallelize(folders) \
    .map(
        lambda folder: (
            folder, 
            [
                preprocess_file(os.path.join(directory_path, folder, file)) for file in os.listdir(os.path.join(directory_path, folder))[:100]
            ]
        )
    ) \
    .flatMap(lambda row: [(row[0], item) for item in row[1]]) \
    .map(lambda x : [x[0],x[1]])  \
    .map(lambda x : (x[0], feature_transformation(x)))

df = dct_rdd.toDF(["id","docs"])

random_sample = df.orderBy(rand()).limit(5)
random_sample.show()

                                                                                

+--------------------+--------------------+
|                  id|                docs|
+--------------------+--------------------+
|comp.os.ms-window...|[comp, window, mi...|
|  rec.sport.baseball|[sport, baseball,...|
|comp.os.ms-window...|[comp, window, mi...|
|           sci.crypt|[crypt, message, ...|
|comp.os.ms-window...|[comp, window, mi...|
+--------------------+--------------------+



# Vectorización

In [5]:

# Vectorizacion del texto

cv = CountVectorizer(inputCol = "docs", outputCol = "features")
count_vectorizer_model = cv.fit(df)
result = count_vectorizer_model.transform(df)
corpus = result.select("id", "features")

                                                                                

```
def python_func():
    dct = {}
    for folder in os.listdir("./20_newsgroups/20_newsgroups"):
        docs = []
        for file in os.listdir(f"./20_newsgroups/20_newsgroups/{folder}")[0:100]:
            with open(f"./20_newsgroups/20_newsgroups/{folder}/{file}", encoding = "latin-1") as f :
                doc = f.readlines()
                s = 0
                for i,j in zip(doc,range(len(doc))):
                    if "Lines:" in i:
                        if s == 0:
                            s= j
                doc = "".join([re.sub('[^a-zA-Z0-9.,\s]+', "", x.replace("\n", ".").lower().replace("  ", ".")) for x in doc[s+4:]])

            docs.append(doc)
        dct[folder] = docs
```

# LDA

In [6]:

# Dividir en set the entrenamiento y de prueba
corpus_train, corpus_test = corpus.randomSplit([0.7,0.3], seed=4000)

# Entrenar modelo LDA
lda_model = LDA(k= 10,seed=7,optimizer = "em", maxIter=10)
fitted_model = lda_model.fit(corpus_train)
topics = fitted_model.describeTopics()

                                                                                

In [7]:
# Probar el modelo ------ Encontrar el topico dominante 

fila_prueba = [corpus_test.select("features").first()]
document_topics = fitted_model.transform(spark.createDataFrame(fila_prueba))

def extract_dominant_topic(topic_distribution):
    return int(np.argmax(topic_distribution))
extract_dominant_topic_udf = udf(extract_dominant_topic, IntegerType())

df_with_dominant_topic = document_topics.withColumn(
    "Topico Dominante",
    extract_dominant_topic_udf("topicDistribution")
)


df_with_dominant_topic.show()

24/05/21 14:18:30 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/05/21 14:18:31 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/05/21 14:18:31 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/21 14:18:31 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


+--------------------+--------------------+----------------+
|            features|   topicDistribution|Topico Dominante|
+--------------------+--------------------+----------------+
|(41392,[0,1,2,3,4...|[0.11984254718708...|               7|
+--------------------+--------------------+----------------+



# Regresion Logistica Multimodal

In [8]:
# Corregir Df para que se ajuste al modelo

indexer = StringIndexer(inputCol="id", outputCol="label")
indexer_model = indexer.fit(corpus_train)
corpus_train_lr = indexer_model.transform(corpus_train)
corpus_test_lr = indexer_model.transform(corpus_test)

corpus_train_lr = corpus_train_lr.select(
    [
        "label",
        "features"
    ]
)
lr = LogisticRegression(
    maxIter=4, 
    regParam=0.3, 
    elasticNetParam=0.8
)
lr_multi_model = lr.fit(corpus_train_lr)
transformed_lr_data = lr_multi_model.transform(corpus_train_lr)

                                                                                

In [9]:
print("Coefficients: \n" + str(lr_multi_model.coefficientMatrix))
print("Intercept: " + str(lr_multi_model.interceptVector))
print("----------------------------------------------------------------------------------------------------------------")
trainingSummary = lr_multi_model.summary
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))


Coefficients: 
20 X 41392 CSRMatrix

Intercept: [0.11069798372035645,0.09797498783711699,0.0850879797438702,0.07203269374457213,0.07203269374459258,0.058804694513050794,0.03181191479130732,0.03181191479130724,0.031811914791311006,0.0040704242180268635,-0.010094242940290474,-0.010094242940310218,-0.0100942429403331,-0.024462321584411324,-0.039039710768368936,-0.05383256700729993,-0.08409068400014351,-0.08409068400012712,-0.09956968406646577,-0.18076882164776137]
----------------------------------------------------------------------------------------------------------------


[Stage 321:>                                                        (0 + 2) / 2]

Accuracy: 0.05571227080394922
FPR: 0.05571227080394922
TPR: 0.05571227080394922
F-measure: 0.005880119430209736
Precision: 0.003103857118132573
Recall: 0.05571227080394922


                                                                                