In [1]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import *

# Ingesta de datos de hdsf en dataframes

In [2]:
csv = '/user/jlondo97/datasets/articles1.csv'
df1 = spark.read.csv(csv,inferSchema=True,header=True)
csv = '/user/jlondo97/datasets/articles2.csv'
df2 = spark.read.csv(csv,inferSchema=True,header=True)
csv = '/user/jlondo97/datasets/articles3.csv'
df3 = spark.read.csv(csv,inferSchema=True,header=True)
# df1.show()
# df2.show()
# df3.show()

In [11]:
join_1_df_2 = df1.join(df2, on=['_c0', 'id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url', 'content'], how='left_outer')
full_df = join_1_df_2.join(df3, on=['_c0', 'id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url', 'content'], how='left_outer')
full_df = full_df.limit(410)
full_df.show()

+-----+-----+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
|  _c0|   id|               title|   publication|              author|      date|  year|month| url|             content|
+-----+-----+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
|10092|28828|Watch: Amazon Bos...|     Breitbart|         Nate Church|2017-03-21|2017.0|  3.0|null|At the MARS 2017 ...|
|10101|28837|Patriots Owner Ro...|     Breitbart|         Trent Baker|2017-02-03|2017.0|  2.0|null|”I remember who t...|
|10236|28972|Report: George So...|     Breitbart|         Aaron Klein|2017-01-23|2017.0|  1.0|null|Billionaire Georg...|
|10513|29249|Peter Schweizer: ...|     Breitbart|        John Hayward|2017-05-26|2017.0|  5.0|null|On Friday’s Breit...|
|10608|29344|Mexican Border St...|     Breitbart|   Cartel Chronicles|2017-02-19|2017.0|  2.0|null|PIEDRAS NEGRAS, C...|
|10646|29382|Maxine Waters: ’D..

# Limpieza del DataFrame
Creando un dataframe que contenga los contedidos de las publicaciones hechas y limpiando el contenido de caracteres especiales.

In [12]:
reg = '[^a-zA-Z ]'
reg1 = '[\s*]{1,}'
reg2 = '[\|’]'

In [13]:
full_df = full_df.withColumn("clean0", regexp_replace('Content', reg ,""))
full_df = full_df.withColumn("clean1", regexp_replace('clean0', reg1 ," "))
full_df = full_df.withColumn("clean2", regexp_replace('clean1', reg2, ""))
full_df = full_df.withColumn("ltrimmed_word",ltrim(col("clean2")))

# full_copy = full_df.withColumn("ltrimmed_word",ltrim(col("Content")))
# full_df = full_copy.withColumn("clean", regexp_replace('ltrimmed_word', reg ,""))
# full_df.select('ltrimmed_word','clean').show(100)

## Tokenización de los contenidos de las publicaciones
Creacion de un dataframe con el contenido de la publicacion tokenizado 

In [14]:
tokenization=Tokenizer(inputCol='ltrimmed_word',outputCol='tokens')

In [15]:
tokenized_df = tokenization.transform(full_df)

In [16]:
tokenized_df.select('Content', 'ltrimmed_word','tokens').show(410)

+--------------------+--------------------+--------------------+
|             Content|       ltrimmed_word|              tokens|
+--------------------+--------------------+--------------------+
|At the MARS 2017 ...|At the MARS tech ...|[at, the, mars, t...|
|”I remember who t...|I remember who th...|[i, remember, who...|
|Billionaire Georg...|Billionaire Georg...|[billionaire, geo...|
|On Friday’s Breit...|On Fridays Breitb...|[on, fridays, bre...|
|PIEDRAS NEGRAS, C...|PIEDRAS NEGRAS Co...|[piedras, negras,...|
|Saturday at a tow...|Saturday at a tow...|[saturday, at, a,...|
|WASHINGTON  —   P...|WASHINGTON Presid...|[washington, pres...|
|White House press...|White House press...|[white, house, pr...|
|Anthem, the natio...|Anthem the nation...|[anthem, the, nat...|
|Sunday on ABC’s “...|Sunday on ABCs Th...|[sunday, on, abcs...|
|Speaker Ryan’s pl...|Speaker Ryans pla...|[speaker, ryans, ...|
|Conservative fire...|Conservative fire...|[conservative, fi...|
|Sunday on NBC’s “...|Sun

## Eliminar stopWords
Eliminación de stopWord en la columna de contenido de las publicaciones, token tales como "I, and .or"

In [17]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [18]:
refined_df=stopword_removal.transform(tokenized_df)

In [20]:
refined_df.select('ltrimmed_word','tokens','refined_tokens').show(410)

+--------------------+--------------------+--------------------+
|       ltrimmed_word|              tokens|      refined_tokens|
+--------------------+--------------------+--------------------+
|At the MARS tech ...|[at, the, mars, t...|[mars, tech, conf...|
|I remember who th...|[i, remember, who...|[remember, people...|
|Billionaire Georg...|[billionaire, geo...|[billionaire, geo...|
|On Fridays Breitb...|[on, fridays, bre...|[fridays, breitba...|
|PIEDRAS NEGRAS Co...|[piedras, negras,...|[piedras, negras,...|
|Saturday at a tow...|[saturday, at, a,...|[saturday, town, ...|
|WASHINGTON Presid...|[washington, pres...|[washington, pres...|
|White House press...|[white, house, pr...|[white, house, pr...|
|Anthem the nation...|[anthem, the, nat...|[anthem, nations,...|
|Sunday on ABCs Th...|[sunday, on, abcs...|[sunday, abcs, we...|
|Speaker Ryans pla...|[speaker, ryans, ...|[speaker, ryans, ...|
|Conservative fire...|[conservative, fi...|[conservative, fi...|
|Sunday on NBCs Me...|[su

## Vectorización del DataFrame

In [21]:
refined_df

DataFrame[_c0: string, id: string, title: string, publication: string, author: string, date: string, year: string, month: string, url: string, content: string, clean0: string, clean1: string, clean2: string, ltrimmed_word: string, tokens: array<string>, refined_tokens: array<string>]

In [22]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import IntegerType

In [23]:
len_udf = udf(lambda s: len(s), IntegerType())
refined_df = refined_df.withColumn("token_count", len_udf(col('refined_tokens')))

In [24]:
refined_df.select('ltrimmed_word','tokens','refined_tokens','token_count').show(410)

+--------------------+--------------------+--------------------+-----------+
|       ltrimmed_word|              tokens|      refined_tokens|token_count|
+--------------------+--------------------+--------------------+-----------+
|At the MARS tech ...|[at, the, mars, t...|[mars, tech, conf...|        233|
|I remember who th...|[i, remember, who...|[remember, people...|         61|
|Billionaire Georg...|[billionaire, geo...|[billionaire, geo...|        392|
|On Fridays Breitb...|[on, fridays, bre...|[fridays, breitba...|        551|
|PIEDRAS NEGRAS Co...|[piedras, negras,...|[piedras, negras,...|        192|
|Saturday at a tow...|[saturday, at, a,...|[saturday, town, ...|         51|
|WASHINGTON Presid...|[washington, pres...|[washington, pres...|        905|
|White House press...|[white, house, pr...|[white, house, pr...|        175|
|Anthem the nation...|[anthem, the, nat...|[anthem, nations,...|        174|
|Sunday on ABCs Th...|[sunday, on, abcs...|[sunday, abcs, we...|        106|

## Agrupación de textos

In [25]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF
from pyspark.ml.clustering import LDA, KMeans

In [26]:
aux_df = refined_df
aux_tf_idf = refined_df
aux_df = aux_df.drop('publication', 'author', 'publication', 'title', 'date', 'year', 'month', 'url', 'clean', 'clean0', 'clean1', 'clean2', 'tokens')
aux_df.show(410)

+-----+-----+--------------------+--------------------+--------------------+-----------+
|  _c0|   id|             content|       ltrimmed_word|      refined_tokens|token_count|
+-----+-----+--------------------+--------------------+--------------------+-----------+
|10092|28828|At the MARS 2017 ...|At the MARS tech ...|[mars, tech, conf...|        233|
|10101|28837|”I remember who t...|I remember who th...|[remember, people...|         61|
|10236|28972|Billionaire Georg...|Billionaire Georg...|[billionaire, geo...|        392|
|10513|29249|On Friday’s Breit...|On Fridays Breitb...|[fridays, breitba...|        551|
|10608|29344|PIEDRAS NEGRAS, C...|PIEDRAS NEGRAS Co...|[piedras, negras,...|        192|
|10646|29382|Saturday at a tow...|Saturday at a tow...|[saturday, town, ...|         51|
|  109|17410|WASHINGTON  —   P...|WASHINGTON Presid...|[washington, pres...|        905|
|11015|29751|White House press...|White House press...|[white, house, pr...|        175|
|11085|29821|Anthem, 

In [27]:
fill = array().cast("array<string>")
tokens_a = when(col("refined_tokens").isNull(), fill).otherwise(col("refined_tokens"))
aux_df = aux_df.withColumn("refined_tokens", tokens_a)

In [31]:
cv = CountVectorizer(inputCol="refined_tokens", outputCol="rawFeatures")
cvmodel = cv.fit(aux_df)
featurizedData = cvmodel.transform(aux_df)
vocab = cvmodel.vocabulary
vocab
# vocab_broadcast = sc.broadcast(vocab)
# idf = IDF(inputCol="rawFeatures", outputCol="features")
# idfModel = idf.fit(featurizedData)
# rescaledData = idfModel.transform(featurizedData)
# rescaledData.select("refined_tokens", "features").show(100)

['said',
 'trump',
 'mr',
 'one',
 'people',
 'president',
 'new',
 'also',
 'clinton',
 'like',
 'told',
 'news',
 'time',
 'states',
 'state',
 'twitter',
 'two',
 'police',
 'us',
 'years',
 'first',
 'many',
 'trumps',
 'according',
 'even',
 'last',
 'country',
 'american',
 'campaign',
 'house',
 'u',
 'breitbart',
 'say',
 'white',
 'think',
 'election',
 'year',
 'make',
 'united',
 'donald',
 'law',
 'get',
 'may',
 'obama',
 'national',
 'cnn',
 'says',
 'government',
 'know',
 'made',
 'going',
 'since',
 'political',
 'public',
 'hillary',
 'dont',
 'want',
 'back',
 'media',
 'former',
 'day',
 'republican',
 'well',
 'take',
 'world',
 'way',
 'security',
 'china',
 'september',
 'york',
 'called',
 'women',
 'go',
 'court',
 'percent',
 'follow',
 'ms',
 'city',
 'including',
 'party',
 'three',
 'still',
 'officials',
 'presidential',
 'much',
 'long',
 'part',
 'another',
 'right',
 'week',
 'around',
 'show',
 'reported',
 'group',
 'times',
 'administration',
 'never

In [32]:
hashingTF = HashingTF(inputCol="refined_tokens", outputCol="rawFeatures")
tf_df = hashingTF.transform(aux_tf_idf)
tf_df.select(['refined_tokens', 'rawFeatures']).show(410)
# kmeans = KMeans(k=25)
# lda = LDA(k=25, seed=123, optimizer="em", featuresCol="features")
# pipeline = Pipeline(stages=[hashingTF, idf])
# pipeline = Pipeline(stages=[hashingTF, idf, kmeans, lda])
# model = pipeline.fit(aux_df)
# results = model.transform(aux_df)
# results.cache()
# results.withColumn("aux", print_columns("refined_tokens")) # .select("aux").show()
# results.filter("prediction = 13").select('refined_tokens', 'features', 'prediction').show(100)

+--------------------+--------------------+
|      refined_tokens|         rawFeatures|
+--------------------+--------------------+
|[rayshell, byers,...|(262144,[619,813,...|
|[sunday, cbss, fa...|(262144,[3924,678...|
|[president, donal...|(262144,[5381,576...|
|[sunday, cbss, fa...|(262144,[6369,761...|
|[thursday, editio...|(262144,[4954,761...|
|[never, trump, mo...|(262144,[14,619,1...|
|[man, memphis, te...|(262144,[619,1727...|
|[know, going, die...|(262144,[1727,184...|
|[georgia, lawmake...|(262144,[513,2548...|
|[pres, trump, tap...|(262144,[6781,761...|
|[according, polit...|(262144,[440,4872...|
|[report, week, cn...|(262144,[1846,232...|
|[death, american,...|(262144,[1846,243...|
|[mother, american...|(262144,[836,994,...|
|[years, old, yet,...|(262144,[571,991,...|
|[times, israel, r...|(262144,[8053,113...|
|[exit, memorandum...|(262144,[632,1769...|
|[hot, topic, spor...|(262144,[3326,410...|
|[fridays, broadca...|(262144,[1156,232...|
|[barack, obama, b...|(262144,[3

In [33]:
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
ft_idf = idf.fit(tf_df).transform(tf_df)
ft_idf.select(['refined_tokens', 'features']).show(410)

+--------------------+--------------------+
|      refined_tokens|            features|
+--------------------+--------------------+
|[rayshell, byers,...|(262144,[619,813,...|
|[sunday, cbss, fa...|(262144,[3924,678...|
|[president, donal...|(262144,[5381,576...|
|[sunday, cbss, fa...|(262144,[6369,761...|
|[thursday, editio...|(262144,[4954,761...|
|[never, trump, mo...|(262144,[14,619,1...|
|[man, memphis, te...|(262144,[619,1727...|
|[know, going, die...|(262144,[1727,184...|
|[georgia, lawmake...|(262144,[513,2548...|
|[pres, trump, tap...|(262144,[6781,761...|
|[according, polit...|(262144,[440,4872...|
|[report, week, cn...|(262144,[1846,232...|
|[death, american,...|(262144,[1846,243...|
|[mother, american...|(262144,[836,994,...|
|[years, old, yet,...|(262144,[571,991,...|
|[times, israel, r...|(262144,[8053,113...|
|[exit, memorandum...|(262144,[632,1769...|
|[hot, topic, spor...|(262144,[3326,410...|
|[fridays, broadca...|(262144,[1156,232...|
|[barack, obama, b...|(262144,[3

In [None]:
# results.select('refined_tokens', 'features', 'prediction').show(410)

In [42]:
kmeans = KMeans(k=25)
kmeans_model = kmeans.fit(ft_idf).transform(ft_idf)
kmeans_model.select('refined_tokens', 'features', 'prediction').show(410)
# lda = LDA(k=25, seed=123, optimizer="em", featuresCol="features")
# pipeline = Pipeline(stages=[hashingTF, idf])
# pipeline = Pipeline(stages=[hashingTF, idf, kmeans, lda])
# model = pipeline.fit(aux_df)
# results = model.transform(aux_df)

+--------------------+--------------------+----------+
|      refined_tokens|            features|prediction|
+--------------------+--------------------+----------+
|[mars, tech, conf...|(262144,[4054,487...|        13|
|[remember, people...|(262144,[408,6051...|        13|
|[billionaire, geo...|(262144,[1973,232...|        13|
|[fridays, breitba...|(262144,[14,619,6...|        13|
|[piedras, negras,...|(262144,[2281,271...|        13|
|[saturday, town, ...|(262144,[7612,102...|        13|
|[washington, pres...|(262144,[619,646,...|        13|
|[white, house, pr...|(262144,[7612,102...|        13|
|[anthem, nations,...|(262144,[2801,606...|        13|
|[sunday, abcs, we...|(262144,[1667,410...|        13|
|[speaker, ryans, ...|(262144,[4871,991...|        13|
|[conservative, fi...|(262144,[3834,910...|        13|
|[sunday, nbcs, me...|(262144,[6781,761...|        13|
|[tel, aviv, jewis...|(262144,[1076,197...|        13|
|[although, nra, a...|(262144,[622,4200...|        13|
|[friday, 

In [43]:
# ldamodel.isDistributed()
# ldamodel.vocabSize()
lda = LDA(k=25, seed=100, optimizer="em", featuresCol="features")
lda_model = lda.fit(kmeans_model).transform(kmeans_model)
lda_model.select('refined_tokens', 'features', 'prediction', 'topicDistribution').show(410)

+--------------------+--------------------+----------+--------------------+
|      refined_tokens|            features|prediction|   topicDistribution|
+--------------------+--------------------+----------+--------------------+
|[rayshell, byers,...|(262144,[619,813,...|        13|[0.23323193011050...|
|[sunday, cbss, fa...|(262144,[3924,678...|        13|[0.02174397934324...|
|[president, donal...|(262144,[5381,576...|        13|[0.03734683155343...|
|[sunday, cbss, fa...|(262144,[6369,761...|        13|[0.02451597377683...|
|[thursday, editio...|(262144,[4954,761...|        13|[0.01685743473220...|
|[never, trump, mo...|(262144,[14,619,1...|        13|[0.05338721118759...|
|[man, memphis, te...|(262144,[619,1727...|        13|[0.13725612397587...|
|[know, going, die...|(262144,[1727,184...|        13|[0.07805902519548...|
|[georgia, lawmake...|(262144,[513,2548...|        13|[0.77088039618029...|
|[pres, trump, tap...|(262144,[6781,761...|        13|[0.06082702000824...|
|[according,

In [44]:
lda_model.groupBy('prediction').count().show(410)

+----------+-----+
|prediction|count|
+----------+-----+
|        13|  392|
|         5|    3|
|         4|    2|
|         8|    5|
|        14|    1|
|         0|    2|
|         6|    1|
|        18|    1|
|        21|    1|
|         3|    1|
|        15|    1|
+----------+-----+



In [None]:
# ldatopics = lda_model.describeTopics()
# ldatopics.show(410)

In [None]:
# results.select('refined_tokens', 'features', 'prediction').show(100, truncate = True) #.count()