In [1]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import *

# Ingesta de datos de hdsf en dataframes

In [2]:
csv = '/user/jlondo97/datasets/articles1.csv'
df1 = spark.read.csv(csv,inferSchema=True,header=True)
csv = '/user/jlondo97/datasets/articles2.csv'
df2 = spark.read.csv(csv,inferSchema=True,header=True)
csv = '/user/jlondo97/datasets/articles3.csv'
df3 = spark.read.csv(csv,inferSchema=True,header=True)

In [3]:
join_1_df_2 = df1.join(df2, on=['_c0', 'id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url', 'content'], how='left_outer')
full_df = join_1_df_2.join(df3, on=['_c0', 'id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url', 'content'], how='left_outer')
full_df = full_df.limit(400)
full_df.show()

+-----+-----+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
|  _c0|   id|               title|   publication|              author|      date|  year|month| url|             content|
+-----+-----+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
|10092|28828|Watch: Amazon Bos...|     Breitbart|         Nate Church|2017-03-21|2017.0|  3.0|null|At the MARS 2017 ...|
|10101|28837|Patriots Owner Ro...|     Breitbart|         Trent Baker|2017-02-03|2017.0|  2.0|null|”I remember who t...|
|10236|28972|Report: George So...|     Breitbart|         Aaron Klein|2017-01-23|2017.0|  1.0|null|Billionaire Georg...|
|10513|29249|Peter Schweizer: ...|     Breitbart|        John Hayward|2017-05-26|2017.0|  5.0|null|On Friday’s Breit...|
|10608|29344|Mexican Border St...|     Breitbart|   Cartel Chronicles|2017-02-19|2017.0|  2.0|null|PIEDRAS NEGRAS, C...|
|10646|29382|Maxine Waters: ’D..

# Limpieza del DataFrame
Creando un dataframe que contenga los contedidos de las publicaciones hechas y limpiando el contenido de caracteres especiales.

In [4]:
reg = '[^a-zA-Z ]'
reg1 = '[\s*]{1,}'
reg2 = '[\|’]'

In [5]:
full_df = full_df.withColumn("clean0", regexp_replace('Content', reg ,""))
full_df = full_df.withColumn("clean1", regexp_replace('clean0', reg1 ," "))
full_df = full_df.withColumn("clean2", regexp_replace('clean1', reg2, ""))
full_df = full_df.withColumn("ltrimmed_word",ltrim(col("clean2")))
full_df.select('Content', 'ltrimmed_word').show(440)

+--------------------+--------------------+
|             Content|       ltrimmed_word|
+--------------------+--------------------+
|At the MARS 2017 ...|At the MARS tech ...|
|”I remember who t...|I remember who th...|
|Billionaire Georg...|Billionaire Georg...|
|On Friday’s Breit...|On Fridays Breitb...|
|PIEDRAS NEGRAS, C...|PIEDRAS NEGRAS Co...|
|Saturday at a tow...|Saturday at a tow...|
|WASHINGTON  —   P...|WASHINGTON Presid...|
|White House press...|White House press...|
|Anthem, the natio...|Anthem the nation...|
|Sunday on ABC’s “...|Sunday on ABCs Th...|
|Speaker Ryan’s pl...|Speaker Ryans pla...|
|Conservative fire...|Conservative fire...|
|Sunday on NBC’s “...|Sunday on NBCs Me...|
|TEL AVIV  —   A J...|TEL AVIV A Jewish...|
|Although the NRA ...|Although the NRA ...|
|Friday on Hugh He...|Friday on Hugh He...|
|While the protest...|While the protest...|
|At a Tuesday pres...|At a Tuesday pres...|
|Players that prot...|Players that prot...|
|  magazine Teen V...|magazine T

## Tokenización de los contenidos de las publicaciones
Creacion de un dataframe con el contenido de la publicacion tokenizado 

In [6]:
tokenization=Tokenizer(inputCol='ltrimmed_word',outputCol='tokens')

In [7]:
tokenized_df = tokenization.transform(full_df)

In [8]:
tokenized_df.select('Content', 'ltrimmed_word','tokens').show()

+--------------------+--------------------+--------------------+
|             Content|       ltrimmed_word|              tokens|
+--------------------+--------------------+--------------------+
|At the MARS 2017 ...|At the MARS tech ...|[at, the, mars, t...|
|”I remember who t...|I remember who th...|[i, remember, who...|
|Billionaire Georg...|Billionaire Georg...|[billionaire, geo...|
|On Friday’s Breit...|On Fridays Breitb...|[on, fridays, bre...|
|PIEDRAS NEGRAS, C...|PIEDRAS NEGRAS Co...|[piedras, negras,...|
|Saturday at a tow...|Saturday at a tow...|[saturday, at, a,...|
|WASHINGTON  —   P...|WASHINGTON Presid...|[washington, pres...|
|White House press...|White House press...|[white, house, pr...|
|Anthem, the natio...|Anthem the nation...|[anthem, the, nat...|
|Sunday on ABC’s “...|Sunday on ABCs Th...|[sunday, on, abcs...|
|Speaker Ryan’s pl...|Speaker Ryans pla...|[speaker, ryans, ...|
|Conservative fire...|Conservative fire...|[conservative, fi...|
|Sunday on NBC’s “...|Sun

## Eliminar stopWords
Eliminación de stopWord en la columna de contenido de las publicaciones, token tales como "I, and .or"

In [9]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [10]:
refined_df=stopword_removal.transform(tokenized_df)

In [11]:
refined_df.select('ltrimmed_word','tokens','refined_tokens').show()

+--------------------+--------------------+--------------------+
|       ltrimmed_word|              tokens|      refined_tokens|
+--------------------+--------------------+--------------------+
|In Rayshell Byers...|[in, rayshell, by...|[rayshell, byers,...|
|Sunday on CBSs Fa...|[sunday, on, cbss...|[sunday, cbss, fa...|
|President Donald ...|[president, donal...|[president, donal...|
|Sunday on CBSs Fa...|[sunday, on, cbss...|[sunday, cbss, fa...|
|On the Thursday e...|[on, the, thursda...|[thursday, editio...|
|A Never Trump mov...|[a, never, trump,...|[never, trump, mo...|
|A man from Memphi...|[a, man, from, me...|[man, memphis, te...|
|Do you know you a...|[do, you, know, y...|[know, going, die...|
|Georgia lawmakers...|[georgia, lawmake...|[georgia, lawmake...|
|Pres Trump on if ...|[pres, trump, on,...|[pres, trump, tap...|
|According to a Po...|[according, to, a...|[according, polit...|
|In a report this ...|[in, a, report, t...|[report, week, cn...|
|The death of the ...|[th

## Vectorización del DataFrame

In [12]:
refined_df

DataFrame[_c0: string, id: string, title: string, publication: string, author: string, date: string, year: string, month: string, url: string, content: string, clean0: string, clean1: string, clean2: string, ltrimmed_word: string, tokens: array<string>, refined_tokens: array<string>]

In [13]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import IntegerType

In [14]:
len_udf = udf(lambda s: len(s), IntegerType())
refined_df = refined_df.withColumn("token_count", len_udf(col('refined_tokens')))

In [15]:
refined_df.select('ltrimmed_word','tokens','refined_tokens','token_count').show()

+--------------------+--------------------+--------------------+-----------+
|       ltrimmed_word|              tokens|      refined_tokens|token_count|
+--------------------+--------------------+--------------------+-----------+
|In Rayshell Byers...|[in, rayshell, by...|[rayshell, byers,...|        375|
|Sunday on CBSs Fa...|[sunday, on, cbss...|[sunday, cbss, fa...|         96|
|President Donald ...|[president, donal...|[president, donal...|         63|
|Sunday on CBSs Fa...|[sunday, on, cbss...|[sunday, cbss, fa...|        101|
|On the Thursday e...|[on, the, thursda...|[thursday, editio...|        128|
|A Never Trump mov...|[a, never, trump,...|[never, trump, mo...|        714|
|A man from Memphi...|[a, man, from, me...|[man, memphis, te...|        202|
|Do you know you a...|[do, you, know, y...|[know, going, die...|        470|
|Georgia lawmakers...|[georgia, lawmake...|[georgia, lawmake...|        156|
|Pres Trump on if ...|[pres, trump, on,...|[pres, trump, tap...|        121|

## Agrupación de textos

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF
from pyspark.ml.clustering import LDA, KMeans

In [17]:
aux_df = refined_df
aux_tf_idf = refined_df
aux_df = aux_df.drop('publication', 'author', 'publication', 'title', 'date', 'year', 'month', 'url', 'clean', 'clean0', 'clean1', 'clean2', 'tokens')
aux_df.show()

+-----+-----+--------------------+--------------------+--------------------+-----------+
|  _c0|   id|             content|       ltrimmed_word|      refined_tokens|token_count|
+-----+-----+--------------------+--------------------+--------------------+-----------+
|10092|28828|At the MARS 2017 ...|At the MARS tech ...|[mars, tech, conf...|        233|
|10101|28837|”I remember who t...|I remember who th...|[remember, people...|         61|
|10236|28972|Billionaire Georg...|Billionaire Georg...|[billionaire, geo...|        392|
|10513|29249|On Friday’s Breit...|On Fridays Breitb...|[fridays, breitba...|        551|
|10608|29344|PIEDRAS NEGRAS, C...|PIEDRAS NEGRAS Co...|[piedras, negras,...|        192|
|10646|29382|Saturday at a tow...|Saturday at a tow...|[saturday, town, ...|         51|
|  109|17410|WASHINGTON  —   P...|WASHINGTON Presid...|[washington, pres...|        905|
|11015|29751|White House press...|White House press...|[white, house, pr...|        175|
|11085|29821|Anthem, 

In [18]:
fill = array().cast("array<string>")
tokens_a = when(col("refined_tokens").isNull(), fill).otherwise(col("refined_tokens"))
aux_df = aux_df.withColumn("refined_tokens", tokens_a)

In [19]:
cv = CountVectorizer(inputCol="refined_tokens", outputCol="rawFeatures")
cvmodel = cv.fit(aux_df)
featurizedData = cvmodel.transform(aux_df)
vocab = cvmodel.vocabulary
vocab

['said',
 'trump',
 'people',
 'one',
 'new',
 'president',
 'also',
 'us',
 'like',
 'mr',
 'news',
 'state',
 'two',
 'time',
 'told',
 'twitter',
 'police',
 'first',
 'dont',
 'states',
 'going',
 'years',
 'think',
 'last',
 'clinton',
 'house',
 'campaign',
 'donald',
 'get',
 'many',
 'cnn',
 'united',
 'according',
 'trumps',
 'even',
 'know',
 'white',
 'may',
 'year',
 'country',
 'made',
 'way',
 'u',
 'media',
 'american',
 'national',
 'make',
 'government',
 'since',
 'back',
 'percent',
 'officials',
 'say',
 'well',
 'world',
 'group',
 'former',
 'obama',
 'july',
 'security',
 'see',
 'including',
 'want',
 'breitbart',
 'black',
 'thats',
 'report',
 'im',
 'says',
 'much',
 'still',
 'week',
 'called',
 'times',
 'law',
 'support',
 'day',
 'three',
 'statement',
 'right',
 'party',
 'take',
 'political',
 'saying',
 'follow',
 'health',
 'public',
 'go',
 'good',
 'york',
 'another',
 'part',
 'election',
 'north',
 'work',
 'never',
 'presidential',
 'reported',
 

In [20]:
hashingTF = HashingTF(inputCol="refined_tokens", outputCol="rawFeatures")
tf_df = hashingTF.transform(aux_tf_idf)
tf_df.select(['refined_tokens', 'rawFeatures']).show()

+--------------------+--------------------+
|      refined_tokens|         rawFeatures|
+--------------------+--------------------+
|[mars, tech, conf...|(262144,[4054,487...|
|[remember, people...|(262144,[408,6051...|
|[billionaire, geo...|(262144,[1973,232...|
|[fridays, breitba...|(262144,[14,619,6...|
|[piedras, negras,...|(262144,[2281,271...|
|[saturday, town, ...|(262144,[7612,102...|
|[washington, pres...|(262144,[619,646,...|
|[white, house, pr...|(262144,[7612,102...|
|[anthem, nations,...|(262144,[2801,606...|
|[sunday, abcs, we...|(262144,[1667,410...|
|[speaker, ryans, ...|(262144,[4871,991...|
|[conservative, fi...|(262144,[3834,910...|
|[sunday, nbcs, me...|(262144,[6781,761...|
|[tel, aviv, jewis...|(262144,[1076,197...|
|[although, nra, a...|(262144,[622,4200...|
|[friday, hugh, he...|(262144,[4349,579...|
|[protest, movemen...|(262144,[535,1461...|
|[tuesday, press, ...|(262144,[1882,273...|
|[players, protest...|(262144,[619,4037...|
|[magazine, teen, ...|(262144,[2

In [21]:
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
ft_idf = idf.fit(tf_df).transform(tf_df)
ft_idf.select(['refined_tokens', 'features']).show()

+--------------------+--------------------+
|      refined_tokens|            features|
+--------------------+--------------------+
|[mars, tech, conf...|(262144,[4054,487...|
|[remember, people...|(262144,[408,6051...|
|[billionaire, geo...|(262144,[1973,232...|
|[fridays, breitba...|(262144,[14,619,6...|
|[piedras, negras,...|(262144,[2281,271...|
|[saturday, town, ...|(262144,[7612,102...|
|[washington, pres...|(262144,[619,646,...|
|[white, house, pr...|(262144,[7612,102...|
|[anthem, nations,...|(262144,[2801,606...|
|[sunday, abcs, we...|(262144,[1667,410...|
|[speaker, ryans, ...|(262144,[4871,991...|
|[conservative, fi...|(262144,[3834,910...|
|[sunday, nbcs, me...|(262144,[6781,761...|
|[tel, aviv, jewis...|(262144,[1076,197...|
|[although, nra, a...|(262144,[622,4200...|
|[friday, hugh, he...|(262144,[4349,579...|
|[protest, movemen...|(262144,[535,1461...|
|[tuesday, press, ...|(262144,[1882,273...|
|[players, protest...|(262144,[619,4037...|
|[magazine, teen, ...|(262144,[2

In [22]:
kmeans = KMeans(k=25)
kmeans_model = kmeans.fit(ft_idf).transform(ft_idf)
kmeans_model.select('refined_tokens', 'features', 'prediction').show()

+--------------------+--------------------+----------+
|      refined_tokens|            features|prediction|
+--------------------+--------------------+----------+
|[rayshell, byers,...|(262144,[619,813,...|        13|
|[sunday, cbss, fa...|(262144,[3924,678...|        13|
|[president, donal...|(262144,[5381,576...|        13|
|[sunday, cbss, fa...|(262144,[6369,761...|        13|
|[thursday, editio...|(262144,[4954,761...|        13|
|[never, trump, mo...|(262144,[14,619,1...|        13|
|[man, memphis, te...|(262144,[619,1727...|        13|
|[know, going, die...|(262144,[1727,184...|        13|
|[georgia, lawmake...|(262144,[513,2548...|        13|
|[pres, trump, tap...|(262144,[6781,761...|        13|
|[according, polit...|(262144,[440,4872...|        13|
|[report, week, cn...|(262144,[1846,232...|        13|
|[death, american,...|(262144,[1846,243...|        13|
|[mother, american...|(262144,[836,994,...|        13|
|[years, old, yet,...|(262144,[571,991,...|        13|
|[times, i

In [23]:
lda = LDA(k=25, seed=100, optimizer="em", featuresCol="features")
lda_model = lda.fit(kmeans_model).transform(kmeans_model)
lda_model.select('refined_tokens', 'features', 'prediction', 'topicDistribution').show()

+--------------------+--------------------+----------+--------------------+
|      refined_tokens|            features|prediction|   topicDistribution|
+--------------------+--------------------+----------+--------------------+
|[rayshell, byers,...|(262144,[619,813,...|        13|[0.00394299711127...|
|[sunday, cbss, fa...|(262144,[3924,678...|        13|[0.01087772223647...|
|[president, donal...|(262144,[5381,576...|        13|[0.01548647714208...|
|[sunday, cbss, fa...|(262144,[6369,761...|        13|[0.01070375926400...|
|[thursday, editio...|(262144,[4954,761...|        13|[0.00858246161785...|
|[never, trump, mo...|(262144,[14,619,1...|        13|[0.00201197972260...|
|[man, memphis, te...|(262144,[619,1727...|        13|[0.00605981755409...|
|[know, going, die...|(262144,[1727,184...|        13|[0.00307656632882...|
|[georgia, lawmake...|(262144,[513,2548...|        13|[0.00669172619761...|
|[pres, trump, tap...|(262144,[6781,761...|        13|[0.00906105075052...|
|[according,

In [24]:
lda_model.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|        13|  377|
|        21|    1|
|         0|    3|
|        15|    3|
|         3|    1|
|        14|    1|
|        11|    2|
|        12|    2|
|         4|    1|
|        16|    1|
|         2|    1|
|        17|    1|
|        10|    1|
|        23|    1|
|        22|    1|
|         1|    1|
|        19|    1|
|        18|    1|
+----------+-----+

