In [1]:
import pandas as pd
import re 
import numpy
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import * 
from pyspark.sql.functions import length
from pyspark.sql.functions import udf
from pyspark.sql import functions as f

# Ingesta de datos de hdsf en dataframes

In [2]:
csv = '/user/jlondo97/datasets/articles1.csv'
df1 = spark.read.csv(csv)
csv = '/user/jlondo97/datasets/articles2.csv'
df2 = spark.read.csv(csv)
csv = '/user/jlondo97/datasets/articles3.csv'
df3 = spark.read.csv(csv)
# df1.show()
# df2.show()
# df3.show()

In [3]:
join_1_df_2 = df1.join(df2, on=['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9'], how='left_outer')
full_df = join_1_df_2.join(df3, on=['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9'], how='left_outer')
full_df = full_df.selectExpr("_c1 as ID", "_c2 as Title", "_c3 as Publication", "_c4 as Author", "_c5 as Publication_date", "_c6 as Year", "_c7 as Month", "_c8 as URL", "_c9 as Content")
full_df.show()

+-----+--------------------+--------------+--------------------+----------------+------+-----+----+--------------------+
|   ID|               Title|   Publication|              Author|Publication_date|  Year|Month| URL|             Content|
+-----+--------------------+--------------+--------------------+----------------+------+-----+----+--------------------+
|28828|Watch: Amazon Bos...|     Breitbart|         Nate Church|      2017-03-21|2017.0|  3.0|null|At the MARS 2017 ...|
|28837|Patriots Owner Ro...|     Breitbart|         Trent Baker|      2017-02-03|2017.0|  2.0|null|”I remember who t...|
|28972|Report: George So...|     Breitbart|         Aaron Klein|      2017-01-23|2017.0|  1.0|null|Billionaire Georg...|
|29249|Peter Schweizer: ...|     Breitbart|        John Hayward|      2017-05-26|2017.0|  5.0|null|On Friday’s Breit...|
|29344|Mexican Border St...|     Breitbart|   Cartel Chronicles|      2017-02-19|2017.0|  2.0|null|PIEDRAS NEGRAS, C...|
|29382|Maxine Waters: ’D...|    

# Limpieza del DataFrame
Creando un dataframe que contenga los contedidos de las publicaciones hechas y limpiando el contenido de caracteres especiales.

In [4]:
reg = '[^a-zA-Z ]'
reg1 = '[\s*]{1,}'

In [5]:
full_df = full_df.withColumn("clean", regexp_replace('Content', reg ,""))
full_df.select('Content','clean').show()

+--------------------+--------------------+
|             Content|               clean|
+--------------------+--------------------+
|At the MARS 2017 ...|At the MARS  tech...|
|”I remember who t...|I remember who th...|
|Billionaire Georg...|Billionaire Georg...|
|On Friday’s Breit...|On Fridays Breitb...|
|PIEDRAS NEGRAS, C...|PIEDRAS NEGRAS Co...|
|Saturday at a tow...|Saturday at a tow...|
|WASHINGTON  —   P...|WASHINGTON     Pr...|
|White House press...|White House press...|
|Anthem, the natio...|Anthem the nation...|
|Sunday on ABC’s “...|Sunday on ABCs Th...|
|Speaker Ryan’s pl...|Speaker Ryans pla...|
|Conservative fire...|Conservative fire...|
|Sunday on NBC’s “...|Sunday on NBCs Me...|
|TEL AVIV  —   A J...|TEL AVIV     A Je...|
|Although the NRA ...|Although the NRA ...|
|Friday on Hugh He...|Friday on Hugh He...|
|While the protest...|While the protest...|
|At a Tuesday pres...|At a Tuesday pres...|
|Players that prot...|Players that prot...|
|  magazine Teen V...|  magazine

In [6]:
full_df = full_df.withColumn("clean1", regexp_replace('clean', reg1 ," "))
full_df.select('clean','clean1').show()

+--------------------+--------------------+
|               clean|              clean1|
+--------------------+--------------------+
|At the MARS  tech...|At the MARS tech ...|
|I remember who th...|I remember who th...|
|Billionaire Georg...|Billionaire Georg...|
|On Fridays Breitb...|On Fridays Breitb...|
|PIEDRAS NEGRAS Co...|PIEDRAS NEGRAS Co...|
|Saturday at a tow...|Saturday at a tow...|
|WASHINGTON     Pr...|WASHINGTON Presid...|
|White House press...|White House press...|
|Anthem the nation...|Anthem the nation...|
|Sunday on ABCs Th...|Sunday on ABCs Th...|
|Speaker Ryans pla...|Speaker Ryans pla...|
|Conservative fire...|Conservative fire...|
|Sunday on NBCs Me...|Sunday on NBCs Me...|
|TEL AVIV     A Je...|TEL AVIV A Jewish...|
|Although the NRA ...|Although the NRA ...|
|Friday on Hugh He...|Friday on Hugh He...|
|While the protest...|While the protest...|
|At a Tuesday pres...|At a Tuesday pres...|
|Players that prot...|Players that prot...|
|  magazine Teen V...| magazine 

## Tokenización de los contenidos de las publicaciones
Creacion de un dataframe con el contenido de la publicacion tokenizado 

In [7]:
tokenization=Tokenizer(inputCol='clean1',outputCol='tokens')

In [8]:
tokenized_df = tokenization.transform(full_df)

In [9]:
tokenized_df.select('clean1','tokens').show()

+--------------------+--------------------+
|              clean1|              tokens|
+--------------------+--------------------+
|At the MARS tech ...|[at, the, mars, t...|
|I remember who th...|[i, remember, who...|
|Billionaire Georg...|[billionaire, geo...|
|On Fridays Breitb...|[on, fridays, bre...|
|PIEDRAS NEGRAS Co...|[piedras, negras,...|
|Saturday at a tow...|[saturday, at, a,...|
|WASHINGTON Presid...|[washington, pres...|
|White House press...|[white, house, pr...|
|Anthem the nation...|[anthem, the, nat...|
|Sunday on ABCs Th...|[sunday, on, abcs...|
|Speaker Ryans pla...|[speaker, ryans, ...|
|Conservative fire...|[conservative, fi...|
|Sunday on NBCs Me...|[sunday, on, nbcs...|
|TEL AVIV A Jewish...|[tel, aviv, a, je...|
|Although the NRA ...|[although, the, n...|
|Friday on Hugh He...|[friday, on, hugh...|
|While the protest...|[while, the, prot...|
|At a Tuesday pres...|[at, a, tuesday, ...|
|Players that prot...|[players, that, p...|
| magazine Teen Vo...|[, magazin

## Eliminar stopWords
Eliminación de stopWord en la columna de contenido de las publicaciones, token tales como "I, and .or"

In [104]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [105]:
refined_df=stopword_removal.transform(tokenized_df)

In [106]:
refined_df.select('clean1','tokens','refined_tokens').show()

+--------------------+--------------------+--------------------+
|              clean1|              tokens|      refined_tokens|
+--------------------+--------------------+--------------------+
|At the MARS tech ...|[at, the, mars, t...|[mars, tech, conf...|
|I remember who th...|[i, remember, who...|[remember, people...|
|Billionaire Georg...|[billionaire, geo...|[billionaire, geo...|
|On Fridays Breitb...|[on, fridays, bre...|[fridays, breitba...|
|PIEDRAS NEGRAS Co...|[piedras, negras,...|[piedras, negras,...|
|Saturday at a tow...|[saturday, at, a,...|[saturday, town, ...|
|WASHINGTON Presid...|[washington, pres...|[washington, pres...|
|White House press...|[white, house, pr...|[white, house, pr...|
|Anthem the nation...|[anthem, the, nat...|[anthem, nations,...|
|Sunday on ABCs Th...|[sunday, on, abcs...|[sunday, abcs, we...|
|Speaker Ryans pla...|[speaker, ryans, ...|[speaker, ryans, ...|
|Conservative fire...|[conservative, fi...|[conservative, fi...|
|Sunday on NBCs Me...|[su

## Vectorización del DataFrame

In [107]:
refined_df

DataFrame[ID: string, Title: string, Publication: string, Author: string, Publication_date: string, Year: string, Month: string, URL: string, Content: string, clean: string, clean1: string, tokens: array<string>, refined_tokens: array<string>]

In [108]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [109]:
len_udf = udf(lambda s: len(s), IntegerType())
refined_df = refined_df.withColumn("token_count", len_udf(col('refined_tokens')))

In [110]:
refined_df.select('clean1','tokens','refined_tokens','token_count').show(50)

+--------------------+--------------------+--------------------+-----------+
|              clean1|              tokens|      refined_tokens|token_count|
+--------------------+--------------------+--------------------+-----------+
|At the MARS tech ...|[at, the, mars, t...|[mars, tech, conf...|        233|
|I remember who th...|[i, remember, who...|[remember, people...|         61|
|Billionaire Georg...|[billionaire, geo...|[billionaire, geo...|        392|
|On Fridays Breitb...|[on, fridays, bre...|[fridays, breitba...|        551|
|PIEDRAS NEGRAS Co...|[piedras, negras,...|[piedras, negras,...|        192|
|Saturday at a tow...|[saturday, at, a,...|[saturday, town, ...|         51|
|WASHINGTON Presid...|[washington, pres...|[washington, pres...|        905|
|White House press...|[white, house, pr...|[white, house, pr...|        175|
|Anthem the nation...|[anthem, the, nat...|[anthem, nations,...|        174|
|Sunday on ABCs Th...|[sunday, on, abcs...|[sunday, abcs, we...|        106|

## Agrupación de textos

In [111]:
from pyspark.ml.feature import CountVectorizer

In [123]:
aux_df = refined_df
aux_df.show()

+-----+--------------------+--------------+--------------------+----------------+------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|   ID|               Title|   Publication|              Author|Publication_date|  Year|Month| URL|             Content|               clean|              clean1|              tokens|      refined_tokens|token_count|
+-----+--------------------+--------------+--------------------+----------------+------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|28828|Watch: Amazon Bos...|     Breitbart|         Nate Church|      2017-03-21|2017.0|  3.0|null|At the MARS 2017 ...|At the MARS  tech...|At the MARS tech ...|[at, the, mars, t...|[mars, tech, conf...|        233|
|28837|Patriots Owner Ro...|     Breitbart|         Trent Baker|      2017-02-03|2017.0|  2.0|null|”I remember who t...|I remember w

In [124]:
cv = CountVectorizer(inputCol="refined_tokens", outputCol="rawFeatures" )
cvmodel = cv.fit(aux_df.limit(20))

Py4JJavaError: An error occurred while calling o2150.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 22 in stage 170.0 failed 4 times, most recent failure: Lost task 22.3 in stage 170.0 (TID 696, hdpjupyter.dis.eafit.edu.co, executor 1): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:148)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1639)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1638)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1638)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1872)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2039)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2060)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2079)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2104)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1168)
	at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:176)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:148)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.NullPointerException


NameError: name 'HashingTF' is not defined