# Analisis de sentimientos 

##  Importaciones de bibliotecas y librerias

In [127]:
sc

In [110]:
import pandas as pd
import re 
import numpy
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import * 
from pyspark.sql.functions import length
from pyspark.sql.functions import udf
from pyspark.sql import functions as f

# Ingesta de datos de hdsf en dataframes

In [111]:
csv = '/user/jlondo97/datasets/articles1.csv'
df1 = spark.read.csv(csv)
df1.show()

+----+-----+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
| _c0|  _c1|                 _c2|           _c3|                 _c4|       _c5|   _c6|  _c7| _c8|                 _c9|
+----+-----+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
|null|   id|               title|   publication|              author|      date|  year|month| url|             content|
|   0|17283|House Republicans...|New York Times|          Carl Hulse|2016-12-31|2016.0| 12.0|null|WASHINGTON  —   C...|
|   1|17284|Rift Between Offi...|New York Times|Benjamin Mueller ...|2017-06-19|2017.0|  6.0|null|After the bullet ...|
|   2|17285|Tyrus Wong, ‘Bamb...|New York Times|        Margalit Fox|2017-01-06|2017.0|  1.0|null|When Walt Disney’...|
|   3|17286|Among Deaths in 2...|New York Times|    William McDonald|2017-04-10|2017.0|  4.0|null|Death may be the ...|
|   4|17287|Kim Jong-un Says ...|New Yor

Definicion de expresiones regulares para la limpieza de los contenidos de las diferentes publicaciones 


In [112]:
reg = '[^a-zA-Z ]'
reg1 = '[\s*]{1,}'

# Limpieza del DataFrame

Creando un dataframe que contenga los contedidos de las publicaciones hechas y limpiando el contenido de caracteres especiales.

In [113]:
df_1 = df1.withColumn("clean", regexp_replace('_c9', reg ,""))
df_1.select('_c9','clean').show()

+--------------------+--------------------+
|                 _c9|               clean|
+--------------------+--------------------+
|             content|             content|
|WASHINGTON  —   C...|WASHINGTON     Co...|
|After the bullet ...|After the bullet ...|
|When Walt Disney’...|When Walt Disneys...|
|Death may be the ...|Death may be the ...|
|SEOUL, South Kore...|SEOUL South Korea...|
|LONDON  —   Queen...|LONDON     Queen ...|
|BEIJING  —   Pres...|BEIJING     Presi...|
|Danny Cahill stoo...|Danny Cahill stoo...|
|Just how   is Hil...|Just how   is Hil...|
|Angels are everyw...|Angels are everyw...|
|With Donald J. Tr...|With Donald J Tru...|
|THOMPSONS, Tex.  ...|THOMPSONS Tex    ...|
|WEST PALM BEACH, ...|WEST PALM BEACH F...|
|This article is p...|This article is p...|
|It’s the season f...|Its the season fo...|
|Finally. The Seco...|Finally The Secon...|
|  pages into the ...|  pages into the ...|
|MUMBAI, India  — ...|MUMBAI India     ...|
|BAGHDAD  —   A su...|BAGHDAD   

En las siguientes lineas se eliminan las espacios en blanco(\s) de mas que son tokenisados como tokens independientes

In [114]:
df_2 = df_1.withColumn("clean1", regexp_replace('clean', reg1 ," "))
df_2.select('clean','clean1').show()

+--------------------+--------------------+
|               clean|              clean1|
+--------------------+--------------------+
|             content|             content|
|WASHINGTON     Co...|WASHINGTON Congre...|
|After the bullet ...|After the bullet ...|
|When Walt Disneys...|When Walt Disneys...|
|Death may be the ...|Death may be the ...|
|SEOUL South Korea...|SEOUL South Korea...|
|LONDON     Queen ...|LONDON Queen Eliz...|
|BEIJING     Presi...|BEIJING President...|
|Danny Cahill stoo...|Danny Cahill stoo...|
|Just how   is Hil...|Just how is Hilla...|
|Angels are everyw...|Angels are everyw...|
|With Donald J Tru...|With Donald J Tru...|
|THOMPSONS Tex    ...|THOMPSONS Tex Can...|
|WEST PALM BEACH F...|WEST PALM BEACH F...|
|This article is p...|This article is p...|
|Its the season fo...|Its the season fo...|
|Finally The Secon...|Finally The Secon...|
|  pages into the ...| pages into the j...|
|MUMBAI India     ...|MUMBAI India It w...|
|BAGHDAD     A sui...|BAGHDAD A 

## Tokenización de los contenidos de las publicaciones

Creacion de un dataframe con el contenido de la publicacion tokenizado 

In [115]:
tokenized_df=tokenization.transform(df_2)


In [116]:
tokenization=Tokenizer(inputCol='clean1',outputCol='tokens')

In [117]:
tokenized_df.select('clean1','tokens').show()

+--------------------+--------------------+
|              clean1|              tokens|
+--------------------+--------------------+
|             content|           [content]|
|WASHINGTON Congre...|[washington, cong...|
|After the bullet ...|[after, the, bull...|
|When Walt Disneys...|[when, walt, disn...|
|Death may be the ...|[death, may, be, ...|
|SEOUL South Korea...|[seoul, south, ko...|
|LONDON Queen Eliz...|[london, queen, e...|
|BEIJING President...|[beijing, preside...|
|Danny Cahill stoo...|[danny, cahill, s...|
|Just how is Hilla...|[just, how, is, h...|
|Angels are everyw...|[angels, are, eve...|
|With Donald J Tru...|[with, donald, j,...|
|THOMPSONS Tex Can...|[thompsons, tex, ...|
|WEST PALM BEACH F...|[west, palm, beac...|
|This article is p...|[this, article, i...|
|Its the season fo...|[its, the, season...|
|Finally The Secon...|[finally, the, se...|
| pages into the j...|[, pages, into, t...|
|MUMBAI India It w...|[mumbai, india, i...|
|BAGHDAD A suicide...|[baghdad, 

## Remoción de stopWords

Eliminacion de stopWord de los contenidos de las publicaciones token tales como "I, and .or" 

In [118]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [119]:
refined_df=stopword_removal.transform(tokenized_df)

In [120]:
refined_df.select('clean1','tokens','refined_tokens').show()

+--------------------+--------------------+--------------------+
|              clean1|              tokens|      refined_tokens|
+--------------------+--------------------+--------------------+
|             content|           [content]|           [content]|
|WASHINGTON Congre...|[washington, cong...|[washington, cong...|
|After the bullet ...|[after, the, bull...|[bullet, shells, ...|
|When Walt Disneys...|[when, walt, disn...|[walt, disneys, b...|
|Death may be the ...|[death, may, be, ...|[death, may, grea...|
|SEOUL South Korea...|[seoul, south, ko...|[seoul, south, ko...|
|LONDON Queen Eliz...|[london, queen, e...|[london, queen, e...|
|BEIJING President...|[beijing, preside...|[beijing, preside...|
|Danny Cahill stoo...|[danny, cahill, s...|[danny, cahill, s...|
|Just how is Hilla...|[just, how, is, h...|[hillary, kerr, f...|
|Angels are everyw...|[angels, are, eve...|[angels, everywhe...|
|With Donald J Tru...|[with, donald, j,...|[donald, j, trump...|
|THOMPSONS Tex Can...|[th

## vectorizando 

In [121]:
refined_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, clean: string, clean1: string, tokens: array<string>, refined_tokens: array<string>]

In [122]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [123]:
len_udf = udf(lambda s: len(s), IntegerType())
refined_df = refined_df.withColumn("token_count", len_udf(col('refined_tokens')))

In [124]:
refined_df.select('clean1','tokens','refined_tokens','token_count').show()

+--------------------+--------------------+--------------------+-----------+
|              clean1|              tokens|      refined_tokens|token_count|
+--------------------+--------------------+--------------------+-----------+
|             content|           [content]|           [content]|          1|
|WASHINGTON Congre...|[washington, cong...|[washington, cong...|        493|
|After the bullet ...|[after, the, bull...|[bullet, shells, ...|       2553|
|When Walt Disneys...|[when, walt, disn...|[walt, disneys, b...|       1286|
|Death may be the ...|[death, may, be, ...|[death, may, grea...|       1182|
|SEOUL South Korea...|[seoul, south, ko...|[seoul, south, ko...|        418|
|LONDON Queen Eliz...|[london, queen, e...|[london, queen, e...|         86|
|BEIJING President...|[beijing, preside...|[beijing, preside...|        345|
|Danny Cahill stoo...|[danny, cahill, s...|[danny, cahill, s...|       1618|
|Just how is Hilla...|[just, how, is, h...|[hillary, kerr, f...|        879|