In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, expr, when, concat, lit, isnan
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.feature import VectorIndexer, VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.clustering import LDA, BisectingKMeans
import pyspark

In [2]:
!pip install TextBlob



In [3]:
# Display the active Spark session as the cell output. `spark` is not created
# anywhere in the visible cells, so it is presumably injected by the
# kernel/launcher — TODO confirm for fresh environments.
spark

In [4]:
from textblob import TextBlob, Word

In [5]:
# Load the raw articles file. The first line of the CSV supplies the column
# names, and Spark samples the data to guess each column's type.
# NOTE(review): a later recorded output shows `id` as the string '0' even
# though schema inference is on; possibly some rows are malformed (e.g. quoted
# or multi-line `content`) or the view was built from an older load — confirm.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("news.csv")
)

In [5]:
# Preview the first ten articles; long cell values are truncated for display.
df.show(n=10, truncate=True)

+---+-------+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
| id|id_news|               title|   publication|              author|      date|  year|month| url|             content|
+---+-------+--------------------+--------------+--------------------+----------+------+-----+----+--------------------+
|  0|  17283|House Republicans...|New York Times|          Carl Hulse|2016-12-31|2016.0| 12.0|null|WASHINGTON  —   C...|
|  1|  17284|Rift Between Offi...|New York Times|Benjamin Mueller ...|2017-06-19|2017.0|  6.0|null|After the bullet ...|
|  2|  17285|Tyrus Wong, ‘Bamb...|New York Times|        Margalit Fox|2017-01-06|2017.0|  1.0|null|When Walt Disney’...|
|  3|  17286|Among Deaths in 2...|New York Times|    William McDonald|2017-04-10|2017.0|  4.0|null|Death may be the ...|
|  4|  17287|Kim Jong-un Says ...|New York Times|       Choe Sang-Hun|2017-01-02|2017.0|  1.0|null|SEOUL, South Kore...|
|  5|  17288|Sick With a Cold,..

In [11]:
# Register `df` under the name "news" so it can be queried with spark.sql().
# Re-running this cell simply replaces any existing view of the same name.
df.createOrReplaceTempView("news")

In [12]:
# Look up the article with id 0 through the "news" temp view and print it.
# show() is capped at 19 rows; the recorded output contains a single match.
sqlDF = spark.sql(
    "SELECT * FROM news where id = 0"
)
sqlDF.show(n=19)

+---+-------+--------------------+--------------+----------+----------+------+-----+----+--------------------+
| id|id_news|               title|   publication|    author|      date|  year|month| url|             content|
+---+-------+--------------------+--------------+----------+----------+------+-----+----+--------------------+
|  0|  17283|House Republicans...|New York Times|Carl Hulse|2016-12-31|2016.0| 12.0|null|WASHINGTON  —   C...|
+---+-------+--------------------+--------------+----------+----------+------+-----+----+--------------------+



In [13]:
# Reload the articles from their Parquet snapshot.
#
# * Parquet files carry their own schema, so the CSV-only "header" reader
#   option that used to be set here had no effect and was dropped.
# * The write used to be a commented-out line, which made this cell fail on
#   any machine where the snapshot had not been created by hand. The snapshot
#   is now written from `df` the first time it is found to be missing.
# * The directory name "csv_to_paraquet" (sic) is kept verbatim so that an
#   already-existing snapshot is still picked up.
from pyspark.sql.utils import AnalysisException

snapshot_path = "csv_to_paraquet"
try:
    df_1 = spark.read.parquet(snapshot_path)
except AnalysisException:
    # Spark raises AnalysisException when the path does not exist.
    df.write.parquet(snapshot_path)
    df_1 = spark.read.parquet(snapshot_path)

In [14]:
# Print the column names and types of the reloaded Parquet snapshot.
# NOTE(review): the recorded output lists every column as string, so numeric
# and date columns would need explicit casts before typed operations — confirm
# whether that is intended.
df_1.printSchema()

root
 |-- id: string (nullable = true)
 |-- id_news: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publication: string (nullable = true)
 |-- author: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- url: string (nullable = true)
 |-- content: string (nullable = true)



In [15]:
# Pull only the columns needed for sentiment scoring (id, title, content) for
# the articles whose id is below 5, then print them (at most 19 rows shown).
sqlDF = spark.sql(
    "SELECT id, title, content FROM news where id < 5"
)
sqlDF.show(n=19)

+---+--------------------+--------------------+
| id|               title|             content|
+---+--------------------+--------------------+
|  0|House Republicans...|WASHINGTON  —   C...|
|  1|Rift Between Offi...|After the bullet ...|
|  2|Tyrus Wong, ‘Bamb...|When Walt Disney’...|
|  3|Among Deaths in 2...|Death may be the ...|
|  4|Kim Jong-un Says ...|SEOUL, South Kore...|
+---+--------------------+--------------------+



In [16]:
def _sentiment_or_none(text):
    """Return ``TextBlob(text).sentiment``, or ``None`` when ``text`` is null.

    TextBlob raises ``TypeError`` for non-string input, and a null ``content``
    cell arrives here as ``None``; without this guard one such row would abort
    the whole Spark job.
    """
    if text is None:
        return None
    return TextBlob(text).sentiment


# Score each selected article and bring the results back to the driver as a
# list of (id, title, sentiment) tuples. collect() is acceptable here because
# `sqlDF` is tiny (five rows in the recorded output); avoid it on the full data.
rz = (
    sqlDF.select("id", "title", "content")
    .rdd
    .map(lambda row: (row.id, row.title, _sentiment_or_none(row.content)))
    .collect()
)
rz

[('0',
  'House Republicans Fret About Winning Their Health Care Suit - The New York Times',
  Sentiment(polarity=0.028841991341991335, subjectivity=0.4555353959765724)),
 ('1',
  'Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times',
  Sentiment(polarity=-0.0017503893246467515, subjectivity=0.3917658789688492)),
 ('2',
  'Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial Bias, Dies at 106 - The New York Times',
  Sentiment(polarity=0.0679717149170274, subjectivity=0.4362909677128429)),
 ('3',
  'Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times',
  Sentiment(polarity=0.1297952794444023, subjectivity=0.43564610178645263)),
 ('4',
  'Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times',
  Sentiment(polarity=0.1864267676767677, subjectivity=0.5192234848484848))]