In [1]:
# first remove all files with length 0
# hdfs dfs -rm $(hdfs dfs -ls -R /twitter/movie/<moviedir> | grep -v "^d" | awk '{if ($5 == 0) print $8}')

In [1]:
from pyspark.sql.functions import udf, col, avg

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def negative(text):
    return analyzer.polarity_scores(text).get('neg')

def positive(text):
    return analyzer.polarity_scores(text).get('pos')

def neutral(text):
    return analyzer.polarity_scores(text).get('neu')

def compound(text):
    return analyzer.polarity_scores(text).get('compound')
    
negative_udf = udf(negative)
positive_udf = udf(positive)
neutral_udf = udf(neutral)
compound_udf = udf(compound)

In [14]:
ds = spark.read.json('/twitter/movie/DeerAntMan/*.gz')

In [5]:
ds.describe()

DataFrame[summary: string, body: string, favoritesCount: string, id: string, link: string, objectType: string, postedTime: string, retweetCount: string, twitter_filter_level: string, twitter_lang: string, verb: string]

In [6]:
ds_text_only = ds.select('body').na.drop()

In [7]:
ds_text_only.show(1)

+--------------------+
|                body|
+--------------------+
|RT @Kotaku: The n...|
+--------------------+
only showing top 1 row



In [8]:
sentiment = ds_text_only.withColumn('neg', negative_udf(col('body'))).withColumn('neu', neutral_udf(col('body'))).withColumn('pos', positive_udf(col('body'))).withColumn('comp', compound_udf(col('body')))

In [9]:
sentiment.show(1)

+--------------------+---+-----+-----+------+
|                body|neg|  neu|  pos|  comp|
+--------------------+---+-----+-----+------+
|RT @Kotaku: The n...|0.0|0.681|0.319|0.7269|
+--------------------+---+-----+-----+------+
only showing top 1 row



In [13]:
sentiment.describe('neg', 'neu', 'pos', 'comp').show()

+-------+-------------------+------------------+-------------------+------------------+
|summary|                neg|               neu|                pos|              comp|
+-------+-------------------+------------------+-------------------+------------------+
|  count|             181507|            181507|             181507|            181507|
|   mean|0.03430100216520573|0.8420712424314195|0.12363143570220403|0.2023808663026771|
| stddev|0.08203495454158365|0.1612844648404085|0.14849918431304707|0.3889369978198501|
|    min|                0.0|             0.139|                0.0|           -0.0015|
|    max|               0.75|               1.0|              0.861|            0.9872|
+-------+-------------------+------------------+-------------------+------------------+



In [11]:
import time
time.ctime() # 'Mon Oct 18 13:35:29 2010'

'Wed Sep 20 15:32:44 2017'

In [12]:
pwd

u'/home/lachlan'