In [142]:
import pandas as pd
import ijson

In [143]:
time = []
text = []
with open('data.json', 'r') as f:
    objects = ijson.items(f, 'results.item')
    for row in objects:
        if row['text'].startswith("RT") == False:
            time.append(row['created_at'])
            text.append(row['text'])

data = pd.DataFrame({"text":text,"time":time},columns=["text","time"])
data['time'] = pd.to_datetime(data['time']).dt.date

<class '_yajl2.items'>


In [144]:
data

Unnamed: 0,text,time
0,Brilliant thread. I particularly like Wile. E....,2020-08-31
1,@TimWilsonMP Have the courage to tell the trut...,2020-08-31
2,@Rdene915 @ChristineBemis2 Build it and they w...,2020-08-31
3,A photo of you in January 2020 without imagini...,2020-08-31
4,@IndyCat14 @JoyOfCats @carolineCMCE @LordGrayd...,2020-08-31
...,...,...
95,@chrisedmond Well said Chris: economists alway...,2020-08-31
96,@laureningram Just the hoop crew hanging out i...,2020-08-31
97,@heidimur @VicParliament PS. I'm sure this rep...,2020-08-31
98,@kirtidakale @goformative Wow always so good p...,2020-08-31


In [145]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob
from pyspark.sql import SQLContext

In [146]:
import findspark
findspark.init()

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re

# remove non ASCII characters
def strip_non_ascii(data_str):
    ''' Returns the string without non ASCII characters'''
    stripped = (c for c in data_str if 0 < ord(c) < 127)
    return ''.join(stripped)
# setup pyspark udf function
strip_non_ascii_udf = udf(strip_non_ascii, StringType())

In [147]:
df = spark.createDataFrame(data)
df = df.withColumn('text_non_asci',strip_non_ascii_udf(df['text']))
df.show()

+--------------------+----------+--------------------+
|                text|      time|       text_non_asci|
+--------------------+----------+--------------------+
|Brilliant thread....|2020-08-31|Brilliant thread....|
|@TimWilsonMP Have...|2020-08-31|@TimWilsonMP Have...|
|@Rdene915 @Christ...|2020-08-31|@Rdene915 @Christ...|
|A photo of you in...|2020-08-31|A photo of you in...|
|@IndyCat14 @JoyOf...|2020-08-31|@IndyCat14 @JoyOf...|
|@BuckleyIOP It re...|2020-08-31|@BuckleyIOP It re...|
|@kruevans But you...|2020-08-31|@kruevans But you...|
|@johniadarola I d...|2020-08-31|@johniadarola I d...|
|My sister turns 7...|2020-08-31|My sister turns 7...|
|@kruevans It’s sh...|2020-08-31|@kruevans Its shi...|
|@runwader Cannot ...|2020-08-31|@runwader Cannot ...|
|@abcmelbourne mor...|2020-08-31|@abcmelbourne mor...|
|#VICTORIA #Parlia...|2020-08-31|#VICTORIA #Parlia...|
|@profsarahj Sadly...|2020-08-31|@profsarahj Sadly...|
|@ChristineBemis2 ...|2020-08-31|@ChristineBemis2 ...|
|@Monmouth

In [148]:
# fixed abbreviation
def fix_abbreviation(data_str):
    data_str = data_str.lower()
    data_str = re.sub(r'\bthats\b', 'that is', data_str)
    data_str = re.sub(r'\bive\b', 'i have', data_str)
    data_str = re.sub(r'\bim\b', 'i am', data_str)
    data_str = re.sub(r'\bya\b', 'yeah', data_str)
    data_str = re.sub(r'\bcant\b', 'can not', data_str)
    data_str = re.sub(r'\bdont\b', 'do not', data_str)
    data_str = re.sub(r'\bwont\b', 'will not', data_str)
    data_str = re.sub(r'\bid\b', 'i would', data_str)
    data_str = re.sub(r'wtf', 'what the fuck', data_str)
    data_str = re.sub(r'\bwth\b', 'what the hell', data_str)
    data_str = re.sub(r'\br\b', 'are', data_str)
    data_str = re.sub(r'\bu\b', 'you', data_str)
    data_str = re.sub(r'\bk\b', 'OK', data_str)
    data_str = re.sub(r'\bsux\b', 'sucks', data_str)
    data_str = re.sub(r'\bno+\b', 'no', data_str)
    data_str = re.sub(r'\bcoo+\b', 'cool', data_str)
    data_str = re.sub(r'rt\b', '', data_str)
    data_str = data_str.strip()
    return data_str

fix_abbreviation_udf = udf(fix_abbreviation, StringType())

In [149]:
df = df.withColumn('fixed_abbrev',fix_abbreviation_udf(df['text_non_asci']))
df.show(5,True)

+--------------------+----------+--------------------+--------------------+
|                text|      time|       text_non_asci|        fixed_abbrev|
+--------------------+----------+--------------------+--------------------+
|Brilliant thread....|2020-08-31|Brilliant thread....|brilliant thread....|
|@TimWilsonMP Have...|2020-08-31|@TimWilsonMP Have...|@timwilsonmp have...|
|@Rdene915 @Christ...|2020-08-31|@Rdene915 @Christ...|@rdene915 @christ...|
|A photo of you in...|2020-08-31|A photo of you in...|a photo of you in...|
|@IndyCat14 @JoyOf...|2020-08-31|@IndyCat14 @JoyOf...|@indycat14 @joyof...|
+--------------------+----------+--------------------+--------------------+
only showing top 5 rows



In [150]:
def remove_features(data_str):
    # compile regex
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?')
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    num_re = re.compile('(\\d+)')
    mention_re = re.compile('@(\w+)')
    alpha_num_re = re.compile("^[a-z0-9_.]+$")
    # convert to lowercase
    data_str = data_str.lower()
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    # remove @mentions
    data_str = mention_re.sub(' ', data_str)
    # remove puncuation
    data_str = punc_re.sub(' ', data_str)
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)
    # remove non a-z 0-9 characters and words shorter than 1 characters
    list_pos = 0
    cleaned_str = ''
    for word in data_str.split():
        if list_pos == 0:
            if alpha_num_re.match(word) and len(word) > 1:
                cleaned_str = word
            else:
                cleaned_str = ' '
        else:
            if alpha_num_re.match(word) and len(word) > 1:
                cleaned_str = cleaned_str + ' ' + word
            else:
                cleaned_str += ' '
        list_pos += 1
    # remove unwanted space, *.split() will automatically split on
    # whitespace and discard duplicates, the " ".join() joins the
    # resulting list into one string.
    return " ".join(cleaned_str.split())
# setup pyspark udf function
remove_features_udf = udf(remove_features, StringType())

In [151]:
df = df.withColumn('removed',remove_features_udf(df['fixed_abbrev']))
df.show(5,True)

+--------------------+----------+--------------------+--------------------+--------------------+
|                text|      time|       text_non_asci|        fixed_abbrev|             removed|
+--------------------+----------+--------------------+--------------------+--------------------+
|Brilliant thread....|2020-08-31|Brilliant thread....|brilliant thread....|brilliant thread ...|
|@TimWilsonMP Have...|2020-08-31|@TimWilsonMP Have...|@timwilsonmp have...|have the courage ...|
|@Rdene915 @Christ...|2020-08-31|@Rdene915 @Christ...|@rdene915 @christ...|build it and they...|
|A photo of you in...|2020-08-31|A photo of you in...|a photo of you in...|photo of you in j...|
|@IndyCat14 @JoyOf...|2020-08-31|@IndyCat14 @JoyOf...|@indycat14 @joyof...|                    |
+--------------------+----------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [152]:
from pyspark.sql.types import FloatType

from textblob import TextBlob

def sentiment_analysis(text):
    return TextBlob(text).sentiment.polarity

sentiment_analysis_udf = udf(sentiment_analysis , FloatType())

In [153]:
df  = df.withColumn("sentiment_score", sentiment_analysis_udf( df['removed'] ))
df.show(5,True)

+--------------------+----------+--------------------+--------------------+--------------------+---------------+
|                text|      time|       text_non_asci|        fixed_abbrev|             removed|sentiment_score|
+--------------------+----------+--------------------+--------------------+--------------------+---------------+
|Brilliant thread....|2020-08-31|Brilliant thread....|brilliant thread....|brilliant thread ...|     0.53333336|
|@TimWilsonMP Have...|2020-08-31|@TimWilsonMP Have...|@timwilsonmp have...|have the courage ...|            0.0|
|@Rdene915 @Christ...|2020-08-31|@Rdene915 @Christ...|@rdene915 @christ...|build it and they...|            0.0|
|A photo of you in...|2020-08-31|A photo of you in...|a photo of you in...|photo of you in j...|            0.0|
|@IndyCat14 @JoyOf...|2020-08-31|@IndyCat14 @JoyOf...|@indycat14 @joyof...|                    |            0.0|
+--------------------+----------+--------------------+--------------------+--------------------+

In [154]:
def condition(r):
    if (r >=0.1):
        label = "positive"
    elif(r <= -0.1):
        label = "negative"
    else:
        label = "neutral"
    return label

sentiment_udf = udf(lambda x: condition(x), StringType())
df  = df.withColumn("sentiment", sentiment_udf( df['sentiment_score'] ))
SA_results = df.select('text','time','sentiment_score','sentiment')

In [155]:
SA_results.show(90)

+--------------------+----------+---------------+---------+
|                text|      time|sentiment_score|sentiment|
+--------------------+----------+---------------+---------+
|Brilliant thread....|2020-08-31|     0.53333336| positive|
|@TimWilsonMP Have...|2020-08-31|            0.0|  neutral|
|@Rdene915 @Christ...|2020-08-31|            0.0|  neutral|
|A photo of you in...|2020-08-31|            0.0|  neutral|
|@IndyCat14 @JoyOf...|2020-08-31|            0.0|  neutral|
|@BuckleyIOP It re...|2020-08-31|     0.14621212| positive|
|@kruevans But you...|2020-08-31|            0.0|  neutral|
|@johniadarola I d...|2020-08-31|            0.4| positive|
|My sister turns 7...|2020-08-31|            0.0|  neutral|
|@kruevans It’s sh...|2020-08-31|            0.0|  neutral|
|@runwader Cannot ...|2020-08-31|     0.53333336| positive|
|@abcmelbourne mor...|2020-08-31|   -0.033333335|  neutral|
|#VICTORIA #Parlia...|2020-08-31|            0.0|  neutral|
|@profsarahj Sadly...|2020-08-31|       

In [156]:
SA_results.groupBy(['time','sentiment'])\
            .count()\
            .orderBy("time").show()

+----------+---------+-----+
|      time|sentiment|count|
+----------+---------+-----+
|2020-08-31|  neutral|   49|
|2020-08-31| negative|   12|
|2020-08-31| positive|   39|
+----------+---------+-----+

