In [1]:
import pandas as pd
import ijson
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

In [2]:
# Column buffers filled while streaming the tweet dump; retweets are skipped.
time = []
text = []
city = []
location = []
# Open in binary mode: ijson expects bytes and decodes the JSON as UTF-8
# itself, so parsing no longer depends on the platform's default text encoding.
with open('data.json', 'rb') as f:
    # Stream the elements of the top-level "results" array one at a time
    # instead of loading the whole file into memory.
    objects = ijson.items(f, 'results.item')
    for row in objects:
        if row['text'].startswith("RT"):
            continue
        # "place" can be null on a tweet (the Twitter API marks the field as
        # nullable); record a missing city instead of failing on None['name'].
        place = row.get('place') or {}
        city.append(place.get('name'))
        location.append(row['coordinates'])
        time.append(row['created_at'])
        text.append(row['text'])

data = pd.DataFrame(
    {"text": text, "time": time, "city": city, "location": location},
    columns=["text", "time", "city", "location"],
)
# Keep only the calendar date of each tweet's creation timestamp.
data['time'] = pd.to_datetime(data['time']).dt.date

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
# from textblob import TextBlob
from pyspark.sql import SQLContext

In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re

# remove non ASCII characters
def strip_non_ascii(data_str):
    """Drop every character whose code point is outside the range 1..126.

    Args:
        data_str: The text to filter.

    Returns:
        The input with NUL (0), DEL (127) and every character above 127
        removed; all remaining characters keep their original order.
    """
    kept = [ch for ch in data_str if 1 <= ord(ch) <= 126]
    return ''.join(kept)
# Wrap strip_non_ascii as a Spark UDF whose result column has string type.
strip_non_ascii_udf = udf(strip_non_ascii, StringType())


In [5]:
# import findspark
# findspark.init()
# Get the existing Spark session, or create one with the application name 'NLP'.
spark = SparkSession.builder.appName('NLP').getOrCreate()

In [6]:
# Convert the pandas frame to a Spark DataFrame and add the filtered text
# produced by strip_non_ascii_udf as the 'text_non_asci' column.
df = spark.createDataFrame(data).withColumn(
    'text_non_asci',
    strip_non_ascii_udf(F.col('text')),
)
df.show()

+--------------------+----------+---------+--------+--------------------+
|                text|      time|     city|location|       text_non_asci|
+--------------------+----------+---------+--------+--------------------+
|Brilliant thread....|2020-08-31|Melbourne|    null|Brilliant thread....|
|@TimWilsonMP Have...|2020-08-31|Melbourne|    null|@TimWilsonMP Have...|
|@Rdene915 @Christ...|2020-08-31|Melbourne|    null|@Rdene915 @Christ...|
|A photo of you in...|2020-08-31|Melbourne|    null|A photo of you in...|
|@IndyCat14 @JoyOf...|2020-08-31|Melbourne|    null|@IndyCat14 @JoyOf...|
|@BuckleyIOP It re...|2020-08-31|Melbourne|    null|@BuckleyIOP It re...|
|@kruevans But you...|2020-08-31|Melbourne|    null|@kruevans But you...|
|@johniadarola I d...|2020-08-31|Melbourne|    null|@johniadarola I d...|
|My sister turns 7...|2020-08-31|Melbourne|    null|My sister turns 7...|
|@kruevans It’s sh...|2020-08-31|Melbourne|    null|@kruevans Its shi...|
|@runwader Cannot ...|2020-08-31|Melbo

In [7]:
# Ordered (pattern, replacement) rules for common chat shorthand. They are
# compiled once and applied top to bottom to the lower-cased text.
_ABBREVIATION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'\bthats\b', 'that is'),
        (r'\bive\b', 'i have'),
        (r'\bim\b', 'i am'),
        (r'\bya\b', 'yeah'),
        (r'\bcant\b', 'can not'),
        (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
        (r'\bid\b', 'i would'),
        # Unanchored: also matches inside longer tokens such as "wtfff".
        (r'wtf', 'what the fuck'),
        (r'\bwth\b', 'what the hell'),
        (r'\br\b', 'are'),
        (r'\bu\b', 'you'),
        (r'\bk\b', 'OK'),
        (r'\bsux\b', 'sucks'),
        # Collapse stretched words: "nooo" -> "no", "cooo" -> "cool".
        (r'\bno+\b', 'no'),
        (r'\bcoo+\b', 'cool'),
        # Remove the standalone retweet marker only. The leading \b is
        # required: without it the rule also cut "rt" off the end of ordinary
        # words ("smart" -> "sma", "support" -> "suppo").
        (r'\brt\b', ''),
    )
)


def fix_abbreviation(data_str):
    """Lower-case the text and expand common chat abbreviations.

    Args:
        data_str: The text to normalise.

    Returns:
        The lower-cased text with every rule in ``_ABBREVIATION_RULES``
        applied in order and leading/trailing whitespace stripped.
    """
    data_str = data_str.lower()
    for pattern, replacement in _ABBREVIATION_RULES:
        data_str = pattern.sub(replacement, data_str)
    return data_str.strip()

# Wrap fix_abbreviation as a Spark UDF whose result column has string type.
fix_abbreviation_udf = udf(fix_abbreviation, StringType())

In [8]:
# Add the abbreviation-expanded text as 'fixed_abbrev' and preview five rows.
df = df.withColumn(
    'fixed_abbrev',
    fix_abbreviation_udf(F.col('text_non_asci')),
)
df.show(n=5, truncate=True)

+--------------------+----------+---------+--------+--------------------+--------------------+
|                text|      time|     city|location|       text_non_asci|        fixed_abbrev|
+--------------------+----------+---------+--------+--------------------+--------------------+
|Brilliant thread....|2020-08-31|Melbourne|    null|Brilliant thread....|brilliant thread....|
|@TimWilsonMP Have...|2020-08-31|Melbourne|    null|@TimWilsonMP Have...|@timwilsonmp have...|
|@Rdene915 @Christ...|2020-08-31|Melbourne|    null|@Rdene915 @Christ...|@rdene915 @christ...|
|A photo of you in...|2020-08-31|Melbourne|    null|A photo of you in...|a photo of you in...|
|@IndyCat14 @JoyOf...|2020-08-31|Melbourne|    null|@IndyCat14 @JoyOf...|@indycat14 @joyof...|
+--------------------+----------+---------+--------+--------------------+--------------------+
only showing top 5 rows



In [13]:
# Patterns used by remove_features, compiled once instead of on every call.
# A URL runs from its scheme to the next whitespace, so hyphens, query strings
# and multi-part domains are removed in full rather than leaving fragments.
_URL_RE = re.compile(r'https?://\S+')
_MENTION_RE = re.compile(r'@\w+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUMBER_RE = re.compile(r'\d+')
_TOKEN_RE = re.compile(r'[a-z0-9_.]+')


def remove_features(data_str):
    """Strip URLs, @mentions, punctuation, digits and one-letter words.

    Args:
        data_str: The text to clean.

    Returns:
        The lower-cased text reduced to the words that are longer than one
        character and consist only of ``a-z``, ``0-9``, ``_`` or ``.``,
        joined by single spaces. An empty string is returned when no word
        survives.
    """
    data_str = data_str.lower()
    # Order matters: URLs and mentions must go before punctuation removal,
    # which would otherwise destroy the "://" and "@" they are anchored on.
    for pattern in (_URL_RE, _MENTION_RE, _PUNCT_RE, _NUMBER_RE):
        data_str = pattern.sub(' ', data_str)
    # split() already discards repeated whitespace, so filtering the words
    # and re-joining them yields a single-spaced result.
    kept = [
        word
        for word in data_str.split()
        if len(word) > 1 and _TOKEN_RE.fullmatch(word)
    ]
    return ' '.join(kept)
# Wrap remove_features as a Spark UDF whose result column has string type.
remove_features_udf = udf(remove_features, StringType())

In [15]:
# Add the cleaned text as the 'removed' column and preview five rows.
df = df.withColumn(
    'removed',
    remove_features_udf(F.col('fixed_abbrev')),
)
df.show(n=5, truncate=True)

+--------------------+----------+---------+--------+--------------------+--------------------+--------------------+
|                text|      time|     city|location|       text_non_asci|        fixed_abbrev|             removed|
+--------------------+----------+---------+--------+--------------------+--------------------+--------------------+
|Brilliant thread....|2020-08-31|Melbourne|    null|Brilliant thread....|brilliant thread....|brilliant thread ...|
|@TimWilsonMP Have...|2020-08-31|Melbourne|    null|@TimWilsonMP Have...|@timwilsonmp have...|have the courage ...|
|@Rdene915 @Christ...|2020-08-31|Melbourne|    null|@Rdene915 @Christ...|@rdene915 @christ...|build it and they...|
|A photo of you in...|2020-08-31|Melbourne|    null|A photo of you in...|a photo of you in...|photo of you in j...|
|@IndyCat14 @JoyOf...|2020-08-31|Melbourne|    null|@IndyCat14 @JoyOf...|@indycat14 @joyof...|                    |
+--------------------+----------+---------+--------+--------------------

In [16]:
from pyspark.sql.types import FloatType

from textblob import TextBlob

def sentiment_analysis(text):
    """Return the polarity that TextBlob's sentiment analysis reports for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Wrap sentiment_analysis as a Spark UDF whose result column has float type.
sentiment_analysis_udf = udf(sentiment_analysis , FloatType())

In [17]:
# Score the cleaned text and store the polarity as 'sentiment_score'.
df = df.withColumn(
    "sentiment_score",
    sentiment_analysis_udf(F.col('removed')),
)
df.show(n=5, truncate=True)

+--------------------+----------+---------+--------+--------------------+--------------------+--------------------+---------------+
|                text|      time|     city|location|       text_non_asci|        fixed_abbrev|             removed|sentiment_score|
+--------------------+----------+---------+--------+--------------------+--------------------+--------------------+---------------+
|Brilliant thread....|2020-08-31|Melbourne|    null|Brilliant thread....|brilliant thread....|brilliant thread ...|     0.53333336|
|@TimWilsonMP Have...|2020-08-31|Melbourne|    null|@TimWilsonMP Have...|@timwilsonmp have...|have the courage ...|            0.0|
|@Rdene915 @Christ...|2020-08-31|Melbourne|    null|@Rdene915 @Christ...|@rdene915 @christ...|build it and they...|            0.0|
|A photo of you in...|2020-08-31|Melbourne|    null|A photo of you in...|a photo of you in...|photo of you in j...|            0.0|
|@IndyCat14 @JoyOf...|2020-08-31|Melbourne|    null|@IndyCat14 @JoyOf...|@in

In [None]:
def condition(r):
    """Map a polarity score to a sentiment label.

    Args:
        r: The numeric polarity score.

    Returns:
        "positive" when ``r`` is at least 0.1, "negative" when ``r`` is at
        most -0.1, and "neutral" otherwise.
    """
    if r >= 0.1:
        return "positive"
    if r <= -0.1:
        return "negative"
    return "neutral"

# Label each row from its polarity score, then keep only the columns needed
# for reporting.
sentiment_udf = udf(condition, StringType())
df = df.withColumn("sentiment", sentiment_udf(F.col('sentiment_score')))
SA_results = df.select('text', 'time', 'sentiment_score', 'sentiment')

In [None]:
# Print the first 90 rows of the labelled results.
SA_results.show(90)

In [None]:
# Count tweets per (date, sentiment label) pair and list them by date.
(
    SA_results
    .groupBy('time', 'sentiment')
    .count()
    .orderBy("time")
    .show()
)