# Chapter 10: Spark Streaming
## Ex1: Pre-processing Data from Tweets
### Requirement:
- Read data from file (Tweets)
- Pre-process data
- Save data after pre-processing to new file.

In [1]:
# pip install textblob
import csv
from textblob import TextBlob

### SENTIMENT ANALYSIS
- The sentiment polarity of a text describes the orientation of the sentiment it expresses, i.e. whether the author's attitude towards the entity under consideration is positive, negative or neutral.
- Polarity is a float in the range [-1, 1], where 1 means a positive statement and -1 means a negative statement. Subjective sentences generally express personal opinion, emotion or judgment, whereas objective sentences convey factual information. Subjectivity is also a float, in the range [0, 1], where 0 is fully objective and 1 is fully subjective.

- https://planspace.org/20150607-textblob_sentiment/
- https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/#:~:text=Polarity%20is%20float%20which%20lies,of%20%5B0%2C1%5D.

In [2]:
# Positive example: .sentiment exposes both a polarity and a subjectivity score.
TextBlob("Today is a good day!").sentiment

Sentiment(polarity=0.875, subjectivity=0.6000000000000001)

In [3]:
# Negated example: the recorded polarity for this sentence is negative.
TextBlob("Today is not a good day and he doesn't want to go to shool.").sentiment

Sentiment(polarity=-0.35, subjectivity=0.6000000000000001)

In [4]:
# Translation and Language Detection
# Vietnamese sample text used by the detect_language() and translate() calls below.
blob = TextBlob("Sáng nay trời u ám quá")

In [5]:
# Ask TextBlob for the language code of the text held in `blob`.
# NOTE(review): detect_language() appears to be deprecated/removed in newer TextBlob
# releases — confirm against the installed version before relying on this cell.
blob.detect_language()

'vi'

In [6]:
# Translate the Vietnamese sample held in `blob` into English.
blob.translate(to="en", from_lang="vi")

TextBlob("It's so overcast this morning")

### PRE-PROCESSING TEXT DATA

In [7]:
import pandas as pd

In [8]:
# Location of the tweet text file, relative to the notebook's working directory.
tweetdata = '../../Data/tweets_covid_19.txt'

# Parallel accumulators: position i in each list describes the same record.
sentences, sentiment_polarity, sentiment_subjectivity = [], [], []

In [9]:
# Start from empty accumulators so re-running this cell does not append duplicates.
sentences.clear()
sentiment_polarity.clear()
sentiment_subjectivity.clear()

# newline='' is the mode the csv module asks for on files handed to csv.reader.
with open(tweetdata, 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        # A blank line is parsed as an empty row; row[0] would raise IndexError.
        if not row:
            continue
        # Only the first comma-separated field is analysed.
        # NOTE(review): text containing commas is therefore cut at the first comma
        # (see "b'Welp" in the recorded output) — confirm this is intended.
        sentence = row[0]
        # Skip records carrying the "Error on_data" marker.
        if "Error on_data" in sentence:
            continue
        # Build the TextBlob only for records that are kept.
        blob = TextBlob(sentence)
        sentiment = blob.sentiment
        print(sentence)
        print(sentiment.polarity, sentiment.subjectivity)
        sentences.append(sentence)
        sentiment_polarity.append(sentiment.polarity)
        sentiment_subjectivity.append(sentiment.subjectivity)

Listening on port: 5555
0.0 0.0
Received request from: ('127.0.0.1'
-0.75 1.0
b'RT @VincentRK: A lot of COVID deaths in India in young people who should be doing well &amp; recovering. \n\nI am making a plea to doctors in Ind\xe2\x80\xa6'
0.1 0.4
b'Son ideas m\xc3\xadas o ya todo est\xc3\xa1 normal aqu\xc3\xad?? El covid?'
0.15 0.6499999999999999
b'Welp
0.0 0.0
b'RT @RogerMarshallMD: Last Jan
0.0 0.06666666666666667
b'RT @RealEspartaB: Yo no quiero que Zidane se vaya. Siento que es la persona id\xc3\xb3nea para el puesto de entrenador del Madrid con todo y sus e\xe2\x80\xa6'
0.0 0.0
b'RT @nvcmenon: Pinki Aggarwal O+ve Covid patient admitted in \nGandhi Nursing Home\nUttam Nagar Delhi urgently needs plasma from recovered Cov\xe2\x80\xa6'
0.0 0.0
b'RT @EIJefeDiego: El mal llamado presidente argument\xc3\xb3 que la vacuna contra COVID-19 no ser\xc3\xada distribuida por la IP para garantizar que acces\xe2\x80\xa6'
0.0 0.0
b"RT @MajorPoonia: Listen to India\xe2\x80\x99s pride and world's re

In [10]:
# One row per kept record; the three lists are aligned by position.
data = pd.DataFrame(
    {
        "sentence": sentences,
        "sentiment_polarity": sentiment_polarity,
        "sentiment_subjectivity": sentiment_subjectivity,
    }
)

In [11]:
# Rows 0 and 1 are not tweets: they are the "Listening on port" and
# "Received request from" lines seen at the top of the recorded output.
# errors="ignore" keeps the cell re-runnable; once the labels are gone a plain
# drop would raise KeyError.
data = data.drop(index=[0, 1], errors="ignore")

In [12]:
# Strip only the leading bytes-literal marker (b' or b").  Replacing every "b'"
# would also remove that pair inside the text, e.g. "club's" -> "clus".
data["sentence"] = data["sentence"].str.replace(r"^b['\"]", "", regex=True)

In [13]:
# Preview the first rows after the clean-up steps above.
data.head()

Unnamed: 0,sentence,sentiment_polarity,sentiment_subjectivity
2,RT @VincentRK: A lot of COVID deaths in India ...,0.1,0.4
3,Son ideas m\xc3\xadas o ya todo est\xc3\xa1 no...,0.15,0.65
4,Welp,0.0,0.0
5,RT @RogerMarshallMD: Last Jan,0.0,0.066667
6,RT @RealEspartaB: Yo no quiero que Zidane se v...,0.0,0.0


In [14]:
# Summarise row count, column dtypes and non-null counts of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4224 entries, 2 to 4225
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sentence                4224 non-null   object 
 1   sentiment_polarity      4224 non-null   float64
 2   sentiment_subjectivity  4224 non-null   float64
dtypes: float64(2), object(1)
memory usage: 132.0+ KB


In [15]:
# data.to_csv("tweets_covid_19_2310.csv")

### Another solution: Build a function that reads the txt file and converts it to a csv file

In [16]:
def _strip_bytes_prefix(text):
    """Remove a leading ``b'`` / ``b"`` marker and leave the rest of the text untouched."""
    if text.startswith(("b'", 'b"')):
        return text[2:]
    return text


def read_and_pre_pro(file_in, file_out):
    """Score each record of a text file with TextBlob and save the result as CSV.

    The input is parsed with ``csv.reader`` and only the first field of every
    row is used. Blank lines and rows whose first field contains
    ``"Error on_data"`` are skipped. Each remaining text is scored as read
    from the file; a leading bytes-literal marker (``b'`` or ``b"``) is then
    removed from the text that gets stored.

    NOTE(review): because the file is parsed as CSV, text containing commas is
    cut at the first comma — confirm that this truncation is intended.

    Parameters
    ----------
    file_in : str or path-like
        Text file to read.
    file_out : str or path-like
        Destination CSV. It holds the DataFrame index followed by the columns
        ``sentence``, ``sentiment_polarity`` and ``sentiment_subjectivity``.

    Returns
    -------
    None
    """
    sentences = []
    sentiment_polarity = []
    sentiment_subjectivity = []

    # newline='' is the mode the csv module asks for on files handed to csv.reader.
    with open(file_in, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # A blank line is parsed as an empty row; row[0] would raise IndexError.
            if not row:
                continue
            sentence = row[0]
            if "Error on_data" in sentence:
                continue
            # Build the TextBlob only for records that are kept.
            sentiment = TextBlob(sentence).sentiment
            # Strip just the leading marker: replacing every "b'" would also
            # damage text such as "club's".
            sentences.append(_strip_bytes_prefix(sentence))
            sentiment_polarity.append(sentiment.polarity)
            sentiment_subjectivity.append(sentiment.subjectivity)

    # The input file is closed at this point; only the output is written below.
    data = pd.DataFrame({"sentence": sentences,
                         "sentiment_polarity": sentiment_polarity,
                         "sentiment_subjectivity": sentiment_subjectivity})
    data.to_csv(file_out)

In [18]:
# Run the pipeline: read the tweet text file and write the scored CSV into the
# current working directory.
file_in = "../../Data/tweets_covid_19.txt"
file_out = "tweets_covid_19_2310.csv"
read_and_pre_pro(file_in, file_out)

In [19]:
# Reload the CSV written by read_and_pre_pro; index_col=0 uses the first CSV
# column (the saved row index) as the DataFrame index.
df = pd.read_csv("tweets_covid_19_2310.csv", index_col=0)

In [20]:
# Summarise the reloaded frame before any rows are removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4226 entries, 0 to 4225
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sentence                4226 non-null   object 
 1   sentiment_polarity      4226 non-null   float64
 2   sentiment_subjectivity  4226 non-null   float64
dtypes: float64(2), object(1)
memory usage: 132.1+ KB


In [21]:
# Preview the first rows; the two leading records are log lines, not tweets.
df.head()

Unnamed: 0,sentence,sentiment_polarity,sentiment_subjectivity
0,Listening on port: 5555,0.0,0.0
1,Received request from: ('127.0.0.1',-0.75,1.0
2,RT @VincentRK: A lot of COVID deaths in India ...,0.1,0.4
3,Son ideas m\xc3\xadas o ya todo est\xc3\xa1 no...,0.15,0.65
4,Welp,0.0,0.0


In [22]:
# Flag the rows whose text contains the "Listening on port" line.
is_listening_row = df['sentence'].str.contains("Listening on port")
# Collect the index labels of the flagged rows ...
indexNames = df.loc[is_listening_row].index
# ... and remove those rows from the frame.
df = df.drop(index=indexNames)

In [23]:
# Flag the rows whose text contains the "Received request from" line.
is_request_row = df['sentence'].str.contains("Received request from")
# Collect the index labels of the flagged rows ...
indexNames = df.loc[is_request_row].index
# ... and remove those rows from the frame.
df = df.drop(index=indexNames)

In [24]:
# Confirm the row count after the two log lines were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4224 entries, 2 to 4225
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sentence                4224 non-null   object 
 1   sentiment_polarity      4224 non-null   float64
 2   sentiment_subjectivity  4224 non-null   float64
dtypes: float64(2), object(1)
memory usage: 132.0+ KB


In [25]:
# Preview the filtered frame; it should now start at the first tweet.
df.head()

Unnamed: 0,sentence,sentiment_polarity,sentiment_subjectivity
2,RT @VincentRK: A lot of COVID deaths in India ...,0.1,0.4
3,Son ideas m\xc3\xadas o ya todo est\xc3\xa1 no...,0.15,0.65
4,Welp,0.0,0.0
5,RT @RogerMarshallMD: Last Jan,0.0,0.066667
6,RT @RealEspartaB: Yo no quiero que Zidane se v...,0.0,0.0
