# PySpark - Project Tweets

In [1]:
# Load the libraries
import os
import numpy as np
import pandas as pd
from pyspark.sql.functions import to_timestamp
from pyspark import SparkConf

from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer

## Initialize Spark Session

In [2]:
spark = SparkSession.builder.appName('project_tweets').getOrCreate()

24/05/10 10:33:08 WARN Utils: Your hostname, muhammad-Vm resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/05/10 10:33:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/10 10:33:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the Dataset

In [3]:
# Load the ProjectTweets into Hadoop in the named folder 'CA2BD'

data = spark.read.csv('hdfs://localhost:9000/user/hduser/CA2BD/New_Tweets.csv', header=True, inferSchema=True)




In [4]:
# Display the structure of schema
data.printSchema()

root
 |-- ids: long (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [5]:
num_rows = data.count()
print(f"Number of rows: {num_rows}")


num_columns = len(data.columns)
print(f"Number of columns: {num_columns}")

Number of rows: 100000
Number of columns: 5


In [7]:
data.summary().show()

24/05/10 10:33:20 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+--------------------+--------------------+--------+-------------------+--------------------+
|summary|                 ids|                date|    flag|               user|                text|
+-------+--------------------+--------------------+--------+-------------------+--------------------+
|  count|              100000|              100000|  100000|             100000|              100000|
|   mean|  1.99847366845205E9|                NULL|    NULL|3.781014978571428E7|                NULL|
| stddev|1.9376617856586394E8|                NULL|    NULL|9.776457413992319E7|                NULL|
|    min|          1467812025|Fri Apr 17 20:31:...|NO_QUERY|           007peter|             i ju...|
|    25%|          1956824746|                NULL|    NULL|            78787.0|                NULL|
|    50%|          2002093382|                NULL|    NULL|           132057.0|                NULL|
|    75%|          2177004951|                NULL|    NULL|           892300.0|  

                                                                                

In [8]:
conf = SparkConf().set("spark.sql.legacy.timeParserPolicy", "LEGACY")
spark = spark.builder.config(conf=conf).getOrCreate()

In [9]:
tweets = data.withColumn("date", to_timestamp(data["date"], "EEE MMM dd HH:mm:ss zzz yyyy"))

In [10]:
tweets = tweets.select("ids", "date", "user", "text")

In [11]:
tweets.show(truncate=True, n=5)

+----------+-------------------+---------------+--------------------+
|       ids|               date|           user|                text|
+----------+-------------------+---------------+--------------------+
|1976753780|2009-05-31 02:58:12|       kimkins1|umm i kinda wanna...|
|1793549629|2009-05-14 11:07:11|       clinamen|testing #talkings...|
|1974118439|2009-05-30 20:28:45|      JohanCITI|I now have 2 twit...|
|2064596548|2009-06-07 14:41:36|MissDaisyTurner|I HAVE TO SAY IT....|
|2055168891|2009-06-06 16:49:43|      wrwarrick|got the feeds wor...|
+----------+-------------------+---------------+--------------------+
only showing top 5 rows



In [12]:
tweets.printSchema()

root
 |-- ids: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [13]:
from pyspark.sql.functions import year, month, count, format_string, col

In [14]:
df = tweets.groupBy(year("date").alias("year"), month("date").alias("month")).count() \
                 .orderBy(["year", "month"])


df = df.withColumn("percentage", format_string("%.2f%%", ((col("count")/tweets.count())*100)))


df.show(truncate=False)



+----+-----+-----+----------+
|year|month|count|percentage|
+----+-----+-----+----------+
|2009|4    |6336 |6.34%     |
|2009|5    |34595|34.60%    |
|2009|6    |59069|59.07%    |
+----+-----+-----+----------+



                                                                                

# Sentiment Analysis

## Logistic Regression Classifier Model

In [15]:
#import modules
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [16]:
#read csv file into dataFrame with automatically inferred schema
tweets_csv = spark.read.csv("hdfs://localhost:9000/user/hduser/CA2BD/tweets.csv", inferSchema=True, header=True)
tweets_csv.show(truncate=False, n=3)

+------+---------+---------------+---------------------------------+
|ItemID|Sentiment|SentimentSource|SentimentText                    |
+------+---------+---------------+---------------------------------+
|1038  |1        |Sentiment140   |that film is fantastic #brilliant|
|1804  |1        |Sentiment140   |this music is really bad #myband |
|1693  |0        |Sentiment140   |winter is terrible #thumbs-down  |
+------+---------+---------------+---------------------------------+
only showing top 3 rows



In [17]:
#select only "SentimentText" and "Sentiment" column, 
#and cast "Sentiment" column data into integer
data = tweets_csv.select("SentimentText", col("Sentiment").cast("Int").alias("label"))
data.show(truncate = False,n=5)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



In [18]:
#divide data, 70% for training, 30% for testing
dividedData = data.randomSplit([0.7, 0.3]) 
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 1379 ; Testing data rows: 553


In [19]:
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [20]:
tokenizer = Tokenizer(inputCol="SentimentText", outputCol="SentimentWords")
tokenizedTrain = tokenizer.transform(trainingData)
tokenizedTrain.show(truncate=False, n=5)

+---------------------------------+-----+---------------------------------------+
|SentimentText                    |label|SentimentWords                         |
+---------------------------------+-----+---------------------------------------+
|I adore cheese #brilliant        |1    |[i, adore, cheese, #brilliant]         |
|I adore cheese #favorite         |1    |[i, adore, cheese, #favorite]          |
|I adore cheese #loveit           |1    |[i, adore, cheese, #loveit]            |
|I adore cheese #thumbs-up        |1    |[i, adore, cheese, #thumbs-up]         |
|I adore classical music #bestever|1    |[i, adore, classical, music, #bestever]|
+---------------------------------+-----+---------------------------------------+
only showing top 5 rows



In [21]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(truncate=False, n=5)

+---------------------------------+-----+---------------------------------------+------------------------------------+
|SentimentText                    |label|SentimentWords                         |MeaningfulWords                     |
+---------------------------------+-----+---------------------------------------+------------------------------------+
|I adore cheese #brilliant        |1    |[i, adore, cheese, #brilliant]         |[adore, cheese, #brilliant]         |
|I adore cheese #favorite         |1    |[i, adore, cheese, #favorite]          |[adore, cheese, #favorite]          |
|I adore cheese #loveit           |1    |[i, adore, cheese, #loveit]            |[adore, cheese, #loveit]            |
|I adore cheese #thumbs-up        |1    |[i, adore, cheese, #thumbs-up]         |[adore, cheese, #thumbs-up]         |
|I adore classical music #bestever|1    |[i, adore, classical, music, #bestever]|[adore, classical, music, #bestever]|
+---------------------------------+-----+-------

In [22]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
numericTrainData = hashTF.transform(SwRemovedTrain).select(
    'label', 'MeaningfulWords', 'features')
numericTrainData.show(truncate=False, n=3)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[1689,100089,108624],[1.0,1.0,1.0])|
|1    |[adore, cheese, #loveit]   |(262144,[1689,100089,254974],[1.0,1.0,1.0])|
+-----+---------------------------+-------------------------------------------+
only showing top 3 rows



## Train our classifier model using training data

In [23]:
lr = LogisticRegression(labelCol="label", featuresCol="features", 
                        maxIter=10, regParam=0.01)
model = lr.fit(numericTrainData)
print ("Training is done!")

Training is done!


## Training the model

In [24]:
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest).select(
    'Label', 'MeaningfulWords', 'features')
numericTest.show(truncate=False, n=2)

+-----+---------------------------+------------------------------------------+
|Label|MeaningfulWords            |features                                  |
+-----+---------------------------+------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[1689,91011,100089],[1.0,1.0,1.0])|
|1    |[adore, cheese, #toptastic]|(262144,[1689,42010,100089],[1.0,1.0,1.0])|
+-----+---------------------------+------------------------------------------+
only showing top 2 rows



## Predicting testing data

In [25]:
#import modules
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [26]:
prediction = model.transform(numericTest)
predictionFinal = prediction.select(
    "MeaningfulWords", "prediction", "Label")
predictionFinal.show(n=4, truncate = False)
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['Label']).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", totalData, 
      ", accuracy:", correctPrediction/totalData)

+-------------------------------------+----------+-----+
|MeaningfulWords                      |prediction|Label|
+-------------------------------------+----------+-----+
|[adore, cheese, #bestever]           |1.0       |1    |
|[adore, cheese, #toptastic]          |1.0       |1    |
|[adore, classical, music, #brilliant]|1.0       |1    |
|[adore, classical, music, #favorite] |1.0       |1    |
+-------------------------------------+----------+-----+
only showing top 4 rows

correct prediction: 542 , total data: 553 , accuracy: 0.9801084990958409


## Tweets Dataset - cleaning

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace

In [28]:
# Definição de expressões regulares
at_regex = r"@\w+"  # Remove usernames
link_regex = r"http\S+"  # Remove links
rt_regex = r'\bRT\b'  # Remove 'RT'
ss_regex = r'[^\w\s]'  # Remove Special strings
ds_regex = r'\s+'  # Remove spaces

In [29]:
tweets= tweets.withColumn("clean_tweet", regexp_replace("text", at_regex, ""))
tweets= tweets.withColumn("clean_tweet", regexp_replace("clean_tweet", link_regex, ""))
tweets= tweets.withColumn("clean_tweet", regexp_replace("clean_tweet", rt_regex, ""))
tweets= tweets.withColumn("clean_tweet", regexp_replace("clean_tweet", ss_regex, ""))
tweets= tweets.withColumn("clean_tweet", regexp_replace("clean_tweet", ds_regex, " "))

# Exibição dos resultados
tweets.show(truncate=False, n=5)

+----------+-------------------+---------------+---------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+
|ids       |date               |user           |text                                                                                   |clean_tweet                                                                    |
+----------+-------------------+---------------+---------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+
|1976753780|2009-05-31 02:58:12|kimkins1       |umm i kinda wanna turn into a dinosaur and eat your face is that okay?? RAWR!!!    hehe|umm i kinda wanna turn into a dinosaur and eat your face is that okay RAWR hehe|
|1793549629|2009-05-14 11:07:11|clinamen       |testing #talkingshop (1,2)                                                          

In [30]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF

In [31]:
tokenizer = Tokenizer(inputCol="clean_tweet", outputCol="words")
tokenizedData = tokenizer.transform(tweets)

swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="MeaningfulWords")
SwRemoved = swr.transform(tokenizedData)

hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
numericData = hashTF.transform(SwRemoved).select('MeaningfulWords', 'features')


numericData.show(n=3)

+--------------------+--------------------+
|     MeaningfulWords|            features|
+--------------------+--------------------+
|[umm, kinda, wann...|(262144,[44094,88...|
|[testing, talking...|(262144,[7441,137...|
|[2, twitter, acco...|(262144,[1512,125...|
+--------------------+--------------------+
only showing top 3 rows



## Predictiong Data

In [32]:
#IMPORTING LIBRARIE
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark import SparkConf
from pyspark.sql.functions import to_timestamp


import warnings
warnings.filterwarnings('ignore')

In [33]:
prediction = model.transform(numericData)

predictionFinal = prediction.select(
    "MeaningfulWords", "prediction")

In [34]:
predictionFinal.show(truncate = False, n=10)

+-----------------------------------------------------------------------------------------+----------+
|MeaningfulWords                                                                          |prediction|
+-----------------------------------------------------------------------------------------+----------+
|[umm, kinda, wanna, turn, dinosaur, eat, face, okay, rawr, hehe]                         |0.0       |
|[testing, talkingshop, 12]                                                               |0.0       |
|[2, twitter, accounts, use]                                                              |0.0       |
|[say, zac, efron, dream, boy, ok, said]                                                  |0.0       |
|[got, feeds, working, website, time, rejoin, world]                                      |0.0       |
|[, finale, one, bungees, broke, emergency, silkpink, rope, dropped, didnt, finish, meant]|0.0       |
|[summer, vacation, fun]                                                 

In [35]:
predictionFinal.count()

100000

In [36]:
# Create a column with id following the data's order 
tweets = tweets.withColumn("row_id", monotonically_increasing_id())
predictionFinal = predictionFinal.withColumn("row_id", monotonically_increasing_id())

# join by "row_id"
tweets_pred = tweets.select('row_id','date','user', 'text', 'clean_tweet') \
                    .join(predictionFinal.select('row_id', 'prediction'), "row_id", "inner")
                

# drop column 
tweets_pred = tweets_pred.drop("row_id")

tweets_pred.show()



+-------------------+---------------+--------------------+--------------------+----------+
|               date|           user|                text|         clean_tweet|prediction|
+-------------------+---------------+--------------------+--------------------+----------+
|2009-05-31 02:58:12|       kimkins1|umm i kinda wanna...|umm i kinda wanna...|       0.0|
|2009-05-14 11:07:11|       clinamen|testing #talkings...|testing talkingsh...|       0.0|
|2009-05-30 20:28:45|      JohanCITI|I now have 2 twit...|I now have 2 twit...|       0.0|
|2009-06-07 14:41:36|MissDaisyTurner|I HAVE TO SAY IT....|I HAVE TO SAY IT ...|       0.0|
|2009-06-06 16:49:43|      wrwarrick|got the feeds wor...|got the feeds wor...|       0.0|
|2009-06-17 13:19:24|       tamjay17|@cazzlar in the f...| in the finale on...|       0.0|
|2009-06-01 17:03:30|      mohehnick|Summer vacation i...|Summer vacation i...|       1.0|
|2009-06-15 11:46:06|    DanielTeixe|@joannalevesque H...| Hi JoJo Thanks f...|       0.0|

                                                                                

## Textblod and Varder

In [37]:
from textblob import TextBlob

In [38]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/hduser/nltk_data...


In [39]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Função de análise de sentimento com TextBlob
@udf(FloatType())
def sentiment(tweet):
    return TextBlob(tweet).sentiment.polarity

# Função de análise de sentimento com VADER
@udf(FloatType())
def sentiment_vader(tweet):
    sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(tweet)['compound']

# Aplicar diretamente no DataFrame
tweets_pred = tweets_pred.withColumn("textblob", sentiment(tweets_pred["clean_tweet"])) \
                         .withColumn("vader", sentiment_vader(tweets_pred["clean_tweet"]))

In [40]:
tweets_pred = tweets_pred.withColumn("score", ((col("prediction") + (col("textblob")*1.5) + (col("vader")*1.5)) / 4))

In [41]:
tweets_pred.select("clean_tweet", "prediction", "textblob", "vader", "score").show(n=10)

[Stage 62:>                                                         (0 + 1) / 1]

+--------------------+----------+----------+-------+--------------------+
|         clean_tweet|prediction|  textblob|  vader|               score|
+--------------------+----------+----------+-------+--------------------+
|umm i kinda wanna...|       0.0|       0.5| 0.2263| 0.27236250042915344|
|testing talkingsh...|       0.0|       0.0|    0.0|                 0.0|
|I now have 2 twit...|       0.0|       0.0|    0.0|                 0.0|
|I HAVE TO SAY IT ...|       0.0|       0.5| 0.4939| 0.37271250039339066|
|got the feeds wor...|       0.0|       0.0|    0.0|                 0.0|
| in the finale on...|       0.0|       0.0|-0.6597|-0.24738749116659164|
|Summer vacation i...|       1.0|       0.3| 0.5106|  0.5539749935269356|
| Hi JoJo Thanks f...|       0.0|       0.2| 0.8316| 0.38685000501573086|
|time flies when u...|       0.0|       0.0| -0.296|-0.11100000143051147|
| fair enough long...|       0.0|0.19659092| 0.2023|  0.1495840921998024|
+--------------------+----------+-----

                                                                                

In [42]:
tweets_pred.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- clean_tweet: string (nullable = true)
 |-- prediction: double (nullable = false)
 |-- textblob: float (nullable = true)
 |-- vader: float (nullable = true)
 |-- score: double (nullable = true)



## Saving on Hadoop

In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, dayofmonth
from hdfs import InsecureClient
from hdfs.util import HdfsError

def spark_hadoop(df, folder, partitionBy=None, spark=None):
    hdfs_base_path = "hdfs://localhost:9000"
    hdfs_folder_path = f"{hdfs_base_path}/CA2BD/{folder}"

    client = InsecureClient('http://localhost:9870', user='hduser')

    try:
       
        client.content(hdfs_folder_path)

        print('Os arquivos já estão no Hadoop. Lendo os arquivos.')
        df = spark.read.parquet(hdfs_folder_path)
    except HdfsError:
        print('Colocando no Hadoop.')
        if partitionBy:
            
            df = df.withColumn("year", year("date"))
            df = df.withColumn("month", month("date"))
            df = df.withColumn("day", dayofmonth("date"))
            
           
            df.write.partitionBy("year", "month", "day").parquet(hdfs_folder_path)
            print(f"Salvo em {hdfs_folder_path} particionado por {partitionBy}")
        else:
            df.write.parquet(hdfs_folder_path)
            print(f"Salvo em {hdfs_folder_path}")

    return df



df_salvo = spark_hadoop(tweets_pred, folder="sentiment", partitionBy="date", spark=spark)

Colocando no Hadoop.


                                                                                

Salvo em hdfs://localhost:9000/CA2BD/sentiment particionado por date


In [48]:
df_salvo.show(2)

[Stage 69:>                                                         (0 + 1) / 1]

+-------------------+--------+--------------------+--------------------+----------+--------+------+-------------------+----+-----+---+
|               date|    user|                text|         clean_tweet|prediction|textblob| vader|              score|year|month|day|
+-------------------+--------+--------------------+--------------------+----------+--------+------+-------------------+----+-----+---+
|2009-05-31 02:58:12|kimkins1|umm i kinda wanna...|umm i kinda wanna...|       0.0|     0.5|0.2263|0.27236250042915344|2009|    5| 31|
|2009-05-14 11:07:11|clinamen|testing #talkings...|testing talkingsh...|       0.0|     0.0|   0.0|                0.0|2009|    5| 14|
+-------------------+--------+--------------------+--------------------+----------+--------+------+-------------------+----+-----+---+
only showing top 2 rows



                                                                                