In [16]:
import os
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd

In [3]:
# Spark bootstrap for this notebook: builds the configuration and exposes
# `spark` (session) and `sc` (context) for the cells below.
SPARK_UI_PORT = 4041  # port that the JupyterHub proxy URL below refers to

conf = pyspark.SparkConf()
# Ask Spark to serve its UI on the same port the proxy URL points at, instead of
# relying on Spark's default port happening to be occupied.
conf.set('spark.ui.port', str(SPARK_UI_PORT))
# Only rewrite UI links for the JupyterHub proxy when actually running under
# JupyterHub; indexing os.environ directly raised KeyError everywhere else.
hub_user = os.environ.get('JUPYTERHUB_USER')
if hub_user:
    conf.set('spark.ui.proxyBase', '/user/' + hub_user + '/proxy/' + str(SPARK_UI_PORT))
conf.set('spark.driver.memory','8g')
conf.set('spark.ui.showConsoleProgress', False)
# SparkSession.builder...getOrCreate() is the supported entry point
# (SQLContext.getOrCreate is deprecated since Spark 3.0) and, unlike calling
# SparkContext(conf=conf) directly, it reuses a running context, so this cell
# can be re-run without restarting the kernel.
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/26 12:51:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### First Dataset

In [67]:
# Read the tweet CSV (first line is a header; a record may span several lines)
# and keep only the two columns used in this notebook.
twitter_data = (
    spark.read
    .option('header', True)
    .option('multiLine', True)
    .csv('./data/Twitter_Data.csv')
    .select('clean_text', 'category')
)

In [68]:
# Number of rows as loaded (this runs before the NULL filter further down).
twitter_data.count()

107760

In [69]:
# Inspect the column types, then preview the first 10 rows.
twitter_data.printSchema()
twitter_data.show(10)

root
 |-- clean_text: string (nullable = true)
 |-- category: string (nullable = true)

+--------------------+--------+
|          clean_text|category|
+--------------------+--------+
|when modi promise...|    -1.0|
|what did just say...|     1.0|
|asking his suppor...|     1.0|
|answer who among ...|     1.0|
|with upcoming ele...|     1.0|
|gandhi was gay do...|     1.0|
|things like demon...|     1.0|
|hope tuthukudi pe...|     1.0|
|calm waters where...|     1.0|
|vote such party a...|    -1.0|
+--------------------+--------+
only showing top 10 rows



In [70]:
# Rename the tweet column from `clean_text` to `text`.
twitter_data = twitter_data.withColumnRenamed(existing='clean_text', new='text')

Dropping rows where `text` or `category` is NULL

In [71]:
# Keep a row only when both the label and the tweet text are present.
twitter_data = twitter_data.where(
    F.col('category').isNotNull() & F.col('text').isNotNull()
)

The `category` column takes 2 values (read as strings):

`-1.0` $\Rightarrow$ Negative sentiment <br>
`1.0` $\Rightarrow$ Positive sentiment

In [72]:
# Show the distinct label values present in `category`.
twitter_data.select('category').distinct().show()

+--------+
|category|
+--------+
|     1.0|
|    -1.0|
+--------+



### Second Dataset

In [56]:
# Explicit schema for the Sentiment140 CSV: six columns, in file order, each
# read as a nullable string.
sentiment140Schema = T.StructType([
    T.StructField(column_name, T.StringType(), True)
    for column_name in ("target", "id", "date", "flag", "user", "text")
])
# Load with that schema and keep only the tweet text and its label.
sentiment140_data = (
    spark.read
    .schema(sentiment140Schema)
    .csv('./data/sentiment140.csv')
    .select('text', 'target')
)

In [57]:
# Confirm the selected columns and their types.
sentiment140_data.printSchema()
sentiment140_data.columns

root
 |-- text: string (nullable = true)
 |-- target: string (nullable = true)



['text', 'target']

The `target` column takes 2 values (read as strings):

`0` $\Rightarrow$ Negative sentiment <br>
`4` $\Rightarrow$ Positive sentiment

In [58]:
# Show the distinct label values present in `target`.
sentiment140_data.select('target').distinct().show()

+------+
|target|
+------+
|     0|
|     4|
+------+

