In [12]:
import config
import pyspark
import vaderSentiment.vaderSentiment

# Creating Spark session

In [13]:
# Reuse the active SparkSession if one already exists; otherwise build a new one with default settings.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Fetching Data

In [23]:
# Connection string assembled from the credentials and endpoint held in config.MongoDb.
# NOTE(review): user/password are interpolated verbatim -- confirm they are already
# percent-encoded if they can contain reserved URI characters such as '@', ':' or '/'.
mongo_read_uri = f'mongodb://{config.MongoDb.user}:{config.MongoDb.password}@{config.MongoDb.host}:{config.MongoDb.port}'

# Configure the MongoDB reader one option at a time: database, source collection, connection URI.
mongo_reader = spark.read.format('mongodb')
mongo_reader = mongo_reader.option('database', f'{config.MongoDb.database}')
mongo_reader = mongo_reader.option('collection', f'{config.MongoDb.readCollection}')
mongo_reader = mongo_reader.option('connection.uri', mongo_read_uri)

# Load the source collection as a DataFrame.
df = mongo_reader.load()

# Preview the first 10 rows, truncating each cell to 100 characters.
df.show(10, truncate=100)

+------------------------+-------------------+--------------------------------------------------------------------------------+---------------+
|                     _id|          createdAt|                                                                            text|       userName|
+------------------------+-------------------+--------------------------------------------------------------------------------+---------------+
|664a236ca9497c29d7109e92|2009-04-06 22:19:45|       user http - awww , 's bummer . you shoulda got david carr third day . ; d|_TheSpecialOne_|
|664a236ca9497c29d7109e93|2009-04-06 22:19:49|upset ca n't updat facebook text ... might cri result school today also . blah !|  scotthamilton|
|664a236ca9497c29d7109e94|2009-04-06 22:19:53|                  user i dive mani time ball . manag save 50 % the rest go bound|       mattycus|
|664a236ca9497c29d7109e95|2009-04-06 22:19:57|                                                 whole bodi feel itchi like fire|        E

# Sentiment extraction

## Using the pretrained VADER sentiment model

In [None]:
# Create a single VADER analyzer at notebook scope; the `sentiment` UDF calls its polarity_scores().
sentimentIntensityAnalyzer = vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer()

## Defining a PySpark UDF to perform sentiment extraction

In [None]:
@pyspark.sql.functions.udf(
    returnType=pyspark.sql.types.MapType(
        pyspark.sql.types.StringType(),
        pyspark.sql.types.FloatType()
    )
)
def sentiment(text2):
    """Score one piece of text with the notebook-level VADER analyzer.

    Args:
        text2: The text to score, or None when the source column is null.

    Returns:
        The mapping produced by ``polarity_scores`` (keys ``neg``, ``neu``,
        ``pos`` and ``compound``), or None when ``text2`` is None so that the
        resulting map column is null for that row.
    """
    # The `text` column is nullable; polarity_scores iterates over its input
    # and would raise on None, failing the whole Spark job for one bad row.
    if text2 is None:
        return None
    return sentimentIntensityAnalyzer.polarity_scores(text2)

# Performing sentiment extraction

In [25]:
# Append the VADER scores as a map column named 'sentiment'.
# withColumn's first argument IS the output column name, so passing '*' would
# create a column literally called '*' (the .alias() is ignored) and the later
# col('sentiment') / drop('sentiment') steps would not find their input.
# Naming the column directly also makes this cell safe to re-run: an existing
# 'sentiment' column is replaced instead of duplicated.
df = df.withColumn(
    'sentiment',
    sentiment('text')
)

# Preview the first 10 rows, truncating each cell to 100 characters.
df.show(10, truncate=100)

+------------------------+-------------------+--------------------------------------------------------------------------------+---------------+-------------------------------------------------------------+
|                     _id|          createdAt|                                                                            text|       userName|                                                    sentiment|
+------------------------+-------------------+--------------------------------------------------------------------------------+---------------+-------------------------------------------------------------+
|664a236ca9497c29d7109e92|2009-04-06 22:19:45|       user http - awww , 's bummer . you shoulda got david carr third day . ; d|_TheSpecialOne_|{neg -> 0.133, pos -> 0.0, compound -> -0.3818, neu -> 0.867}|
|664a236ca9497c29d7109e93|2009-04-06 22:19:49|upset ca n't updat facebook text ... might cri result school today also . blah !|  scotthamilton|{neg -> 0.235, pos -> 0.0, compou

# Analyzing the resulting data schema

In [26]:
# Print the DataFrame's columns with their types and nullability as a tree.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- createdAt: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- userName: string (nullable = true)
 |-- sentiment: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)



# Transforming schema into a flattened form

In [27]:
# (map key, output column name) pairs, in the order the new columns are appended.
score_columns = [
    ('neg', 'negative'),
    ('neu', 'neutral'),
    ('pos', 'positive'),
    ('compound', 'compound'),
]

# Pull each score out of the 'sentiment' map into its own top-level column,
# keeping every existing column in front.
sentiment_map = pyspark.sql.functions.col('sentiment')
df = df.select(
    '*',
    *[sentiment_map.getItem(map_key).alias(column_name) for map_key, column_name in score_columns]
)

# Preview the first 10 rows with the default cell truncation.
df.show(10)

+--------------------+-------------------+--------------------+---------------+--------------------+--------+-------+--------+--------+
|                 _id|          createdAt|                text|       userName|           sentiment|negative|neutral|positive|compound|
+--------------------+-------------------+--------------------+---------------+--------------------+--------+-------+--------+--------+
|664a236ca9497c29d...|2009-04-06 22:19:45|user http - awww ...|_TheSpecialOne_|{neg -> 0.133, po...|   0.133|  0.867|     0.0| -0.3818|
|664a236ca9497c29d...|2009-04-06 22:19:49|upset ca n't upda...|  scotthamilton|{neg -> 0.235, po...|   0.235|  0.765|     0.0| -0.5093|
|664a236ca9497c29d...|2009-04-06 22:19:53|user i dive mani ...|       mattycus|{neg -> 0.0, pos ...|     0.0|  0.814|   0.186|  0.4939|
|664a236ca9497c29d...|2009-04-06 22:19:57|whole bodi feel i...|        ElleCTF|{neg -> 0.27, pos...|    0.27|  0.449|   0.281|  0.0258|
|664a236ca9497c29d...|2009-04-06 22:19:57|user ,

In [28]:
# Print the schema again to confirm the four flattened score columns are present.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- createdAt: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- userName: string (nullable = true)
 |-- sentiment: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)
 |-- negative: float (nullable = true)
 |-- neutral: float (nullable = true)
 |-- positive: float (nullable = true)
 |-- compound: float (nullable = true)



# Dropping irrelevant features

In [30]:
# Columns not carried into the output: the Mongo document id, the raw text,
# and the map column whose values now live in their own columns.
columns_to_drop = ['_id', 'text', 'sentiment']
df = df.drop(*columns_to_drop)

# Show the remaining columns and their types.
df.printSchema()

root
 |-- createdAt: timestamp (nullable = true)
 |-- userName: string (nullable = true)
 |-- negative: float (nullable = true)
 |-- neutral: float (nullable = true)
 |-- positive: float (nullable = true)
 |-- compound: float (nullable = true)



# Writing processed sentiment scores to MongoDB

In [9]:
# Connection string assembled from the credentials and endpoint held in config.MongoDb.
# NOTE(review): user/password are interpolated verbatim -- confirm they are already
# percent-encoded if they can contain reserved URI characters such as '@', ':' or '/'.
mongo_write_uri = f'mongodb://{config.MongoDb.user}:{config.MongoDb.password}@{config.MongoDb.host}:{config.MongoDb.port}'

# Configure the MongoDB writer one option at a time: database, target collection, connection URI.
mongo_writer = df.write.format('mongodb')
mongo_writer = mongo_writer.option('database', f'{config.MongoDb.database}')
mongo_writer = mongo_writer.option('collection', f'{config.MongoDb.writeCollection}')
mongo_writer = mongo_writer.option('connection.uri', mongo_write_uri)

# Save with mode 'overwrite'.
# NOTE(review): presumably this replaces whatever the target collection already holds -- confirm.
mongo_writer.mode('overwrite').save()

                                                                                