In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install afinn
!pip install pyspark

Collecting afinn
[?25l  Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)
[K     |██████▎                         | 10kB 14.5MB/s eta 0:00:01[K     |████████████▌                   | 20kB 12.8MB/s eta 0:00:01[K     |██████████████████▊             | 30kB 9.3MB/s eta 0:00:01[K     |█████████████████████████       | 40kB 7.2MB/s eta 0:00:01[K     |███████████████████████████████▏| 51kB 4.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.9MB/s 
[?25hBuilding wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25l[?25hdone
  Created wheel for afinn: filename=afinn-0.1-cp37-none-any.whl size=53451 sha256=4162b4bca95b37c7d33b8184a9054af7901dce78dcba202c68c9fc4680b28925
  Stored in directory: /root/.cache/pip/wheels/b5/1c/de/428301f3333ca509dcf20ff358690eb23a1388fbcbbde008b2
Successfully built afinn
Installing collected packages: afinn
Succe

In [None]:
from pyspark.sql import SparkSession as ss
from pyspark.sql.functions import udf
from pyspark.ml.feature import MaxAbsScaler
# from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import FloatType,StringType
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn
from textblob import TextBlob
import nltk
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = ss.builder.getOrCreate()



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Cache of lexicon scorers, keyed by a short name.  The UDFs below reuse one
# scorer instance instead of constructing a new analyzer for every row.
_LEXICON_SCORERS = {}


def _get_scorer(name, factory):
    """Return the scorer cached under ``name``, building it with ``factory`` on first use."""
    scorer = _LEXICON_SCORERS.get(name)
    if scorer is None:
        scorer = _LEXICON_SCORERS[name] = factory()
    return scorer


class lex_anal:
    """Lexicon-based sentiment scoring of a Spark DataFrame.

    Adds VADER, AFINN and TextBlob polarity scores, rescales the AFINN score
    with ``MaxAbsScaler``, derives a five-level class label for each score and
    writes the result as CSV.

    The input frame must contain the string columns ``pre_text_vader`` and
    ``pre_text_all_upd``.
    """

    def __init__(self):
        # self.f = file
        self.path_read = '/content/drive/My Drive/Preprocessed/New/'
        self.path_write = '/content/drive/My Drive/LATEST'
        # self.df = spark.read.option("header","true").csv(self.path_read+self.f,inferSchema = True)

    def process(self, df):
        """Run the full pipeline on ``df``: score, scale, classify, save.

        Scaling runs before classification, so ``afinn_class`` is derived
        from the rescaled AFINN score.

        Returns:
            The scored and classified DataFrame that was written to disk.
        """
        df = self.process_score(df)
        df = self.scale(df)
        df = self.process_class(df)
        self.save(df)
        return df

    @staticmethod
    @udf(returnType=FloatType())
    def vader_pol(text):
        """VADER compound polarity of ``text``; null text is scored 0.0."""
        # Null text would make the analyzer raise; score it as neutral so the
        # scaling step downstream never receives a null.
        # NOTE(review): presumably nulls here are empty texts read back from CSV - confirm.
        if text is None:
            return 0.0
        vader = _get_scorer('vader', SentimentIntensityAnalyzer)
        return float(vader.polarity_scores(text)['compound'])

    @staticmethod
    @udf(returnType=FloatType())
    def afinn_pol(text):
        """AFINN score of ``text``; null text is scored 0.0."""
        if text is None:
            return 0.0
        af = _get_scorer('afinn', Afinn)
        # float() guards the FloatType contract: a Python int would become null.
        return float(af.score(text))

    @staticmethod
    @udf(returnType=FloatType())
    def blob_pol(text):
        """TextBlob polarity of ``text``; null text is scored 0.0."""
        if text is None:
            return 0.0
        return float(TextBlob(text).polarity)

    def process_score(self, df):
        """Add ``vader_score``, ``afinn_score`` and ``blob_score`` columns to ``df``."""
        df = df.withColumn('vader_score', lex_anal.vader_pol('pre_text_vader'))
        df = df.withColumn('afinn_score', lex_anal.afinn_pol('pre_text_all_upd'))
        df = df.withColumn('blob_score', lex_anal.blob_pol('pre_text_all_upd'))
        return df

    def process_class(self, df):
        """Add a ``*_class`` label column for each of the three score columns."""
        df = df.withColumn('vader_class', lex_anal.classify('vader_score'))
        df = df.withColumn('afinn_class', lex_anal.classify('afinn_score'))
        df = df.withColumn('blob_class', lex_anal.classify('blob_score'))
        return df

    def save(self, df):
        """Write ``df`` as headered CSV to ``<path_write>/covid_dataset``, replacing any existing output."""
        # path_write has no trailing slash, so plain concatenation produced
        # '.../LATESTcovid_dataset'; join the two parts with an explicit separator.
        target = self.path_write.rstrip('/') + '/covid_dataset'
        df.write.mode("overwrite").option("header", "true").csv(target)

    def scale(self, df):
        """Replace ``afinn_score`` with its MaxAbsScaler-rescaled value.

        MaxAbsScaler works on vector columns, so the score is assembled into
        a one-element vector, scaled, and unpacked back into a float column
        that keeps the name ``afinn_score``.
        """
        columns_to_scale = ["afinn_score"]
        assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
        scalers = [MaxAbsScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
        pipeline = Pipeline(stages=assemblers + scalers)
        scalerModel = pipeline.fit(df)
        scaledData = scalerModel.transform(df)
        # Pull the single element back out of the scaled one-element vector.
        unlist = udf(lambda x: float(list(x)[0]), FloatType())
        scaledData = scaledData.withColumn('afinn_score_scaled_f', unlist('afinn_score_scaled'))
        # Drop the helper columns and the raw score, then give the scaled
        # float column the original name.
        scaledData = scaledData.drop('afinn_score_scaled', 'afinn_score_vec', 'afinn_score')
        scaledData = scaledData.withColumnRenamed('afinn_score_scaled_f', 'afinn_score')
        return scaledData

    @staticmethod
    @udf(returnType=StringType())
    def classify(score):
        """Map a polarity score to 'VPos', 'Pos', 'Neu', 'Neg' or 'VNeg'.

        Scores above 0.5 are 'VPos', (0, 0.5] is 'Pos', [-0.5, 0) is 'Neg',
        below -0.5 is 'VNeg'; anything else (0, NaN) is 'Neu'.  A null score
        yields a null label instead of raising.
        """
        if score is None:
            return None
        if score > 0.5:
            return 'VPos'
        if score > 0:
            return 'Pos'
        if score < -0.5:
            return 'VNeg'
        if score < 0:
            return 'Neg'
        return 'Neu'



In [None]:
# Score, scale, classify and save the Spark DataFrame `df_sp`.
# NOTE(review): `df_sp` is only assigned in a cell further down this notebook, so on a
# fresh kernel this cell fails unless that cell has been run first.
l = lex_anal()
l.process(df_sp)

In [None]:
# Load the combined preprocessed text CSV into a pandas DataFrame.
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Preprocessed/New/total_combined_text_and_ids_preprocessed.csv')

In [None]:
# Display the Spark DataFrame's column names and types.
df_sp

DataFrame[text: string, pre_text_vader: string, pre_text_all_upd: string]

In [None]:
# Import only the names this cell needs rather than the whole module namespace.
from pyspark.sql.types import StructType, StructField, StringType

# All three text columns are nullable strings.
mySchema = StructType([
    StructField("text", StringType(), True),
    StructField("pre_text_vader", StringType(), True),
    StructField("pre_text_all_upd", StringType(), True),
])
# pandas marks missing cells as float NaN, which a StringType field rejects;
# replace them with None so they arrive in Spark as nulls.
df_sp = spark.createDataFrame(df.astype(object).where(df.notna(), None), schema=mySchema)

In [None]:
# Write df_sp as a headered Spark CSV folder.  mode("overwrite") lets the cell be
# re-run once the folder exists (the default mode raises), matching lex_anal.save.
df_sp.write.mode("overwrite").option("header","true").csv('/content/drive/My Drive/Preprocessed/New/total_combined_text_and_ids_preprocessed')

In [None]:
# Read the scored "Sentiment Labelled Sentences" CSV folder into a Spark DataFrame,
# letting Spark infer the column types.
# NOTE(review): this rebinds `df`, which an earlier cell assigned to a pandas DataFrame.
spark = ss.builder.getOrCreate()
df = spark.read.option("header","true").csv('/content/drive/My Drive/Preprocessed/New/lex_sen/Sentiment_Labelled_Sentences_Data_Set_cleaned_preprocessed',inferSchema = True)

In [None]:
# Load one part file of the same dataset folder directly with pandas.
import pandas as pd
df2 = pd.read_csv('/content/drive/My Drive/Preprocessed/New/lex_sen/Sentiment_Labelled_Sentences_Data_Set_cleaned_preprocessed/part-00000-0b519276-7e50-48c0-b6f1-d16eb8b369cb-c000.csv')

In [None]:
# Keep only the three score columns.
df1 = df.select("vader_score","afinn_score","blob_score")

In [None]:
def _to_float(value):
    """Return ``value`` as a float, taking the first element when it is vector-like.

    Nulls pass through as None.  Plain numbers are converted directly:
    calling ``list()`` on a scalar raises TypeError, and the score columns
    read with inferSchema are scalar doubles rather than vectors.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    return float(list(value)[0])


unlist = udf(_to_float, FloatType())
df1 = df1.withColumn("vader_score",unlist("vader_score"))
df1 = df1.withColumn("afinn_score",unlist("afinn_score"))
df1 = df1.withColumn("blob_score",unlist("blob_score"))

In [None]:
# Scale afinn_score with MaxAbsScaler: assemble each listed column into a
# one-element vector, then scale that vector column.
columns_to_scale = ["afinn_score"]
assemblers = [
    VectorAssembler(inputCols=[name], outputCol=name + "_vec")
    for name in columns_to_scale
]
scalers = [
    MaxAbsScaler(inputCol=name + "_vec", outputCol=name + "_scaled")
    for name in columns_to_scale
]
pipeline = Pipeline(stages=[*assemblers, *scalers])
# Fit on df and apply the fitted stages to the same frame.
scalerModel = pipeline.fit(df)
scaledData = scalerModel.transform(df)

In [None]:
# Preview the frame without the VADER columns; the result is not assigned back to scaledData.
scaledData.drop('vader_score','vader_class')

DataFrame[text: string, pre_text_vader: string, pre_text_all_upd: string, afinn_score: double, afinn_class: string, blob_score: double, blob_class: string, afinn_score_vec: vector, afinn_score_scaled: vector]

In [None]:
import os

In [None]:
# Make the preprocessed-data folder the working directory (the os.listdir() call in the next cell takes no path).
os.chdir('/content/drive/My Drive/Preprocessed/New/')

In [None]:
# Entries of the current working directory whose names end in 'preprocessed'.
files = list(filter(lambda name: name.endswith('preprocessed'), os.listdir()))

In [None]:
# Display the matching names.
files

['sentiment140_cleaned_csv_preprocessed',
 'Sentiment_Analysis_in_Text_cleaned.csv_preprocessed',
 'IMDB_cleaned.csv_preprocessed',
 'The_Valence_and_Arousal_Facebook_Posts_cleaned_preprocessed',
 'Primary_Emotions_of_Statements_cleaned_preprocessed',
 'Sentiment_Emotion_Mining_Toolkit_(EMTk)_cleaned_preprocessed',
 'Brands_and_Product_Emotions_cleaned.csv_preprocessed',
 'WASSA_2017_Shared_Task_on_Emotion_Intensity_cleaned.csv_preprocessed',
 'EmoBank_cleaned.csv_preprocessed',
 'Sentiment_Labelled_Sentences_Data_Set_cleaned_preprocessed',
 'SMILE_Twitter_Emotion_dataset_cleaned_preprocessed',
 'Affect_data_cleaned.csv_preprocessed',
 'total_combined_text_and_ids_preprocessed']

In [None]:
# Scratch check: a negative decimal held as a string.
s = '-0.5'

In [None]:
# Parse the string into a float.
f = float(s)

In [None]:
# Display the parsed value.
f

-0.5

In [None]:
# Largest AFINN score in the part file.  Series.max() skips NaN, whereas the
# builtin max() gives order-dependent results when NaN is present.
df2['afinn_score'].max()

15.0