<a href="https://colab.research.google.com/github/harsh-ux/PRML-project/blob/main/Emotional_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries and Dependencies

In [1]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc, confusion_matrix, roc_auc_score, recall_score, precision_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud, STOPWORDS
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm_notebook as tqdm
from tqdm import trange
from sklearn.preprocessing import Normalizer
import nltk
# Fetch NLTK's 'punkt' resource up front (presumably required by word_tokenize, imported above -- confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Analysing Emotions

In [13]:
from google.colab import drive

# Where Google Drive gets mounted, and the preprocessed dataset stored on it.
mount_point = '/content/drive'
dataset_csv = '/content/drive/MyDrive/ML/datasets/preprocessed_data.csv'

# force_remount=True re-mounts the drive even when it is already mounted.
drive.mount(mount_point, force_remount=True)
data = pd.read_csv(dataset_csv)

Mounted at /content/drive


In [2]:
# Create the nrcdata/ directory (-p: no error if it already exists), then download
# the NRC word-level emotion lexicon (v0.92) into it. -P sets the directory the
# file is saved under; -nc is wget's "no clobber" flag, so an existing copy is kept.
!mkdir -p nrcdata
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/upshot-trump-emolex/data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt -P nrcdata

--2021-05-15 06:19:11--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/upshot-trump-emolex/data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2581050 (2.5M) [text/plain]
Saving to: ‘nrcdata/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt’


2021-05-15 06:19:12 (5.41 MB/s) - ‘nrcdata/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt’ saved [2581050/2581050]



In [3]:
# Load the tab-separated lexicon in long form: one (word, emotion, association) row per line.
filepath = "/content/nrcdata/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = pd.read_csv(
    filepath,
    sep='\t',
    names=["word", "emotion", "association"],
    skiprows=45,  # skips the first 45 lines -- presumably the file's preamble; confirm
    keep_default_na=False,  # keep entries such as "NA"/"null" as text instead of NaN
)
emolex_df.head(12)

Unnamed: 0,word,emotion,association
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
5,aback,negative,0
6,aback,positive,0
7,aback,sadness,0
8,aback,surprise,0
9,aback,trust,0


In [4]:
# Distinct emotion labels present in the lexicon.
emolex_df['emotion'].unique()

array(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
       'positive', 'sadness', 'surprise', 'trust'], dtype=object)

In [5]:
# Number of lexicon rows per emotion label.
emolex_df['emotion'].value_counts()

fear            14182
joy             14182
negative        14182
anticipation    14182
trust           14182
anger           14182
disgust         14182
positive        14182
surprise        14182
sadness         14182
Name: emotion, dtype: int64

In [6]:
# Per emotion, count only the rows whose association flag equals 1.
emolex_df.loc[emolex_df['association'] == 1, 'emotion'].value_counts()

negative        3324
positive        2312
fear            1476
anger           1247
trust           1231
sadness         1191
disgust         1058
anticipation     839
joy              689
surprise         534
Name: emotion, dtype: int64

In [7]:
# Reshape long -> wide: one row per word, one column per emotion holding its association value.
emolex_words = (
    emolex_df
    .pivot(index='word', columns='emotion', values='association')
    .reset_index()
)
emolex_words.head()

emotion,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,0,0,1,0,1,0,1,0,0
3,abandoned,1,0,0,1,0,1,0,1,0,0
4,abandonment,1,0,0,1,0,1,0,1,1,0


In [8]:
def extract_review_emotion(df, column, lexicon=None, tokenizer=None, show_progress=True):
    """Append per-emotion lexicon scores for the text in ``df[column]``.

    Every document is tokenized; each token that appears in the lexicon's
    ``word`` column adds that word's association values to the document's
    per-emotion totals. The totals are returned as extra columns next to the
    original data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str
        Name of the column that holds the text to score.
    lexicon : pandas.DataFrame, optional
        Wide lexicon with a ``word`` column plus one numeric column per
        emotion. Defaults to the notebook-level ``emolex_words``.
    tokenizer : callable, optional
        Maps a string to an iterable of tokens. Defaults to ``word_tokenize``.
    show_progress : bool, default True
        Whether to display a tqdm progress bar while scoring.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus one column per emotion, aligned on ``df.index``.
        Entries of ``df[column]`` that are not strings (e.g. NaN) score zero.

    Raises
    ------
    KeyError
        If ``column`` is missing from ``df`` or the lexicon has no ``word`` column.
    """
    if lexicon is None:
        lexicon = emolex_words
    if tokenizer is None:
        tokenizer = word_tokenize

    emotions = lexicon.columns.drop('word')
    texts = df[column]

    # Build the word -> score-row mapping once. A dict lookup per token replaces
    # a full boolean scan of the lexicon per token.
    scores = lexicon[emotions].to_numpy()
    word_scores = dict(zip(lexicon['word'], scores))

    # Accumulate by row position in a plain array: plain numeric adds, and no
    # dependence on the labels in df.index being unique.
    totals = np.zeros((len(df), len(emotions)), dtype=scores.dtype)

    pbar = tqdm(total=len(df)) if show_progress else None
    try:
        for pos, text in enumerate(texts):
            if pbar is not None:
                pbar.update(1)
            if not isinstance(text, str):
                # Missing / non-text entries cannot be tokenized; leave zeros.
                continue
            for word in tokenizer(text):
                word_row = word_scores.get(word)
                if word_row is not None:
                    totals[pos] += word_row
    finally:
        if pbar is not None:
            pbar.close()

    emo_df = pd.DataFrame(totals, index=df.index, columns=emotions)
    return pd.concat([df, emo_df], axis=1)

In [None]:
def extract_review_emotion_edit(df, column, lexicon=None, show_progress=True):
    """Append per-emotion lexicon scores, tokenizing on whitespace.

    Each document in ``df[column]`` is split with ``str.split()``; every token
    that appears in the lexicon's ``word`` column adds that word's association
    values to the document's per-emotion totals. The totals are returned as
    extra columns next to the original data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str
        Name of the column that holds the text to score.
    lexicon : pandas.DataFrame, optional
        Wide lexicon with a ``word`` column plus one numeric column per
        emotion. Defaults to the notebook-level ``emolex_words``.
    show_progress : bool, default True
        Whether to display a tqdm progress bar while scoring.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus one column per emotion, aligned on ``df.index``.
        Entries of ``df[column]`` that are not strings (e.g. NaN) score zero.

    Raises
    ------
    KeyError
        If ``column`` is missing from ``df`` or the lexicon has no ``word`` column.
    """
    if lexicon is None:
        lexicon = emolex_words

    emotions = lexicon.columns.drop('word')
    # Read the text from the requested column rather than a fixed column name.
    texts = df[column]

    # Build the word -> score-row mapping once. A dict lookup per token replaces
    # a full boolean scan of the lexicon per token.
    scores = lexicon[emotions].to_numpy()
    word_scores = dict(zip(lexicon['word'], scores))

    # Accumulate by row position in a plain array: plain numeric adds, and no
    # dependence on the labels in df.index being unique.
    totals = np.zeros((len(df), len(emotions)), dtype=scores.dtype)

    pbar = tqdm(total=len(df)) if show_progress else None
    try:
        for pos, text in enumerate(texts):
            if pbar is not None:
                pbar.update(1)
            if not isinstance(text, str):
                # Missing / non-text entries cannot be split; leave zeros.
                continue
            for word in text.split():
                word_row = word_scores.get(word)
                if word_row is not None:
                    totals[pos] += word_row
    finally:
        if pbar is not None:
            pbar.close()

    emo_df = pd.DataFrame(totals, index=df.index, columns=emotions)
    return pd.concat([df, emo_df], axis=1)

In [None]:
# Score every cleaned review against the emotion lexicon.
emotion_df = extract_review_emotion(df=data, column='review_clean')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=156432.0), HTML(value='')))

In [10]:
# %pip installs into the running kernel's environment. The version is pinned to
# the release this notebook was originally run with, so a re-run is reproducible.
%pip install -q NRCLex==3.0.0

Collecting NRCLex
[?25l  Downloading https://files.pythonhosted.org/packages/41/1c/0097ee39d456c8a92b2eb5dfd59f581a09a6bafede184a058fb0f19bb6ea/NRCLex-3.0.0.tar.gz (396kB)
[K     |▉                               | 10kB 12.6MB/s eta 0:00:01[K     |█▋                              | 20kB 18.1MB/s eta 0:00:01[K     |██▌                             | 30kB 15.8MB/s eta 0:00:01[K     |███▎                            | 40kB 11.5MB/s eta 0:00:01[K     |████▏                           | 51kB 7.8MB/s eta 0:00:01[K     |█████                           | 61kB 6.9MB/s eta 0:00:01[K     |█████▉                          | 71kB 7.3MB/s eta 0:00:01[K     |██████▋                         | 81kB 7.7MB/s eta 0:00:01[K     |███████▍                        | 92kB 8.0MB/s eta 0:00:01[K     |████████▎                       | 102kB 7.2MB/s eta 0:00:01[K     |█████████                       | 112kB 7.2MB/s eta 0:00:01[K     |██████████                      | 122kB 7.2MB/s eta 0:00:01[K

In [None]:
from nrclex import NRCLex

# Sample inputs to run through NRCLex
text = ['i']

# For each sample, build an NRCLex object and print its top_emotions
for i, sample in enumerate(text):
    emotion = NRCLex(sample)
    print('\n\n', sample, ': ', emotion.top_emotions)



In [None]:
# Save the scored reviews to CSV without writing the index column.
emotion_df.to_csv(path_or_buf='emotion_sentiment.csv', index=False)