# **Task 2**

In [None]:
import heapq
import json
import os

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [None]:
# Mount Google Drive at /content/drive; every data file used by this notebook
# (corpus, labels, emotion JSON files, generated sentences) lives under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the training corpus that BigramLM reads line by line.
# Change this path to match your own Drive layout.
data_path = "/content/drive/MyDrive/Colab_data/NLP_AS1/corpus.txt"

In [None]:
from transformers import pipeline

# Pre-trained emotion classifier used to label sentences.
# NOTE(review): return_all_scores=True is presumably what makes the pipeline return a
# score for every label instead of only the best one; emotion_scores/get_emotion below
# rely on receiving a list of {"label", "score"} entries -- confirm against the
# installed transformers version before changing this flag.
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample):
    """Run the emotion classifier on ``sample`` and return its first result.

    The first result is what ``get_emotion`` iterates over, reading the
    "label" and "score" keys of each entry.
    """
    results = classifier(sample)
    first_result = results[0]
    return first_result

# Function to get the emotion corresponding to the highest probability for a given sentence.
def get_emotion(sentence):
    """Return the label whose score is highest for ``sentence``.

    Args:
        sentence: text passed unchanged to ``emotion_scores``.

    Returns:
        The "label" value of the entry with the largest "score". When several
        entries share the largest score, the first one wins.

    Raises:
        ValueError: if ``emotion_scores`` returns no entries, so there is
            nothing to choose from (previously this surfaced as an
            UnboundLocalError on the result variable).
    """
    scores = emotion_scores(sentence)
    if not scores:
        raise ValueError("emotion classifier returned no scores for the given sentence")
    # max() keeps the first maximal entry, matching a strict ">" scan.
    best = max(scores, key=lambda entry: entry["score"])
    return best["label"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [None]:
class BigramLM:
    """Bigram language model with optional smoothing and emotion-steered generation.

    Every corpus line is treated as one sentence; the token "." marks both the
    start and the end of a sentence, so ``prob["."]`` holds the distribution of
    sentence-initial words and drawing "." ends a generated sentence.

    Attributes:
        delta: discount used by Kneser-Ney smoothing.
        prob: ``{w1: {w2: value}}``; bigram counts while ``learn_model`` runs,
            bigram probabilities once it has finished.
        emo_freq: ``{emotion: {w1: {w2: count}}}`` bigram counts per emotion.
        emo_prob: same layout, row-normalised ``emo_freq``.
        beta_prob: same layout, row-normalised ``prob + weight * emo_prob``.
        smoothing_type: 0 => none, 1 => Laplace, 2 => Kneser-Ney.
        unique_words: vocabulary in first-seen order, starting with ".".
        no_of_unique_words: length of ``unique_words``.
        string_data: raw corpus lines as returned by ``readlines()``.
    """

    # Emotion labels used as top-level keys of the three emotion tables.
    EMOTIONS = ("sadness", "joy", "love", "anger", "fear", "surprise")

    # Default Drive locations used when no explicit path is passed.
    DEFAULT_EMO_PROB_PATH = "/content/drive/MyDrive/Colab_data/NLP_AS1/emotion_prob.json"
    DEFAULT_EMO_FREQ_PATH = "/content/drive/MyDrive/Colab_data/NLP_AS1/emotion.json"

    def __init__(self, data_path, st):
        """Read the corpus and set up empty tables.

        Args:
            data_path: path of a text file with one sentence per line.
            st: smoothing type (0 none, 1 Laplace, 2 Kneser-Ney).
        """
        self.delta = 0.5
        self.prob = {}
        self.beta_prob = {emotion: {} for emotion in self.EMOTIONS}
        self.emo_prob = {emotion: {} for emotion in self.EMOTIONS}
        self.emo_freq = {emotion: {} for emotion in self.EMOTIONS}
        self.smoothing_type = st
        self.unique_words = ["."]
        self.no_of_unique_words = 1
        with open(data_path) as f:
            self.string_data = f.readlines()

    def load_freq_emotion(self, path):
        """Replace ``emo_freq`` with the JSON content of ``path``."""
        with open(path) as json_file:
            self.emo_freq = json.load(json_file)

    def load_prob_emotion(self, path):
        """Replace ``emo_prob`` with the JSON content of ``path``."""
        with open(path) as json_file:
            self.emo_prob = json.load(json_file)

    def load_beta_prob(self, path):
        """Replace ``beta_prob`` with the JSON content of ``path``."""
        with open(path) as json_file:
            self.beta_prob = json.load(json_file)

    def set_prob_emo(self, is_save, save_path=None):
        """Row-normalise ``emo_freq`` into ``emo_prob``.

        Args:
            is_save: when truthy, dump ``emo_prob`` as JSON afterwards.
            save_path: destination of that dump; defaults to
                ``DEFAULT_EMO_PROB_PATH``.
        """
        for emotion, rows in self.emo_freq.items():
            # setdefault: a loaded frequency file may contain an emotion key
            # that was not pre-created in __init__.
            emo_rows = self.emo_prob.setdefault(emotion, {})
            for w1, followers in rows.items():
                dist = emo_rows.setdefault(w1, {})
                total = sum(followers.values())
                for w2, count in followers.items():
                    dist[w2] = count / total
        if is_save:
            target = self.DEFAULT_EMO_PROB_PATH if save_path is None else save_path
            with open(target, "w") as outfile:
                json.dump(self.emo_prob, outfile)

    def set_prob_beta(self, weight=1000):
        """Build ``beta_prob`` as the row-normalised ``prob + weight * emo_prob``.

        Only bigrams whose two words are in ``unique_words`` and that have a
        non-zero value in ``prob`` or in the emotion table are stored.

        Args:
            weight: factor applied to the emotion probability before it is
                added to the base bigram probability.
        """
        # Position of each word in the vocabulary; used to keep the stored
        # bigrams in vocabulary order without scanning every V x V pair.
        vocab_rank = {word: idx for idx, word in enumerate(self.unique_words)}
        for emotion, emo_rows in self.emo_prob.items():
            beta_rows = self.beta_prob.setdefault(emotion, {})
            for w1 in self.unique_words:
                base_row = self.prob.get(w1, {})
                emo_row = emo_rows.get(w1, {})
                # Only followers seen in either table can be non-zero.
                candidates = [w for w in set(base_row) | set(emo_row) if w in vocab_rank]
                candidates.sort(key=vocab_rank.__getitem__)
                for w2 in candidates:
                    pr = base_row.get(w2, 0)
                    emotion_pr = emo_row.get(w2, 0)
                    if (pr == 0) and (emotion_pr == 0):
                        continue
                    beta_rows.setdefault(w1, {})[w2] = pr + (weight * emotion_pr)
        # Normalise every row so it can be used directly as a sampling distribution.
        for rows in self.beta_prob.values():
            for dist in rows.values():
                total = sum(dist.values())
                for w2 in dist:
                    dist[w2] = dist[w2] / total

    def learn_model(self):
        """Count corpus bigrams and turn them into probabilities in ``prob``.

        smoothing_type = [(0 => normal), (1 => Laplace), (2 => Kneser-Ney)]

        Also fills ``unique_words`` / ``no_of_unique_words``. Intended to be
        called once on a fresh instance.
        """
        # Token stream with "." before the first line and after every line.
        tokens = ["."]
        # Set mirror of unique_words so membership tests are O(1).
        seen = set(self.unique_words)
        for line in self.string_data:
            for word in line.split():
                if word not in seen:
                    seen.add(word)
                    self.unique_words.append(word)
                    self.no_of_unique_words += 1
                tokens.append(word)
            tokens.append(".")

        if self.smoothing_type == 1:  # Laplace Smoothing: every pair starts at 1
            for w1 in self.unique_words:
                row = self.prob.setdefault(w1, {})
                for w2 in self.unique_words:
                    row[w2] = 1

        # Accumulate bigram counts.
        for w1, w2 in zip(tokens, tokens[1:]):
            row = self.prob.setdefault(w1, {})
            row[w2] = row.get(w2, 0) + 1

        if self.smoothing_type == 2:    # Kneser-Ney smoothing
            # bigram_types: number of distinct bigrams.
            # preceding_types[w]: number of distinct words seen before w.
            bigram_types = 0
            preceding_types = {}
            for row in self.prob.values():
                for w2 in row:
                    bigram_types += 1
                    preceding_types[w2] = preceding_types.get(w2, 0) + 1
            for w1, row in self.prob.items():
                total = sum(row.values())
                # Back-off weight: discount * (distinct followers of w1) / count(w1).
                # The row total is count(w1); on a freshly trained model it equals
                # the count of w1 as a second word, which was used here before.
                backoff = (self.delta / total) * len(row)
                for w2, count in row.items():
                    continuation = preceding_types[w2] / bigram_types
                    row[w2] = (max(count - self.delta, 0) / total) + (backoff * continuation)
            # NOTE(review): values are only stored for observed bigrams, so a
            # Kneser-Ney row sums to less than 1 and np.random.choice will
            # presumably reject it as `p` in generate_sentence -- confirm
            # before sampling from a Kneser-Ney model.
        else:
            # Plain relative frequencies (Laplace counts already include the +1).
            for row in self.prob.values():
                total = sum(row.values())
                for w2 in row:
                    row[w2] = row[w2] / total

    @staticmethod
    def _sample_sentence(table):
        """Sample one sentence from a ``{w1: {w2: probability}}`` table.

        The first word is drawn from ``table["."]`` and always kept; further
        words are drawn from the row of the previous word until "." is drawn.
        Each kept word is followed by a single space.
        """
        start_row = table["."]
        current = np.random.choice(list(start_row.keys()), p=list(start_row.values()))
        words = [current]
        while True:
            row = table[current]
            current = np.random.choice(list(row.keys()), p=list(row.values()))
            if current == ".":
                break
            words.append(current)
        return "".join(word + " " for word in words)

    def generate_sentence(self):
        """Sample a sentence from the base bigram model ``prob``."""
        return self._sample_sentence(self.prob)

    # Function to learn frequency of bigrams corresponding to each emotion.
    def learn_emotion_freq(self, n, save_path=None, save_every=25000):
        """Generate sentences, classify them and count their bigrams per emotion.

        Each sentence comes from ``generate_sentence`` and is labelled by the
        module-level ``get_emotion``; its bigrams (including the "." markers)
        are added to ``emo_freq[label]``.

        Args:
            n: number of sentences to generate.
            save_every: ``emo_freq`` is dumped as JSON each time this many
                sentences have been processed. Counts gathered after the last
                such checkpoint are kept in memory only.
            save_path: destination of the dump; defaults to
                ``DEFAULT_EMO_FREQ_PATH``.
        """
        target = self.DEFAULT_EMO_FREQ_PATH if save_path is None else save_path
        c = 0
        while c < n:
            c += 1
            sentence = self.generate_sentence()
            tokens = ["."] + sentence.split()
            tokens.append(".")
            emotion = get_emotion(sentence)
            rows = self.emo_freq.setdefault(emotion, {})
            for w1, w2 in zip(tokens, tokens[1:]):
                row = rows.setdefault(w1, {})
                row[w2] = row.get(w2, 0) + 1
            if (c % save_every) == 0:
                # Best-effort removal of the previous checkpoint; only
                # filesystem errors are ignored (not KeyboardInterrupt etc.).
                try:
                    os.remove(target)
                except OSError:
                    pass
                with open(target, "w") as outfile:
                    json.dump(self.emo_freq, outfile)

    def generate_sentence_emo(self, emotion, mode):
        """Sample a sentence for ``emotion``.

        Args:
            emotion: key of the emotion table to sample from.
            mode: 0 samples from ``emo_prob`` (conditional probability); any
                other value samples from ``beta_prob`` (beta component).
        """
        table = self.emo_prob if mode == 0 else self.beta_prob
        return self._sample_sentence(table[emotion])

In [None]:
# No Smoothing (smoothing_type 0): plain relative-frequency bigram probabilities.
BiLM = BigramLM(data_path, 0)
BiLM.learn_model()

In [None]:
# Load the saved emotion frequency file, then derive the per-emotion bigram
# probabilities and the emotion-weighted ("beta") table used for generation.
# set_prob_emo(False) computes the probabilities without writing them to disk.
BiLM.load_freq_emotion("/content/drive/MyDrive/Colab_data/NLP_AS1/emotion.json")
BiLM.set_prob_emo(False)
BiLM.set_prob_beta()

In [None]:
# Kneser-Ney Smoothing (smoothing_type 2)
knLM = BigramLM(data_path, 2)
knLM.learn_model()

In [None]:
# Laplace Smoothing (smoothing_type 1): every vocabulary pair starts with a count of 1.
laplLM = BigramLM(data_path, 1)
laplLM.learn_model()

In [None]:
# Function to get top 5 bigrams
def get_top5_bigrams(lm, k=5):
    """Return the ``k`` highest-probability bigrams of a language model.

    Every (first word, second word) pair in ``lm.prob`` is a candidate, so
    several bigrams sharing the same first word can all be returned.

    Args:
        lm: object exposing ``prob`` as ``{w1: {w2: probability}}``.
        k: number of bigrams to return (default 5).

    Returns:
        A list of ``(w1, w2, probability)`` tuples sorted by decreasing
        probability; ties keep the iteration order of ``lm.prob``. The list is
        shorter than ``k`` when the model has fewer than ``k`` bigrams.
    """
    all_bigrams = (
        (first, second, probability)
        for first, followers in lm.prob.items()
        for second, probability in followers.items()
    )
    # Single pass over the table; no need to materialise or fully sort it.
    return heapq.nlargest(k, all_bigrams, key=lambda bigram: bigram[2])

In [None]:
# Top 5 bigrams of the unsmoothed, Kneser-Ney and Laplace models, in that order.
print(get_top5_bigrams(BiLM))
print(get_top5_bigrams(knLM))
print(get_top5_bigrams(laplLM))

[('href', 'http', 1.0), ('mooshilu', '.', 1.0), ('tychelle', 'to', 1.0), ('hang', 'out', 1.0), ('nonexistent', 'social', 1.0)]
[('href', 'http', 0.9800015575717457), ('don', 't', 0.974589470946997), ('didn', 't', 0.9722438551631339), ('sort', 'of', 0.9710062600640895), ('supposed', 'to', 0.9455888270186779)]
[('.', 'i', 0.2693486590038314), ('i', 'feel', 0.11042412409155006), ('feel', 'like', 0.035092684307343996), ('that', 'i', 0.026501766784452298), ('and', 'i', 0.023100392270812144)]


In [None]:
# Function to get 50 sentences of given emotion
def get_50(emotion):
    """Collect 50 sentences generated for ``emotion`` by the global ``BiLM``.

    Sentences come from the beta-component generator (mode 1); any sentence
    shorter than 15 characters is discarded and a new one is drawn instead.
    """
    collected = []
    while len(collected) != 50:
        candidate = BiLM.generate_sentence_emo(emotion, 1)
        if len(candidate) >= 15:
            collected.append(candidate)
    return collected

In [None]:
# Getting 50 sentences of each emotion.
# Generation is random, so the sentences differ from run to run.
sadness = get_50("sadness")
joy = get_50("joy")
love = get_50("love")
anger = get_50("anger")
fear = get_50("fear")
surprise = get_50("surprise")

In [None]:
# Write the generated sentences of each emotion to its own gen_<emotion>.txt file,
# one sentence per line.

DummyName="gen_"
names_file=["sadness.txt","joy.txt","love.txt","surprise.txt","fear.txt","anger.txt"]

# Emotion name (the file name without its extension) -> generated sentences.
generated_by_emotion = {
    "sadness": sadness,
    "joy": joy,
    "love": love,
    "surprise": surprise,
    "fear": fear,
    "anger": anger,
}

for current_name in names_file:
    file_name = "/content/drive/MyDrive/Colab_data/NLP_AS1/" + DummyName + current_name
    emotion_name = current_name.split('.')[0]
    if emotion_name not in generated_by_emotion:
        # Unknown emotion: nothing is written for this file name.
        continue
    with open(file_name, "w") as outfile:
        for sentence in generated_by_emotion[emotion_name]:
            outfile.write(sentence + "\n")

# **Extrinsic Evaluation**

In [None]:
# creating test sets
# Every generated sentence is paired with the emotion it was generated for.
# All labels carry a trailing "\n" because the training labels are read with
# readlines() in the training cell and therefore keep their newline; a label
# without it (previously "love" and "fear") can never equal a prediction, which
# silently counted every sentence of those two emotions as wrong.
generated_sets = [
    ("sadness", sadness),
    ("joy", joy),
    ("love", love),
    ("anger", anger),
    ("fear", fear),
    ("surprise", surprise),
]
test_x = [sentence for _, sentences in generated_sets for sentence in sentences]
test_y = [label + "\n" for label, sentences in generated_sets for _ in sentences]

In [None]:
# Training
# Labels and corpus are read with readlines(), so each entry keeps its trailing
# newline (the last one only if the file ends with a newline). Labels that are
# later compared with the model's predictions must use the same form.
with open("/content/drive/MyDrive/Colab_data/NLP_AS1/labels.txt") as f:
    train_y = f.readlines()

with open("/content/drive/MyDrive/Colab_data/NLP_AS1/corpus.txt") as f:
    train_x = f.readlines()

# 4 values of C x 4 values of gamma x 3 kernels = 48 SVC candidates.
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
# TF-IDF features feed a grid-searched SVC; verbose=2 prints one line per fit.
# NOTE(review): the vectorizer sits outside GridSearchCV, so it is presumably fitted
# on the whole training set before cross-validation splits it -- confirm this is intended.
model = make_pipeline(TfidfVectorizer(), GridSearchCV(SVC(),param_grid,refit=True,verbose=2))
model.fit(train_x, train_y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.7s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.4s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.4s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.5s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.5s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   1.3s
[CV] END .....................C=0.1, gamma=1, k

In [None]:
# Printing Accuracy
# Extrinsic evaluation: the classifier trained on the corpus predicts the emotion
# of each generated sentence; predictions are compared with the intended emotions.

pred = model.predict(test_x)
print("Accuracy:",accuracy_score(test_y, pred))
print("Macro F1 Score:", f1_score(test_y, pred, average='macro'))

Accuracy: 0.3566666666666667
Macro F1 Score: 0.26092912614838104
