In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import StringIO

from collections import Counter
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint    
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Location of the NRC word-level emotion lexicon (v0.92). It is parsed below as
# headerless, tab-separated rows of (word, emotion, association).
url = 'https://raw.githubusercontent.com/aditeyabaral/lok-sabha-election-twitter-analysis/refs/heads/master/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

# A timeout keeps the notebook from hanging indefinitely on an unresponsive host.
response = requests.get(url, timeout=30)

# Stop immediately on a failed download. Merely printing a message would let the
# cell continue and crash a few lines later with a NameError, because
# `nrc_lexicon` would never have been assigned.
if response.status_code != 200:
    raise RuntimeError(f"Error: Failed to fetch lexicon from URL (Status code {response.status_code})")

lexicon_data = StringIO(response.text)
nrc_lexicon = pd.read_csv(lexicon_data, sep='\t', header=None)

# Name the columns, then keep only the rows whose association flag is 1.
nrc_lexicon.columns = ['word', 'emotion', 'association']
nrc_lexicon = nrc_lexicon[nrc_lexicon['association'] == 1]

In [6]:
# Emotion -> OCEAN trait weights.
# Each lexicon emotion label maps to the personality traits it contributes to,
# keyed by the trait's initial. A weight of 1 adds the emotion's count to the
# trait; a weight of 0 leaves the trait unchanged.
#
#   O - Openness to experience: intellectual curiosity, creativity, and
#       receptiveness to new ideas and experiences.
#   C - Conscientiousness: organization, self-discipline, reliability, and
#       goal-directed behavior.
#   E - Extraversion: being outgoing, sociable, energetic, and drawn to
#       stimulation from the external world.
#   A - Agreeableness: trust, altruism, kindness, and cooperation in social
#       interactions.
#   N - Neuroticism (the inverse of Emotional Stability): the tendency to
#       experience negative emotions such as anxiety, depression, and
#       vulnerability to stress.
emotion_to_ocean = dict(
    anger=dict(N=1, O=0),
    anticipation=dict(O=1, C=1),
    disgust=dict(N=1, O=0),
    fear=dict(N=1, O=0),
    joy=dict(E=1, A=1),
    negative=dict(N=1, O=0),
    positive=dict(E=1, A=1),
    sadness=dict(N=1, O=0),
    surprise=dict(O=1, C=0),
    trust=dict(A=1, C=1),
)

In [8]:
def extract_emotion_features(text, lexicon=None):
    """Count lexicon emotion labels for the words found in ``text``.

    The text is lower-cased and split on whitespace. Punctuation attached to
    either end of a token is removed before lookup, so "morning." or "Okay,"
    still match their lexicon entries; characters inside a token (apostrophes,
    hyphens) are left alone. Every lexicon row whose word matches a token adds
    one count to that row's emotion per occurrence of the token.

    Args:
        text: The message to analyse. Any value that is not a ``str`` (for
            example ``None`` or a NaN read from an empty cell) is treated as
            containing no words.
        lexicon: Optional DataFrame with ``'word'`` and ``'emotion'`` columns.
            Defaults to the module-level ``nrc_lexicon``.

    Returns:
        collections.Counter mapping emotion label -> number of matches.
    """
    import string  # local import: only needed here for the punctuation set

    if lexicon is None:
        lexicon = nrc_lexicon
    if not isinstance(text, str):
        return Counter()

    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    # Tokens made up solely of punctuation strip down to '' and are dropped.
    token_counts = Counter(token for token in stripped if token)

    # Filter the lexicon once per text instead of once per token, then weight
    # each matching row by how often its word occurred.
    matches = lexicon[lexicon['word'].isin(list(token_counts))]
    emotion_counts = Counter()
    for word, emotion in zip(matches['word'], matches['emotion']):
        emotion_counts[emotion] += token_counts[word]
    return emotion_counts

def generate_ocean_scores(emotion_counts, emotion_to_ocean):
    """Turn emotion counts into scores for the five OCEAN traits.

    Args:
        emotion_counts: Mapping of emotion label -> occurrence count.
        emotion_to_ocean: Mapping of emotion label -> {trait: weight}. An
            uppercase trait key ('O', 'C', 'E', 'A', 'N') adds
            ``count * weight`` to that trait. A lowercase key ('o', 'c', ...)
            marks the opposite direction and subtracts ``count * weight``
            from the corresponding uppercase trait.

    Returns:
        dict with exactly the keys 'O', 'C', 'E', 'A', 'N'. Emotions that have
        no entry in ``emotion_to_ocean`` are ignored.

    Raises:
        KeyError: If a trait key is not an OCEAN letter in either case. The
            message names the offending trait and emotion, rather than a
            case-flipped key that never appeared in the mapping.
    """
    ocean_scores = {trait: 0 for trait in 'OCEAN'}
    for emotion, count in emotion_counts.items():
        for trait, weight in emotion_to_ocean.get(emotion, {}).items():
            contribution = count * weight
            if trait in ocean_scores:
                ocean_scores[trait] += contribution
            elif trait.upper() in ocean_scores:
                # Lowercase key: score against the uppercase trait.
                ocean_scores[trait.upper()] -= contribution
            else:
                raise KeyError(
                    f"Unknown OCEAN trait {trait!r} in mapping for emotion {emotion!r}"
                )
    return ocean_scores


In [40]:
# Read the workbook, keep only the ID and Message columns, and drop rows that
# repeat an identical (ID, Message) pair.
ceo_transcript_data = (
    pd.read_excel('../dataset/combined_hvp_numeric.xlsx')
    .loc[:, ['ID', 'Message']]
    .drop_duplicates()
)

In [43]:
# Show the loaded ID / Message frame as the cell's output.
ceo_transcript_data

Unnamed: 0,ID,Message
0,12373,"I can morning. i'm Okay, how are you. that's a..."
1,5197,i'm doing good give me a. Second. yeah I can h...
2,7906,morning. Barely good one are barely. that's al...
3,2064,"How are you. How. A lot a lot Oh, she got the ..."
4,17043,yeah I went to that page and I don't know what...
...,...,...
653,12800,Hello. Good how are you. Not much has happened...
654,12622,"There we go. i'm all right, I think I need to ..."
655,13557,Can you hear me. I know just confused I. yeah ...
656,12722,Hello it's morning. You thought for sure I was...


In [50]:
# Build one record per transcript: the five OCEAN scores derived from the
# message's emotion counts, plus the originating ID and the message text.
prepared_dataset = []
for transcript_id, message in zip(ceo_transcript_data['ID'], ceo_transcript_data['Message']):
    emotion_counts = extract_emotion_features(message)
    record = generate_ocean_scores(emotion_counts, emotion_to_ocean)
    record.update({'ID': transcript_id, 'Message': message})
    prepared_dataset.append(record)

In [51]:
# Rebind the name from the list of per-transcript records to a DataFrame
# (one row per record, one column per key).
prepared_dataset = pd.DataFrame(data=prepared_dataset)

In [53]:
# Persist the scored transcripts; index=False leaves the row index out of the sheet.
prepared_dataset.to_excel(
    excel_writer='../dataset/ocean_prepared_dataset.xlsx',
    index=False,
)

In [None]:
# emotion_features = [extract_emotion_features(text) for text in texts]
# ocean_scores = [generate_ocean_scores(emotion, emotion_to_ocean) for emotion in emotion_features]

# emotion_df = pd.DataFrame(emotion_features).fillna(0)
# ocean_df = pd.DataFrame(ocean_scores).fillna(0)