In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

In [2]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [3]:
df = pd.read_csv('/kaggle/input/journal-entries-with-labelled-emotions/data.csv')

In [4]:
def cleantext(text):
    text = str(text).lower()
    
    emotionkeywrds = {
        'happy': ['happy', 'happiness', 'joy', 'joyful', 'pleased', 'excitement', 'excited', 'glad'],
        'sad': ['sad', 'sadness', 'unhappy', 'depressed', 'depression', 'upset', 'disappointing', 'disappointed', 
                'down', 'low', 'blue', 'sorrow', 'grief', 'gloomy','broke'],
        'angry': ['angry', 'anger', 'furious', 'mad', 'rage', 'outrage', 'annoyed', 'irritated', 'frustrated'],
        'anxious': ['anxious', 'anxiety', 'worried', 'worry', 'nervous', 'tense', 'stress', 'stressed', 'fear', 'afraid'],
        'calm': ['calm', 'peaceful', 'relaxed', 'tranquil', 'serene'],
        'proud': ['proud', 'pride', 'accomplished', 'achievement', 'success', 'successful'],
        'disappointed': ['disappointed', 'disappointment', 'letdown', 'failed', 'failure', 'fail', 'less than expected']
    }
    
    allemokeywords = set()
    for words in emotionkeywrds.values():
        allemokeywords.update(words)
    
    text = re.sub(r'[^a-zA-Z\s!?.,]', '', text)
    
    stop_words = set(stopwords.words('english'))
    negation_words = {'not', 'no', 'nor', 'neither', 'never', 'none', 'barely', 'hardly', 'scarcely', 'doesnt', 'didnt', 'wasnt', 'isnt', 'arent', 'couldnt', 'shouldnt', 'wouldnt'}
    
    contractions = {'arent', 'cant', 'couldnt', 'didnt', 'doesnt', 'dont', 
                    'hadnt', 'havent', 'shouldnt', 'wouldnt', 'youve',
                    'youre', 'wont', 'werent', 'weve', 'wed', 'theyre', 'im'}
    
    stop_words.update(contractions)
    stop_words = stop_words - negation_words - allemokeywords
    
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

In [5]:
print("Cleaning text...")
df['cleantext'] = df['Answer'].apply(cleantext)

# Identify emotion columns
emotionalcols = [col for col in df.columns if '.f1.' in col and '.raw' in col]
emotion_names = [col.split('.f1.')[1].split('.raw')[0] for col in emotionalcols]

Cleaning text...


In [6]:
print("Class distribution for emotions:")
class_weights = {}
for i, col in enumerate(emotionalcols):
    pos_count = df[col].sum()
    total = len(df)
    print(f"{emotion_names[i]}: {pos_count} positive instances ({pos_count/total*100:.2f}%)")
    
    if pos_count > 0:
        weight = total / (2 * pos_count)
        class_weights[emotion_names[i]] = weight
    else:
        class_weights[emotion_names[i]] = 1.0

Class distribution for emotions:
afraid: 18 positive instances (1.22%)
angry: 28 positive instances (1.90%)
anxious: 125 positive instances (8.49%)
ashamed: 17 positive instances (1.15%)
awkward: 15 positive instances (1.02%)
bored: 49 positive instances (3.33%)
calm: 368 positive instances (24.98%)
confused: 28 positive instances (1.90%)
disgusted: 22 positive instances (1.49%)
excited: 251 positive instances (17.04%)
frustrated: 141 positive instances (9.57%)
happy: 730 positive instances (49.56%)
jealous: 3 positive instances (0.20%)
nostalgic: 61 positive instances (4.14%)
proud: 337 positive instances (22.88%)
sad: 43 positive instances (2.92%)
satisfied: 591 positive instances (40.12%)
surprised: 64 positive instances (4.34%)


In [7]:
X = df['cleantext']
y = df[emotionalcols]

In [8]:
print("Vectorizing text...")
vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 3),
    sublinear_tf=True
)
X_vec = vectorizer.fit_transform(X)

Vectorizing text...


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=df[emotionalcols[0]] if df[emotionalcols[0]].sum() > 10 else None)

print("Training model...")
base_model = LogisticRegression(
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    solver='liblinear',
    random_state=42
)
model = MultiOutputClassifier(base_model)
model.fit(X_train, y_train)

Training model...


In [10]:
print("Evaluating model...")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=emotion_names, zero_division=0))

Evaluating model...
              precision    recall  f1-score   support

      afraid       0.00      0.00      0.00         4
       angry       0.00      0.00      0.00         3
     anxious       0.44      0.33      0.38        21
     ashamed       0.00      0.00      0.00         2
     awkward       0.00      0.00      0.00         2
       bored       0.50      0.17      0.25         6
        calm       0.45      0.43      0.44        76
    confused       0.00      0.00      0.00         5
   disgusted       0.00      0.00      0.00         2
     excited       0.12      0.09      0.11        53
  frustrated       0.54      0.28      0.37        25
       happy       0.77      0.61      0.68       170
     jealous       0.00      0.00      0.00         0
   nostalgic       0.17      0.08      0.11        12
       proud       0.48      0.42      0.45        79
         sad       0.00      0.00      0.00         5
   satisfied       0.53      0.61      0.57       120
   surp

In [11]:
def get_emotion_thresholds():
    thresholds = {}
    for emotion, weight in class_weights.items():
        if weight > 5:
            thresholds[emotion] = 0.2
        elif weight > 2:
            thresholds[emotion] = 0.3
        else:
            thresholds[emotion] = 0.4
    
    overrides = {
        'happy': 0.55,
        'calm': 0.55,
        'proud': 0.5
    }
    
    thresholds.update(overrides)
    return thresholds

In [12]:
def predict_emotions_advanced(user_input, vectorizer, model, emotion_names):
    emotion_thresholds = get_emotion_thresholds()
    default_threshold = 0.35
    cleaned = cleantext(user_input)
    vectorized = vectorizer.transform([cleaned])
    input_words = set(cleaned.split())
    
    emotionkeywrds = {
        'happy': ['happy', 'joy', 'joyful', 'pleased', 'excitement', 'excited', 'glad'],
        'sad': ['sad', 'unhappy', 'depressed', 'depression', 'upset', 'disappointing', 'disappointed', 
                'down', 'low', 'blue', 'sorrow', 'grief','not','less','broke','reject'],
        'angry': ['angry', 'anger', 'furious', 'mad', 'rage', 'outrage', 'annoyed', 'irritated'],
        'anxious': ['anxious', 'anxiety', 'worried', 'worry', 'nervous', 'tense', 'stress', 'stressed'],
        'fear': ['fear', 'afraid', 'scared', 'frightened', 'terrified'],
        'surprised': ['surprised', 'surprise', 'shocking', 'shocked', 'unexpected'],
        'disgusted': ['disgust', 'disgusted', 'gross', 'revolting'],
        'calm': ['calm', 'peaceful', 'relaxed', 'tranquil', 'serene'],
        'proud': ['proud', 'pride', 'accomplished', 'achievement', 'success', 'successful'],
        'disappointed': ['disappointed', 'disappointment', 'letdown', 'failed', 'failure', 'fail', 'less than expected']
    }
    
    keywords_found = {}
    for emotion, keywords in emotionkeywrds.items():
        for word in keywords:
            if word in cleaned:
                if emotion not in keywords_found:
                    keywords_found[emotion] = []
                keywords_found[emotion].append(word)
    
    emotions_detected = []
    
    try:
        for i, estimator in enumerate(model.estimators_):
            emotion = emotion_names[i]
            threshold = emotion_thresholds.get(emotion, default_threshold)
            proba = estimator.predict_proba(vectorized)[0][1]
            if emotion in keywords_found:
                threshold *= 0.7
            if proba > threshold:
                emotions_detected.append((emotion, float(proba)))
    except:
        raw_predictions = model.predict(vectorized)[0]
        for i, val in enumerate(raw_predictions):
            emotion = emotion_names[i]
            if val == 1:
                emotions_detected.append((emotion, 0.9))
    
    emotions_detected.sort(key=lambda x: x[1], reverse=True)
    
    if "result" in cleaned and ("less" in cleaned or "fail" in cleaned or "not" in cleaned):
        if not any(e[0] in ["sad", "disappointed"] for e in emotions_detected):
            emotions_detected.append(("disappointed", 0.75))
    
    return emotions_detected

In [13]:
test = input()

predicted_emotions = predict_emotions_advanced(test, vectorizer, model, emotion_names)
    
print("\nPredicted Emotions for text:")
print(f"Text: '{test}'")
if predicted_emotions:
    for emotion, prob in predicted_emotions:
        print(f"{emotion}: {prob:.4f}")
else:
    print("No strong emotions detected.")

 i am selected as intern at ISRO



Predicted Emotions for text:
Text: 'i am selected as intern at ISRO'
satisfied: 0.4180
excited: 0.3862
anxious: 0.2945
frustrated: 0.2886
surprised: 0.2514
nostalgic: 0.2029
