In [None]:
import os
import pickle
import joblib
import pandas as pd
import numpy as np
import re
import warnings
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score
from scipy.sparse import hstack
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'transformers'

In [2]:
# Load dataset
print('Loading dataset...')
df = pd.read_csv('Twitter_Sentiments.csv', encoding='latin-1', header=None, on_bad_lines='skip')
df = df[[0, 5]]
df.columns = ['label', 'tweet']

# Map: 0 (negative) -> 1, 2 (neutral) -> 0, 4 (positive) -> 2
print('Mapping labels...')
df['label'] = df['label'].map({0: 1, 2: 0, 4: 2})  # 1=negative, 0=neutral, 2=positive
df = df[df['label'].isin([0, 1, 2])]
df = df.dropna(subset=['tweet'])
df['tweet'] = df['tweet'].astype(str)

Loading dataset...
Mapping labels...


In [3]:
# Slang dictionary
slang_dict = {
    "gr8": "great", "luv": "love", "b4": "before", "u": "you", "ur": "your", "omg": "oh my god",
    "idk": "i do not know", "smh": "shaking my head", "tbh": "to be honest", "lmao": "laughing",
    "wtf": "what the heck", "btw": "by the way", "thx": "thanks", "pls": "please", "plz": "please",
    "imo": "in my opinion", "imho": "in my humble opinion", "fyi": "for your information", "brb": "be right back",
    "bff": "best friend forever", "rofl": "rolling on the floor laughing", "afaik": "as far as i know",
    "irl": "in real life", "jk": "just kidding", "np": "no problem", "ty": "thank you", "yw": "you are welcome",
    "gg": "good game", "ftw": "for the win", "atm": "at the moment", "bc": "because", "cya": "see you",
    "dm": "direct message", "fb": "facebook", "fomo": "fear of missing out", "hmu": "hit me up",
    "icymi": "in case you missed it", "ily": "i love you", "lmk": "let me know", "nvm": "never mind",
    "omw": "on my way", "tba": "to be announced", "tbd": "to be decided", "tgif": "thank god it's friday",
    "ttyl": "talk to you later", "wyd": "what are you doing", "ya": "you", "tho": "though", "wanna": "want to",
    "gonna": "going to", "gotta": "got to", "kinda": "kind of", "sorta": "sort of", "ain't": "is not",
    "w8": "wait", "bday": "birthday", "cuz": "because", "coz": "because", "dunno": "do not know",
    "sup": "what is up", "yo": "hello", "fam": "family", "bae": "before anyone else", "lit": "amazing",
    "salty": "bitter", "savage": "bold", "slay": "succeed", "fire": "excellent", "goat": "greatest of all time",
    "noob": "newbie", "stan": "support", "tea": "gossip", "vibe": "atmosphere", "yeet": "throw", "sus": "suspicious",
    "cap": "lie", "bet": "okay", "flex": "show off", "ghost": "ignore", "lowkey": "quietly", "highkey": "openly",
    "mood": "relatable", "shade": "insult", "ship": "support relationship", "snatched": "perfect", "thirsty": "desperate",
    "woke": "aware", "yolo": "you only live once"
}
def replace_slang(text):
    words = text.split()
    return ' '.join([slang_dict.get(w.lower(), w) for w in words])
df['tweet'] = df['tweet'].apply(replace_slang)

def handle_negation(text):
    text = re.sub(r'not ([a-zA-Z]+)', r'not_\1', text)
    return text
df['tweet'] = df['tweet'].apply(handle_negation)

def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [4]:
# Sarcasm detection (rule-based approach - no model download required)
print('Setting up rule-based sarcasm detection...')

# Mixed sentiment detection
positive_words = set([
    "good", "great", "excellent", "amazing", "love", "fantastic", "awesome", "wonderful", "superb", "outstanding",
    "brilliant", "positive", "enjoy", "happy", "pleased", "delight", "satisfied", "best", "favorite", "cool", "nice",
    "perfect", "impressive", "sweet", "beautiful", "fun", "success", "win", "winning", "blessed", "grateful", "excited"
])
negative_words = set([
    "bad", "terrible", "awful", "hate", "worst", "poor", "disappoint", "sad", "angry", "upset", "horrible", "negative",
    "unhappy", "annoyed", "frustrated", "fail", "failure", "problem", "issue", "sucks", "lame", "boring", "dull",
    "disgust", "regret", "pain", "annoy", "unimpressed", "mediocre", "crap", "garbage", "trash", "bug", "broken"
])
# Sarcasm indicators
sarcasm_indicators = {
    'exaggeration_words': ['literally', 'obviously', 'clearly', 'totally', 'completely', 'absolutely', 'definitely'],
    'irony_words': ['sure', 'right', 'yeah', 'okay', 'whatever', 'fine', 'great'],
    'sarcasm_phrases': ['oh great', 'wonderful', 'fantastic', 'brilliant', 'genius', 'smart', 'clever'],
    'question_marks': ['?', '??', '???'],
    'capitalization': ['ALL CAPS', 'MiXeD cAsE'],
    'emoticons': [':)', ':-)', ';)', ';-)', ':/', ':-/', ':|', ':-|'],
    'hashtags': ['#sarcasm', '#not', '#irony', '#sure', '#whatever']
}

def detect_sarcasm_rule_based(text):
    """
    Rule-based sarcasm detection using linguistic patterns
    """
    text_lower = text.lower()
    sarcasm_score = 0
    
    # Check for exaggeration words
    for word in sarcasm_indicators['exaggeration_words']:
        if word in text_lower:
            sarcasm_score += 1
    
    # Check for irony words (especially when used with negative context)
    irony_count = sum(1 for word in sarcasm_indicators['irony_words'] if word in text_lower)
    if irony_count > 0:
        # Check if there are negative words nearby
        negative_context = any(word in text_lower for word in negative_words)
        if negative_context:
            sarcasm_score += irony_count * 2
    
    # Check for sarcasm phrases
    for phrase in sarcasm_indicators['sarcasm_phrases']:
        if phrase in text_lower:
            sarcasm_score += 2
    
    # Check for excessive punctuation
    if text.count('!') > 2 or text.count('?') > 2:
        sarcasm_score += 1
    
    # Check for ALL CAPS
    if text.isupper() and len(text) > 5:
        sarcasm_score += 2
    
    # Check for mixed case (sArCaSm)
    if any(c.isupper() for c in text[1:]) and any(c.islower() for c in text[1:]):
        sarcasm_score += 1
    
    # Check for emoticons
    for emoticon in sarcasm_indicators['emoticons']:
        if emoticon in text:
            sarcasm_score += 1
    
    # Check for sarcasm hashtags
    for hashtag in sarcasm_indicators['hashtags']:
        if hashtag in text_lower:
            sarcasm_score += 3
    
    # Check for contradiction patterns
    contradiction_patterns = [
        ('good', 'bad'), ('great', 'terrible'), ('love', 'hate'),
        ('amazing', 'awful'), ('perfect', 'worst'), ('best', 'worst')
    ]
    
    for pos_word, neg_word in contradiction_patterns:
        if pos_word in text_lower and neg_word in text_lower:
            sarcasm_score += 2
    
    # Check for "not" + positive word patterns
    not_positive_patterns = [
        'not good', 'not great', 'not amazing', 'not perfect', 'not love',
        'not happy', 'not excited', 'not thrilled'
    ]
    
    for pattern in not_positive_patterns:
        if pattern in text_lower:
            sarcasm_score += 1
    
    # Return 1 if sarcasm score is high enough, 0 otherwise
    return int(sarcasm_score >= 2)

print('Detecting sarcasm using rule-based approach...')
df['sarcasm'] = df['tweet'].apply(detect_sarcasm_rule_based)


def is_mixed_sentiment(text):
    words = set(text.lower().split())
    return int(len(words & positive_words) > 0 and len(words & negative_words) > 0)
df['mixed_sentiment'] = df['tweet'].apply(is_mixed_sentiment)

# Copy this code and add it as a new cell BEFORE the model training cell



Setting up rule-based sarcasm detection...
Detecting sarcasm using rule-based approach...


In [5]:
print('Preprocessing text data...')

# Apply text preprocessing
df['tweet'] = df['tweet'].apply(remove_pattern, pattern="@[\w]*")
df['tweet'] = df['tweet'].apply(remove_emojis)
df['tweet'] = df['tweet'].apply(lambda x: re.sub("[^a-zA-Z#]", " ", x))
df['tweet'] = df['tweet'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
df['tweet'] = df['tweet'].apply(lambda x: " ".join([w for w in x.split() if len(w) > 3]))

# Stemming
stemmer = PorterStemmer()
df['tweet'] = df['tweet'].apply(lambda x: " ".join([stemmer.stem(w) for w in x.split()]))

print('Creating Bag of Words features...')
# Create Bag of Words features
bow_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
bow_features = bow_vectorizer.fit_transform(df['tweet'])

print('Adding extra features...')
# Add extra features (sarcasm and mixed sentiment)
extra_features = np.column_stack([df['sarcasm'], df['mixed_sentiment']])

# Combine BOW features with extra features
bow_with_features = hstack([bow_features, extra_features])

print(f'Feature matrix shape: {bow_with_features.shape}')
print('Preprocessing completed!')
print('Checking label distribution...')
print(f'Unique labels in dataset: {sorted(df["label"].unique())}')
print(f'Label counts: {df["label"].value_counts().sort_index()}')

# Fix the label mapping issue
if 0 not in df['label'].unique():
    print('Remapping labels to [0, 1] for 2-class problem...')
    df['label'] = df['label'].map({1: 0, 2: 1})  # 1->0 (negative), 2->1 (neutral)
    num_classes = 2
else:
    num_classes = 3

print(f'After remapping - Unique labels: {sorted(df["label"].unique())}')
print(f'Label counts: {df["label"].value_counts().sort_index()}')

print('Splitting data...')
x_train, x_test, y_train, y_test = train_test_split(bow_with_features, df['label'], random_state=42, test_size=0.25)

print('Training XGBoost...')
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    objective='multi:softmax', 
    num_class=num_classes
)
xgb_model.fit(x_train, y_train)

# Save model and vectorizer
pickle.dump(xgb_model, open('regmodel.pkl', 'wb'))
joblib.dump(bow_vectorizer, 'bow_vectorizer.joblib')

print('Evaluating...')
pred = xgb_model.predict(x_test)
f1 = f1_score(y_test, pred, average='weighted')
accuracy = accuracy_score(y_test, pred)
print(f'F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}')

Preprocessing text data...
Creating Bag of Words features...
Adding extra features...
Feature matrix shape: (1340179, 5002)
Preprocessing completed!
Checking label distribution...
Unique labels in dataset: [np.int64(0), np.int64(1), np.int64(2)]
Label counts: label
0     25000
1    800000
2    515179
Name: count, dtype: int64
After remapping - Unique labels: [np.int64(0), np.int64(1), np.int64(2)]
Label counts: label
0     25000
1    800000
2    515179
Name: count, dtype: int64
Splitting data...
Training XGBoost...
Evaluating...
F1 Score: 0.7089, Accuracy: 0.7279


In [6]:
print('Checking label distribution...')
print(f'Unique labels in dataset: {sorted(df["label"].unique())}')
print(f'Label counts: {df["label"].value_counts().sort_index()}')

# FIXED: Ensure we have 3 classes for proper sentiment analysis
print('Setting up 3-class sentiment analysis...')
print(f'Current unique labels: {sorted(df["label"].unique())}')

# Make sure we have labels 0, 1, 2
if 0 not in df['label'].unique():
    print('Remapping labels to ensure 3 classes...')
    # Map: 1->0 (negative), 2->1 (neutral), and we'll need positive class
    df['label'] = df['label'].map({1: 0, 2: 1})  # 1->0 (negative), 2->1 (neutral)
    
    # If we don't have positive examples, we can create some from neutral
    # For now, let's work with what we have and ensure 3 classes
    print('Note: Model will be trained with available classes')
    
num_classes = 3  # Force 3 classes
print(f'Final unique labels: {sorted(df["label"].unique())}')
print(f'Label counts: {df["label"].value_counts().sort_index()}')



print('Splitting data...')
x_train, x_test, y_train, y_test = train_test_split(bow_with_features, df['label'], random_state=42, test_size=0.25)

print('Training XGBoost...')
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    objective='multi:softmax', 
    num_class=num_classes
)
xgb_model.fit(x_train, y_train)

# Save model and vectorizer
pickle.dump(xgb_model, open('regmodel.pkl', 'wb'))
joblib.dump(bow_vectorizer, 'bow_vectorizer.joblib')

print('Evaluating...')
pred = xgb_model.predict(x_test)
f1 = f1_score(y_test, pred, average='weighted')
accuracy = accuracy_score(y_test, pred)
print(f'F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}')

Checking label distribution...
Unique labels in dataset: [np.int64(0), np.int64(1), np.int64(2)]
Label counts: label
0     25000
1    800000
2    515179
Name: count, dtype: int64
Setting up 3-class sentiment analysis...
Current unique labels: [np.int64(0), np.int64(1), np.int64(2)]
Final unique labels: [np.int64(0), np.int64(1), np.int64(2)]
Label counts: label
0     25000
1    800000
2    515179
Name: count, dtype: int64
Splitting data...
Training XGBoost...
Evaluating...
F1 Score: 0.7089, Accuracy: 0.7279


In [7]:
print('Splitting data...')
x_train, x_test, y_train, y_test = train_test_split(bow_with_features, df['label'], random_state=42, test_size=0.25)

print('Training XGBoost...')
# FIXED: Force 3-class classification
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    objective='multi:softmax', 
    num_class=3  # Force 3 classes: 0=neutral, 1=negative, 2=positive
)
xgb_model.fit(x_train, y_train)

# Save model and vectorizer
pickle.dump(xgb_model, open('regmodel.pkl', 'wb'))
joblib.dump(bow_vectorizer, 'bow_vectorizer.joblib')

print('Evaluating...')
pred = xgb_model.predict(x_test)
f1 = f1_score(y_test, pred, average='weighted')
accuracy = accuracy_score(y_test, pred)
print(f'F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}')

Splitting data...
Training XGBoost...
Evaluating...
F1 Score: 0.7089, Accuracy: 0.7279


In [8]:
print('Ready for prediction!')
def preprocess_review(review):
    review = replace_slang(review)
    review = handle_negation(review)
    review = remove_pattern(review, "@[\w]*")
    review = re.sub("[^a-zA-Z#]", " ", review)
    review = remove_emojis(review)
    review = review.encode('ascii', 'ignore').decode('ascii')
    review = " ".join([w for w in review.split() if len(w) > 3])
    review = " ".join([stemmer.stem(w) for w in review.split()])
    return review

def predict_sentiment(review):
    sarcasm = detect_sarcasm_rule_based(review)
    mixed = is_mixed_sentiment(review)
    processed_review = preprocess_review(review)
    review_bow = bow_vectorizer.transform([processed_review])
    extra = np.array([[sarcasm, mixed]])
    review_with_features = hstack([review_bow, extra])
    prediction = xgb_model.predict(review_with_features)[0]
    if prediction == 2:
        return 'positive'
    elif prediction == 1:
        return 'negative'
    else:
        return 'neutral'

# Example usage
test_texts = [
    "not bad", "yeah right, that was helpful", "the food was good but the service was terrible",
    "idk if I love or hate this", "this is lit", "what a surprise, another bug", "meh", "so helpful, thanks a lot"
]
for text in test_texts:
    print(f'Input: {text} -> Prediction: {predict_sentiment(text)}')

Ready for prediction!
Input: not bad -> Prediction: negative
Input: yeah right, that was helpful -> Prediction: negative
Input: the food was good but the service was terrible -> Prediction: negative
Input: idk if I love or hate this -> Prediction: negative
Input: this is lit -> Prediction: positive
Input: what a surprise, another bug -> Prediction: negative
Input: meh -> Prediction: negative
Input: so helpful, thanks a lot -> Prediction: positive


In [9]:
import pickle
pickle.dump(xgb_model, open('regmodel.pkl', 'wb'))

import joblib
joblib.dump(bow_vectorizer, 'bow_vectorizer.joblib')

['bow_vectorizer.joblib']

In [10]:
pickled_model=pickle.load(open('regmodel.pkl','rb'))