In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import string

import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import emoji

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Preprocess class
class Preprocess:
    def remove_non_letters(self, text):
        return re.sub(r'[^a-zA-Z]', ' ', text)

    def remove_stop_words(self, text):
        stop_words = set(stopwords.words('english')) - {"never", "not", "nor"}
        tokens = word_tokenize(text)
        filtered_words = [word for word in tokens if word.lower() not in stop_words]
        return ' '.join(filtered_words)

    def normalize_words(self, text):
        return text.lower()

    def remove_short_words(self, text, min_length=2):
        return ' '.join([word for word in text.split() if len(word) >= min_length])

    def remove_long_words(self, text, max_length=15):
        return ' '.join([word for word in text.split() if len(word) <= max_length])

    def get_wordnet_pos(self, tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters and punctuation
        return text

    def remove_twitter_handles(self, text):
        return re.sub(r'@\w+', '', text)

    def expand_contractions(self, text):
        contractions_dict = {
            "didn't": "did not", "doesn't": "does not", "don't": "do not", "aren't": "are not",
            "can't": "cannot", "couldn't": "could not", "hadn't": "had not", "hasn't": "has not",
            "haven't": "have not", "he's": "he is", "I'm": "I am", "it's": "it is", "let's": "let us",
            "mightn't": "might not", "mustn't": "must not", "shan't": "shall not", "she's": "she is",
            "shouldn't": "should not", "that's": "that is", "there's": "there is", "they're": "they are",
            "we're": "we are", "weren't": "were not", "who's": "who is", "won't": "will not",
            "wouldn't": "would not", "you're": "you are", "you've": "you have", "iam": "i am"
        }
        contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                          flags=re.IGNORECASE|re.DOTALL)

        def replace(match):
            return contractions_dict[match.group(0).lower()]

        return contractions_pattern.sub(replace, text)

    def remove_numbers(self, text):
        return re.sub(r'\d+', '', text)

    def remove_emoticons(self, text):
        return emoji.replace_emoji(text, replace='')

    def expand_abbreviations(self, text):
        abbreviations_dict = {
            "btw": "by the way", "lol": "laughing out loud", "idk": "I don't know", "omg": "oh my god",
            "brb": "be right back", "imo": "in my opinion", "smh": "shaking my head", "tbh": "to be honest"
        }
        abbreviations_pattern = re.compile('({})'.format('|'.join(abbreviations_dict.keys())), flags=re.IGNORECASE)

        def replace_abbreviation(match):
            return abbreviations_dict[match.group(0).lower()]

        return abbreviations_pattern.sub(replace_abbreviation, text)

    def remove_extra_spaces(self, text):
        return ' '.join(text.split())



data = pd.read_csv('train.csv')  


PP = Preprocess()

data['cleaned_text'] = data['text'].apply(PP.remove_non_letters)
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_stop_words)
data['cleaned_text'] = data['cleaned_text'].apply(PP.normalize_words)
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_short_words)
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: PP.remove_long_words(x, 15))
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_twitter_handles)
data['cleaned_text'] = data['cleaned_text'].apply(PP.clean_text)
data['cleaned_text'] = data['cleaned_text'].apply(PP.expand_contractions)
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_numbers)
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_stop_words)
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_short_words)
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_emoticons)  
data['cleaned_text'] = data['cleaned_text'].apply(PP.expand_abbreviations) 
data['cleaned_text'] = data['cleaned_text'].apply(PP.remove_extra_spaces)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
data.drop(["id", "keyword", "location", "text"], axis=1, inplace=True)

In [None]:

X = data['cleaned_text'] # Define X
y = data['target']
vectorizer = TfidfVectorizer(max_features=1000) 
XEncoded = vectorizer.fit_transform(X).toarray()  


K = 5
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

final_model = None  
model_weights = np.ones(len(y))  
alphas = []  

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix

for train_idx, test_idx in cv.split(XEncoded, y):
    
    X_train, X_test = XEncoded[train_idx], XEncoded[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    
    model = LogisticRegression(solver='liblinear', class_weight='balanced')
    model.fit(X_train, y_train, sample_weight=model_weights[train_idx])

    
    y_pred = model.predict(X_test)

    
    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])

    
    alpha = np.log((1 - error_rate) / (error_rate + 1e-10)) 
    alphas.append(alpha)

    
    model_weights[test_idx] *= np.exp(alpha * errors)
    
    final_model = model  


final_pred = final_model.predict(XEncoded)  
accuracy = accuracy_score(y, final_pred)





report = classification_report(y, final_pred)
print("\nClassification Report for Logistic Regression:\n", report)

conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.85      0.80      0.82      4342
           1       0.75      0.82      0.78      3271

    accuracy                           0.81      7613
   macro avg       0.80      0.81      0.80      7613
weighted avg       0.81      0.81      0.81      7613


Confusion Matrix:
 [[3461  881]
 [ 598 2673]]


In [None]:
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix

for train_idx, test_idx in cv.split(XEncoded, y):

    X_train, X_test = XEncoded[train_idx], XEncoded[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10)) 
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)

   
    final_model = model 


final_pred = final_model.predict(XEncoded)
accuracy = accuracy_score(y, final_pred)


report = classification_report(y, final_pred)
print("\nClassification Report for Random Forest:\n", report)


conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95      4342
           1       0.94      0.91      0.93      3271

    accuracy                           0.94      7613
   macro avg       0.94      0.93      0.94      7613
weighted avg       0.94      0.94      0.94      7613


Confusion Matrix:
 [[4165  177]
 [ 292 2979]]


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix

for train_idx, test_idx in cv.split(XEncoded, y):

    X_train, X_test = XEncoded[train_idx], XEncoded[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = SVC(kernel='linear', class_weight='balanced', probability=True)
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10)) 
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)

    final_model = model  


final_pred = final_model.predict(XEncoded)  
accuracy = accuracy_score(y, final_pred)


report = classification_report(y, final_pred)
print("\nClassification Report for SVC:\n", report)


conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Classification Report for SVC:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85      4342
           1       0.81      0.78      0.80      3271

    accuracy                           0.83      7613
   macro avg       0.83      0.82      0.82      7613
weighted avg       0.83      0.83      0.83      7613


Confusion Matrix:
 [[3761  581]
 [ 726 2545]]


In [None]:
!pip install nltk
import nltk
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix

nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


def get_wordnet_pos(word):
    """Get WordNet POS tag for correct lemmatization"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def calculate_sentiwordnet_score(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    score = 0
    for token in tokens:
        
        pos = get_wordnet_pos(token)
        lemma = lemmatizer.lemmatize(token, pos)

        
        synsets = list(swn.senti_synsets(lemma, pos))
        if synsets:
            sentiment = synsets[0]
            score += sentiment.pos_score() - sentiment.neg_score()

    return score


data['sentiment_score'] = data['cleaned_text'].apply(calculate_sentiwordnet_score)

X_combined = np.hstack((XEncoded, data[['sentiment_score']].values))


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

K = 5
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

final_model = None
model_weights = np.ones(len(y))  
alphas = []  


for train_idx, test_idx in cv.split(X_combined, y):

    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10))
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)


    final_model = model


final_pred = final_model.predict(X_combined)


accuracy = accuracy_score(y, final_pred)

report = classification_report(y, final_pred)
print("\nClassification Report for Random Forest Model with Sentiment Analysis:\n", report)


conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)



[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.



Classification Report for Random Forest Model with Sentiment Analysis:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      4342
           1       0.95      0.91      0.93      3271

    accuracy                           0.94      7613
   macro avg       0.94      0.94      0.94      7613
weighted avg       0.94      0.94      0.94      7613


Confusion Matrix:
 [[4189  153]
 [ 290 2981]]


In [None]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def calculate_sentiwordnet_sentiment(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)

    sentiment_score = 0
    token_count = 0

    for word, pos in pos_tags:
        wn_pos = get_wordnet_pos(pos)
        if wn_pos:
            synsets = list(swn.senti_synsets(word, wn_pos))
            if synsets:
                synset = synsets[0]  
                sentiment_score += synset.pos_score() - synset.neg_score()
                token_count += 1


    return sentiment_score / token_count if token_count > 0 else 0


data['sentiment_score'] = data['cleaned_text'].apply(calculate_sentiwordnet_sentiment)

X_combined = np.hstack((XEncoded, data[['sentiment_score']].values))
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

K = 5
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

final_model = None
model_weights = np.ones(len(y)) 
alphas = []

for train_idx, test_idx in cv.split(X_combined, y):

    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = SVC(kernel='linear', class_weight='balanced', probability=True)
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10))
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)


    final_model = model


final_pred = final_model.predict(X_combined)
accuracy = accuracy_score(y, final_pred)

report = classification_report(y, final_pred)
print("\nClassification Report for SVM Model with Sentiment Analysis:\n", report)


conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Classification Report for SVM Model with Sentiment Analysis:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85      4342
           1       0.81      0.78      0.80      3271

    accuracy                           0.83      7613
   macro avg       0.83      0.82      0.82      7613
weighted avg       0.83      0.83      0.83      7613


Confusion Matrix:
 [[3763  579]
 [ 722 2549]]


In [None]:
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from flair.models import TextClassifier
from flair.data import Sentence
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix


nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


flair_classifier = TextClassifier.load('en-sentiment')


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def calculate_sentiwordnet_sentiment(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)

    sentiment_score = 0
    token_count = 0

    for word, pos in pos_tags:
        wn_pos = get_wordnet_pos(pos)
        if wn_pos:
            synsets = list(swn.senti_synsets(word, wn_pos))
            if synsets:
                synset = synsets[0]  
                sentiment_score += synset.pos_score() - synset.neg_score()
                token_count += 1


    return sentiment_score / token_count if token_count > 0 else 0


def calculate_flair_sentiment(text):
    sentence = Sentence(text)
    flair_classifier.predict(sentence)
    score = sentence.labels[0].score
    value = 1 if sentence.labels[0].value == 'POSITIVE' else -1
    return score * value


data['sentiwordnet_sentiment'] = data['cleaned_text'].apply(calculate_sentiwordnet_sentiment)
data['flair_sentiment'] = data['cleaned_text'].apply(calculate_flair_sentiment)


tfidf = TfidfVectorizer(max_features=5000) 
XEncoded = tfidf.fit_transform(data['cleaned_text']).toarray() #


X_combined = np.hstack((XEncoded, data[['sentiwordnet_sentiment', 'flair_sentiment']].values))
y = data['target']


K = 5
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

final_model = None 
model_weights = np.ones(len(y))  
alphas = [] 


for train_idx, test_idx in cv.split(X_combined, y):

    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = LogisticRegression(solver='liblinear', class_weight='balanced')
    model.fit(X_train, y_train, sample_weight=model_weights[train_idx])


    y_pred = model.predict(X_test)


    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10))
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)


    final_model = model


final_pred = final_model.predict(X_combined)



report = classification_report(y, final_pred)
print("\nClassification Report for Logistic Regression Model with Sentiment Analysis:\n", report)


conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


2024-09-19 14:59:58,922 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmp7dqhptwb


100%|██████████| 253M/253M [00:16<00:00, 16.1MB/s]

2024-09-19 15:00:15,955 copying /tmp/tmp7dqhptwb to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2024-09-19 15:00:16,166 removing temp file /tmp/tmp7dqhptwb


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Classification Report for Logistic Regression Model with Sentiment Analysis:
               precision    recall  f1-score   support

           0       0.92      0.88      0.90      4342
           1       0.85      0.90      0.88      3271

    accuracy                           0.89      7613
   macro avg       0.89      0.89      0.89      7613
weighted avg       0.89      0.89      0.89      7613


Confusion Matrix:
 [[3830  512]
 [ 328 2943]]


In [None]:
!pip install text2emotion
!pip install nrclex
!pip install sklearn

Collecting text2emotion
  Downloading text2emotion-0.0.5-py3-none-any.whl.metadata (3.1 kB)
Downloading text2emotion-0.0.5-py3-none-any.whl (57 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: text2emotion
Successfully installed text2emotion-0.0.5
Collecting nrclex
  Downloading NRCLex-4.0-py3-none-any.whl.metadata (3.2 kB)
INFO: pip is looking at multiple versions of nrclex to determine which version is compatible with other requirements. This could take a while.
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nrclex
  Building wheel for nrclex (setup.py

In [None]:

import nltk
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix


from flair.models import TextClassifier
from flair.data import Sentence


classifier = TextClassifier.load('en-sentiment')


def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def calculate_sentiwordnet_score(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    score = 0
    for token in tokens:
        pos = get_wordnet_pos(token)
        lemma = lemmatizer.lemmatize(token, pos)
        synsets = list(swn.senti_synsets(lemma, pos))
        if synsets:
            sentiment = synsets[0]
            score += sentiment.pos_score() - sentiment.neg_score()
    return score


def calculate_flair_sentiment(text):
    sentence = Sentence(text)
    classifier.predict(sentence)
    label = sentence.labels[0]
    return label.score if label.value == 'POSITIVE' else -label.score


data['sentiwordnet_score'] = data['cleaned_text'].apply(calculate_sentiwordnet_score)
data['flair_sentiment_score'] = data['cleaned_text'].apply(calculate_flair_sentiment)


X_combined = np.hstack((XEncoded, data[['sentiwordnet_score', 'flair_sentiment_score']].values))


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

K = 5
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

final_model = None
model_weights = np.ones(len(y))  
alphas = []  


for train_idx, test_idx in cv.split(X_combined, y):

    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10))
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)


    final_model = model


final_pred = final_model.predict(X_combined)


report = classification_report(y, final_pred)
print("\nClassification Report for Random Forest with Sentiment Analysis:\n", report)

conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Classification Report for Random Forest with Sentiment Analysis:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96      4342
           1       0.97      0.93      0.95      3271

    accuracy                           0.96      7613
   macro avg       0.96      0.95      0.95      7613
weighted avg       0.96      0.96      0.96      7613


Confusion Matrix:
 [[4240  102]
 [ 235 3036]]


In [None]:

from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from flair.models import TextClassifier
from flair.data import Sentence
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
import nltk
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


classifier = TextClassifier.load('en-sentiment')


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def calculate_sentiwordnet_sentiment(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)

    sentiment_score = 0
    token_count = 0

    for word, pos in pos_tags:
        wn_pos = get_wordnet_pos(pos)
        if wn_pos:
            synsets = list(swn.senti_synsets(word, wn_pos))
            if synsets:
                synset = synsets[0]  
                sentiment_score += synset.pos_score() - synset.neg_score()
                token_count += 1


    return sentiment_score / token_count if token_count > 0 else 0


def calculate_flair_sentiment(text):
    sentence = Sentence(text)
    classifier.predict(sentence)
    label = sentence.labels[0]
    return label.score if label.value == 'POSITIVE' else -label.score


data['sentiwordnet_score'] = data['cleaned_text'].apply(calculate_sentiwordnet_sentiment)
data['flair_sentiment_score'] = data['cleaned_text'].apply(calculate_flair_sentiment)


X_combined = np.hstack((XEncoded, data[['sentiwordnet_score', 'flair_sentiment_score']].values))


from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

K = 5
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

final_model = None
model_weights = np.ones(len(y))  
alphas = []

for train_idx, test_idx in cv.split(X_combined, y):

    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    model = SVC(kernel='linear', class_weight='balanced', probability=True)
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)

    errors = (y_pred != y_test).astype(int)
    error_rate = np.average(errors, weights=model_weights[test_idx])


    alpha = np.log((1 - error_rate) / (error_rate + 1e-10))
    alphas.append(alpha)


    model_weights[test_idx] *= np.exp(alpha * errors)


    final_model = model


final_pred = final_model.predict(X_combined)
accuracy = accuracy_score(y, final_pred)
report = classification_report(y, final_pred)
print("\nClassification Report SVC Model with Sentiment Analysis:\n", report)


conf_matrix = confusion_matrix(y, final_pred)
print("\nConfusion Matrix:\n", conf_matrix)

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Classification Report SVC Model with Sentiment Analysis:
               precision    recall  f1-score   support

           0       0.88      0.92      0.90      4342
           1       0.88      0.83      0.86      3271

    accuracy                           0.88      7613
   macro avg       0.88      0.87      0.88      7613
weighted avg       0.88      0.88      0.88      7613


Confusion Matrix:
 [[3979  363]
 [ 549 2722]]
