In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from tensorflow import keras
from keras import layers,regularizers
from keras.layers import BatchNormalization, Dropout
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score,roc_curve,auc,roc_auc_score
import emoji
from gensim.models import Word2Vec

In [None]:
# Integer codes assigned to the sentiment labels stored in column 2 of the CSVs.
SENTIMENT_CODES = {"Positive": 1, "Negative": 0, "Neutral": 2, "Irrelevant": 3}


def load_sentiment_csv(path):
    """Read a header-less tweet CSV and return it with named columns and coded labels.

    Rows containing any missing value are dropped, columns 0-3 are renamed to
    ``id``, ``category``, ``sentiment`` and ``text``, and the sentiment labels
    are replaced by their integer codes from ``SENTIMENT_CODES``.

    Args:
        path: Location of the CSV file (no header row).

    Returns:
        pandas.DataFrame with an integer ``sentiment`` column.

    Raises:
        ValueError: If the file contains a sentiment label that has no code;
            without this check such labels would silently turn into NaN.
    """
    frame = pd.read_csv(path, header=None).dropna()
    frame = frame.rename(columns={0: "id", 1: "category", 2: "sentiment", 3: "text"})

    codes = frame["sentiment"].map(SENTIMENT_CODES)
    unknown = frame.loc[codes.isna(), "sentiment"].unique()
    if len(unknown):
        raise ValueError(f"Unmapped sentiment labels in {path}: {list(unknown)}")

    frame["sentiment"] = codes.astype("int64")
    return frame


df_train = load_sentiment_csv("twitter_training.csv")
df_test = load_sentiment_csv("twitter_validation.csv")

In [None]:
def removeURL(text):
    """Strip URL-like tokens from ``text``.

    Every run of non-whitespace characters that starts at ``http`` or at
    ``www.`` is deleted. The dot after ``www`` is escaped so that it only
    matches a literal period; unescaped, it matched any character and removed
    ordinary words that merely contain "www".

    Args:
        text: Input string.

    Returns:
        The string with the matched runs removed (surrounding whitespace is kept).
    """
    return re.sub(r'http\S+|www\.\S+', '', text)

def removeMentions(text):
    """Delete every ``@`` followed by one or more word characters from ``text``."""
    mention_pattern = re.compile(r'@\w+')
    without_mentions = mention_pattern.sub('', text)
    return without_mentions

def hashtagHandling(text):
    """Remove the ``#`` characters from ``text`` while keeping the tag words."""
    hash_sign = re.compile(r'#')
    stripped = hash_sign.sub('', text)
    return stripped

def decodeEmoji(text):
    """Pass ``text`` through ``emoji.demojize`` using a single space as both delimiters.

    Spaces are used instead of the library's default delimiters so the emoji
    name ends up separated from neighbouring characters.
    """
    space_delimiters = (" ", " ")
    decoded = emoji.demojize(text, delimiters=space_delimiters)
    return decoded

In [None]:
def cleanTweets(text):
    """Normalise one tweet into lowercase, letters-only, single-spaced text.

    The text is lowercased, then passed in order through ``removeURL``,
    ``removeMentions``, ``hashtagHandling`` and ``decodeEmoji``. Afterwards
    every character that is not an ASCII letter or whitespace is removed,
    whitespace runs are collapsed to one space, and the ends are trimmed.
    """
    cleaned = text.lower()

    # The order is significant: URLs and mentions are removed while their
    # punctuation is still present, before the letters-only filter runs.
    for step in (removeURL, removeMentions, hashtagHandling, decodeEmoji):
        cleaned = step(cleaned)

    letters_only = re.sub(r'[^a-zA-Z\s]', '', cleaned)   # keep only letters & whitespace
    single_spaced = re.sub(r'\s+', ' ', letters_only)    # collapse whitespace runs
    return single_spaced.strip()

In [None]:
def one_hot_encode(df, column_name):
    """Replace ``column_name`` in ``df`` with one indicator column per category.

    A new ``OneHotEncoder`` is fitted on the given column each time this is
    called. The encoded columns are appended to the right of the remaining
    columns and share the index of ``df``.

    Args:
        df: Source DataFrame containing ``column_name``.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``column_name`` and with the indicator columns added.
    """
    from sklearn.preprocessing import OneHotEncoder

    # The dense-output keyword is tried as ``sparse_output`` first, with
    # ``sparse`` as the fallback when the constructor rejects it.
    try:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

    dense = ohe.fit_transform(df[[column_name]])

    # Same idea for the feature-name accessor: preferred name first, then fallback.
    try:
        names = ohe.get_feature_names_out([column_name])
    except AttributeError:
        names = ohe.get_feature_names([column_name])

    indicators = pd.DataFrame(dense, columns=names, index=df.index)
    remainder = df.drop(columns=[column_name])
    return pd.concat([remainder, indicators], axis=1)

In [None]:
def text_preprocessing(df,column_name,vectorizer=None):
    """Turn a text column into a dense TF-IDF feature array.

    Args:
        df: DataFrame holding the text column.
        column_name: Name of the column with the documents.
        vectorizer: Optional, already fitted vectorizer. When given, the text
            is only transformed with it, so a held-out set can be encoded with
            the vocabulary learned on the training set. When omitted, a new
            ``TfidfVectorizer`` (5000 features, English stop words, unigrams
            and bigrams) is fitted on ``df[column_name]`` as before.

    Returns:
        Dense array produced by calling ``toarray()`` on the vectorizer output.
    """
    if vectorizer is None:
        vectorizer=TfidfVectorizer(max_features=5000,stop_words="english",ngram_range=(1,2))
        return vectorizer.fit_transform(df[column_name]).toarray()
    # A supplied vectorizer is never re-fitted; that would discard its vocabulary.
    return vectorizer.transform(df[column_name]).toarray()

In [None]:
# One-hot encode the category column and normalise the tweet text (training set).
df_train=one_hot_encode(df_train,"category")
df_train["text"]=df_train["text"].apply(cleanTweets)

# Same treatment for the held-out set.
df_test=one_hot_encode(df_test,"category")
df_test["text"]=df_test["text"].apply(cleanTweets)

# one_hot_encode fits a separate encoder per frame, so the two frames are not
# guaranteed to end up with the same indicator columns. Align the test frame to
# the training layout: categories unseen in training are dropped and categories
# absent from the test file become all-zero columns.
df_test=df_test.reindex(columns=df_train.columns,fill_value=0)

In [None]:
def create_word2vec_model(sentences):
    """Train a Word2Vec model on whitespace-tokenised ``sentences``.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.

    Returns:
        The trained ``Word2Vec`` model.
    """
    corpus = []
    for sentence in sentences:
        corpus.append(sentence.split())

    settings = dict(
        vector_size=300,  # Dimension of word vectors
        window=5,         # Context window size
        min_count=1,      # Minimum word frequency
        sg=1,             # Skip-gram model
        workers=4,        # Number of threads
    )
    return Word2Vec(corpus, **settings)

def text_to_vectors(texts, model):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        model: Object exposing ``wv`` (word -> vector lookup supporting ``in``)
            and ``vector_size``.

    Returns:
        2-D array with one row per text and ``model.vector_size`` columns.
        Texts with no in-vocabulary word get an all-zero row. An empty
        ``texts`` yields an array of shape ``(0, model.vector_size)`` so the
        result can always be concatenated column-wise with other features.
    """
    dim = model.vector_size
    keyed_vectors = model.wv

    rows = []
    for text in texts:
        # Words missing from the vocabulary are skipped rather than raising.
        known = [keyed_vectors[token] for token in text.split() if token in keyed_vectors]
        rows.append(np.mean(known, axis=0) if known else np.zeros(dim))

    if not rows:
        # np.array([]) would be 1-D; keep the column dimension for empty input.
        return np.zeros((0, dim))
    return np.array(rows)

# The embedding is trained on the training tweets only; the held-out tweets
# are embedded with that same model.
w2v_model = create_word2vec_model(df_train['text'])

encoded_text_train, encoded_text_test = (
    text_to_vectors(frame['text'], w2v_model) for frame in (df_train, df_test)
)

print("Training vectors shape:", encoded_text_train.shape)
print("Testing vectors shape:", encoded_text_test.shape)

In [None]:
# Training features: every column except the label, the id and the raw text,
# followed by the sentence embedding columns.
X=df_train.drop(columns=["sentiment","id","text"]).to_numpy()
X_train=np.hstack((X,encoded_text_train))
y_train=df_train["sentiment"].to_numpy(copy=True)

# Held-out features, assembled the same way.
X=df_test.drop(columns=["sentiment","id","text"]).to_numpy()
X_test=np.hstack((X,encoded_text_test))
y_test=df_test["sentiment"].to_numpy(copy=True)

In [None]:
# Random forest baseline; random_state is fixed and n_jobs=-1 is passed through
# to the estimator unchanged.
modelrf=RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)
modelrf.fit(X_train,y_train)

predictions_train=modelrf.predict(X_train)
predictions_test=modelrf.predict(X_test)

# Train accuracy first, then held-out accuracy.
accuracy_train=accuracy_score(y_train,predictions_train)
accuracy_test=accuracy_score(y_test,predictions_test)
print(accuracy_train," ",accuracy_test)

In [None]:
# Feed-forward classifier: three ReLU blocks with batch normalisation, dropout
# before the output, and a 4-way softmax (one unit per sentiment code).
model=keras.Sequential(
    [
        # Input width follows the feature matrix instead of a hard-coded 332,
        # so the model keeps working if the number of one-hot or embedding
        # columns changes.
        keras.Input(shape=(X_train.shape[1],)),
        layers.Dense(128,activation='relu'),
        BatchNormalization(),
        layers.Dense(64,activation='relu'),
        BatchNormalization(),
        layers.Dense(32,activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        layers.Dense(4, activation='softmax'),
    ]
)

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

model.fit(X_train,y_train,epochs=25,batch_size=32,verbose=0)

# Class predictions are the arg-max over the softmax outputs. The held-out
# probabilities are kept in predProb_test for the ROC/AUC evaluation.
predictions_train=model.predict(X_train).argmax(axis=1)
predProb_test=model.predict(X_test)
predictions_test=predProb_test.argmax(axis=1)

accuracy_train=accuracy_score(y_train,predictions_train)
accuracy_test=accuracy_score(y_test,predictions_test)
print(accuracy_train," ",accuracy_test)

In [38]:
# Confusion matrix and classification report for the training and held-out
# sets. predictions_train / predictions_test are assigned by both model cells,
# so these describe whichever model was run most recently.
cm_train, cr_train = (
    confusion_matrix(y_train,predictions_train),
    classification_report(y_train,predictions_train),
)

cm_test, cr_test = (
    confusion_matrix(y_test,predictions_test),
    classification_report(y_test,predictions_test),
)

In [None]:
def per_class_auc(y_true, y_pred, classes):
    """Compute a one-vs-rest ROC AUC for each entry of ``classes``.

    Args:
        y_true: True class labels.
        y_pred: 2-D score array indexed as ``y_pred[:, i]`` for the i-th class.
        classes: Class labels, in the column order of ``y_pred``.

    Returns:
        Array of AUC values, one per class, rounded to three decimals.
    """
    indicator = label_binarize(y_true, classes=classes)

    scores = []
    for column in range(len(classes)):
        scores.append(roc_auc_score(indicator[:, column], y_pred[:, column]))

    return np.round(np.array(scores),3)

In [None]:
from itertools import cycle
from sklearn.metrics import auc

def plot_roc_curves(y_true, y_score, classes, title):
    """Draw one one-vs-rest ROC curve per class on a single figure and show it.

    Args:
        y_true: True class labels.
        y_score: 2-D score array indexed as ``y_score[:, i]`` for the i-th class.
        classes: Class labels, in the column order of ``y_score``.
        title: Figure title.
    """
    onehot = label_binarize(y_true, classes=classes)
    palette = cycle(['aqua','darkorange','cornflowerblue','green'])

    plt.figure(figsize=(6,4))
    for i in range(len(classes)):
        color = next(palette)  # colours repeat if there are more than four classes
        fpr, tpr, _ = roc_curve(onehot[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=color, lw=2, label=f'Class {classes[i]} (AUC = {roc_auc:.3f})')

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0,1], [0,1], 'k--', lw=1)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Sentiment codes, in the column order expected of the probability arrays.
# NOTE(review): assumes modelrf.classes_ is [0, 1, 2, 3] - confirm.
CLASS_IDS=[0,1,2,3]

# Compute the random forest probabilities once and reuse them for both the
# AUC table and the ROC plot instead of predicting twice.
rf_proba_test=modelrf.predict_proba(X_test)

aucs=per_class_auc(y_test,rf_proba_test,classes=CLASS_IDS)
print(f"Random Forest AUCs: {aucs}")
plot_roc_curves(y_test,rf_proba_test,classes=CLASS_IDS,title="Random Forest ROC- Test")

aucs=per_class_auc(y_test,predProb_test,classes=CLASS_IDS)
print(f"Neural Network AUCs: {aucs}")
plot_roc_curves(y_test,predProb_test,classes=CLASS_IDS,title="Neural Network ROC- Test")
