In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Part A of the training data is tab-separated; load it and preview it.
df1 = pd.read_csv(
    '/content/train_data part-a.txt',
    sep='\t',
)

# Quick sanity check: first rows, a divider, then (rows, columns).
print(df1.head(5))
print("--------------------------------------------------")
print("the shape of the dataset is :", df1.shape)

In [None]:
# Part B: sep=None lets the python engine sniff the delimiter, and the file is
# decoded as latin-1.
df2 = pd.read_csv(
    '/content/train_data part-b.txt',
    sep=None,
    engine='python',
    encoding='latin-1',
)

# Same sanity check as for part A.
print(df2.head(5))
print("--------------------------------------------------")
print("the shape of the dataset is :", df2.shape)

In [None]:
# Stack part A and part B row-wise into a single frame.
# ignore_index=True rebuilds a fresh 0..n-1 index; without it the rows of
# part B reuse the row labels of part A, and duplicate labels make label-based
# lookups (.loc) and index alignment ambiguous.
data = pd.concat([df1, df2], axis=0, ignore_index=True)

print(data.head())

print("--------------------------------------------------")

print("Shape of the DataFrame:", data.shape)

In [None]:
# Column names, non-null counts and dtypes of the combined frame.
data.info()

In [None]:
# Per-column count of missing cells (isna is the same check as isnull).
print("The null values in the dataframe:")
print(data.isna().sum())

In [None]:
# Report how many rows are exact duplicates of an earlier row.
# The count is computed once here; the former bare expression above the print
# was neither stored nor displayed and only repeated the full-frame scan.
print("\nNumber of duplicates:", data.duplicated().sum())

In [None]:
# Number of rows per distinct value of the Target column.
data['Target'].value_counts()

In [None]:
# Number of rows per distinct value of the Stance column (the label that is
# later used as the prediction target `y`).
data['Stance'].value_counts()

In [None]:
import plotly.express as px

# Tally rows per Stance value and give the two resulting columns explicit
# names so the plotting call can refer to them.
target_counts = data['Stance'].value_counts().reset_index()
target_counts.columns = ['Stance', 'count']

# Pie chart of the label balance.
fig = px.pie(
    data_frame=target_counts,
    names='Stance',
    values='count',
    title="Distribution of Stance Categories",
    color_discrete_sequence=px.colors.sequential.Plasma,
)

fig.show()

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the stop-word lists,
# the punkt tokenizer data (two package names; presumably to cover different
# NLTK versions - TODO confirm) and WordNet for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
# Single shared lemmatizer instance, reused by preprocess_text for every tweet.
lemmatizer = WordNetLemmatizer()

def preprocess_text(Tweet):
    """Clean one tweet and return its tokens, lemmas and cleaned string.

    Steps, in order: strip URLs, @mentions and #hashtags, drop every character
    that is not an ASCII letter or whitespace, lowercase, tokenize with
    ``word_tokenize`` and lemmatize each token with the module-level
    ``lemmatizer``.

    Args:
        Tweet: Raw tweet text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        tuple: ``(tokens, lemmatized_tokens, cleaned_text)`` where ``tokens``
        and ``lemmatized_tokens`` are lists of strings and ``cleaned_text`` is
        the lemmas joined by single spaces.
    """
    # A missing tweet would make re.sub raise TypeError and abort the whole
    # DataFrame.apply; return the empty result instead.
    if Tweet is None or (isinstance(Tweet, float) and Tweet != Tweet):
        return [], [], ''

    text = str(Tweet)
    text = re.sub(r'http\S+', '', text)      # URLs
    text = re.sub(r'@\w+', '', text)         # @mentions
    text = re.sub(r'#\w+', '', text)         # whole hashtags, word included
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # digits, punctuation, emoji, ...
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    cleaned_text = ' '.join(lemmatized_tokens)

    return tokens, lemmatized_tokens, cleaned_text

# Run the cleaner on every tweet. Wrapping the returned 3-tuple in a Series
# makes apply() produce three columns, which are stored under the names below.
data[['tokenized_tweet', 'lemmatized_tweet', 'cleaned_tweet']] = (
    data['Tweet'].apply(lambda raw_tweet: pd.Series(preprocess_text(raw_tweet)))
)

data.head()

In [None]:
from IPython.display import display

# Ten random rows, raw tweet next to its cleaned version, for a visual check.
display(data.loc[:, ['Tweet', 'cleaned_tweet']].sample(n=10))

In [None]:
# English stop words as a set, so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in
    ``stop_words`` removed; the remaining tokens are joined by single spaces."""
    return ' '.join(token for token in text.split() if token not in stop_words)

# Stop-word-free variant of the cleaned text, kept in its own column.
data['text_no_stopwords'] = data['cleaned_tweet'].apply(func=remove_stopwords)

# Ten random rows, before/after stop-word removal.
display(data.loc[:, ['cleaned_tweet', 'text_no_stopwords']].sample(n=10))

In [None]:
# First five rows of the frame, now including the derived text columns.
data.head(n=5)

In [None]:
from collections import Counter

# One long string holding every stop-word-free tweet (also reused for the
# word cloud further down).
all_text = ' '.join(data['text_no_stopwords'])

# Token -> number of occurrences across the whole corpus.
word_freq = Counter()
word_freq.update(all_text.split())

# The ten most frequent tokens as (word, count) pairs.
common_words = word_freq.most_common(10)
print(common_words)

# Split the pairs into two parallel tuples for plotting.
words, counts = zip(*common_words)

In [None]:
# Top-10 tokens: frequency on the x axis, the word itself on the y axis.
freq_fig, freq_ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    x=list(counts),
    y=list(words),
    marker='o',
    color='b',
    linestyle='-',
    ax=freq_ax,
)

freq_ax.set_xlabel("Frequency")
freq_ax.set_ylabel("Words")
freq_ax.set_title("Top 10 Most Common Words (After Stopword Removal)")
plt.xticks(rotation=45)  # acts on the current axes, i.e. freq_ax
freq_ax.grid(True)
plt.show()


In [None]:
from wordcloud import WordCloud

# Word cloud over the whole stop-word-free corpus.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_text)

# Render the cloud as an image without axis ticks or frame.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

def plot_word_cloud(text, title):
    """Draw a titled word cloud for ``text``.

    The cloud filters the wordcloud package's built-in STOPWORDS, shows at
    most 200 words and uses a fixed random_state of 42.

    Args:
        text: The string to build the cloud from.
        title: Caption placed above the image.
    """
    cloud_options = dict(
        background_color='white',
        stopwords=set(STOPWORDS),
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=42,
    )
    cloud = WordCloud(**cloud_options).generate(text)

    _, axis = plt.subplots(figsize=(8, 8))
    axis.imshow(cloud, interpolation='bilinear')
    axis.axis('off')
    axis.set_title(title, fontsize=20)
    plt.show()

# Distinct Stance values, in order of first appearance.
unique_stances = data['Stance'].unique()

# One word cloud per stance, built from the cleaned tweets carrying that label.
for stance in unique_stances:
    stance_mask = data['Stance'].eq(stance)
    plot_word_cloud(
        ' '.join(data.loc[stance_mask, 'cleaned_tweet']),
        f'Word Cloud for Stance: {stance}',
    )


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Documents to vectorize: the stop-word-free tweets, missing values dropped.
corpus = data['text_no_stopwords'].dropna().tolist()

# TF-IDF weights, vocabulary capped at the 1000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Raw term counts with the same vocabulary cap.
count_vectorizer = CountVectorizer(max_features=1000)
count_matrix = count_vectorizer.fit_transform(corpus)
count_df = pd.DataFrame(
    count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

print("TF-IDF Representation:")
display(tfidf_df.head())

print("\nCount Vectorization Representation:")
display(count_df.head())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in Target and Stance by integer codes.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the
# codes can be mapped back later, e.g.
# label_encoders['Stance'].inverse_transform(...) or .classes_.
# (Previously a single anonymous encoder was refitted per column and discarded.)
label_encoders = {}
for column in ['Target', 'Stance']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [None]:
# Model input is the cleaned tweet text; the target is the encoded stance.
X, y = data['cleaned_tweet'], data['Stance']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from nltk.stem import WordNetLemmatizer
from fastai.text.all import *
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

In [None]:
from sklearn.metrics import accuracy_score

# Fine-tune ULMFiT on the TRAINING split only. Building the loaders from the
# full `data` frame would let the rows of X_test take part in fine-tuning and
# inflate the test accuracy reported below. from_df still carves its own
# validation subset out of these training rows.
ulmfit_train_df = pd.DataFrame({
    'cleaned_tweet': X_train.to_numpy(),
    'Stance': y_train.to_numpy(),
})
dls = TextDataLoaders.from_df(ulmfit_train_df, text_col='cleaned_tweet', label_col='Stance', bs=32)
learn = text_classifier_learner(dls, AWD_LSTM, metrics=accuracy)
learn.fine_tune(4)

# Class probabilities for the held-out tweets (first element of get_preds).
y_pred = learn.get_preds(dl=dls.test_dl(X_test))[0]

# Keep the score under its own name: assigning it to `accuracy` would shadow
# the fastai metric passed as `metrics=accuracy` above and break a re-run of
# this cell.
ulmfit_accuracy = accuracy_score(y_test, y_pred.argmax(dim=1))
print("ULMFiT Test Accuracy:", ulmfit_accuracy)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Map words to integer ids (vocabulary capped at 5000, fitted on the training
# split only) and bring every tweet to a fixed length of 50 ids.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(sequences=tokenizer.texts_to_sequences(X_train), maxlen=50)
X_test_seq = pad_sequences(sequences=tokenizer.texts_to_sequences(X_test), maxlen=50)

# Embedding -> LSTM -> dense head, assembled layer by layer.
# NOTE(review): the single sigmoid unit with binary_crossentropy assumes Stance
# has exactly two classes encoded as 0/1 - confirm against
# data['Stance'].value_counts(). A target with more classes would need a
# softmax head with one unit per class and sparse_categorical_crossentropy,
# plus argmax decoding wherever the predictions are thresholded at 0.5.
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=128, input_length=50))
lstm_model.add(LSTM(128, return_sequences=False))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The test split doubles as the validation set, so the "test accuracy" printed
# below is the validation accuracy of the last epoch.
history = lstm_model.fit(
    X_train_seq,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_seq, y_test),
)

# history.history holds one value per epoch; [-1] picks the final epoch.
train_accuracy = history.history['accuracy'][-1]
test_accuracy = history.history['val_accuracy'][-1]

print(f"Final training accuracy: {train_accuracy:.4f}")
print(f"Final test accuracy: {test_accuracy:.4f}")


In [None]:
from tensorflow.keras.layers import Bidirectional

# Same architecture as the LSTM model, with the recurrent layer wrapped in
# Bidirectional so the sequence is read in both directions.
# NOTE(review): the sigmoid/binary_crossentropy head assumes a two-class
# Stance target encoded as 0/1 - confirm; more classes need a softmax head.
bilstm_model = Sequential()
bilstm_model.add(Embedding(input_dim=5000, output_dim=128, input_length=50))
bilstm_model.add(Bidirectional(LSTM(128, return_sequences=False)))
bilstm_model.add(Dropout(0.3))
bilstm_model.add(Dense(64, activation='relu'))
bilstm_model.add(Dropout(0.3))
bilstm_model.add(Dense(1, activation='sigmoid'))

bilstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Reuses the padded sequences prepared for the LSTM model; the test split again
# serves as the validation set.
history = bilstm_model.fit(
    X_train_seq,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_seq, y_test),
)

# Metrics of the final epoch.
train_accuracy = history.history['accuracy'][-1]
test_accuracy = history.history['val_accuracy'][-1]

print(f"Final training accuracy: {train_accuracy:.4f}")
print(f"Final test accuracy: {test_accuracy:.4f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import BartTokenizer, BartForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset

# Pretrained BART checkpoint and its tokenizer; .eval() returns the model
# itself after switching it to inference mode.
bart_model_name = "facebook/bart-large-mnli"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForSequenceClassification.from_pretrained(bart_model_name).eval()

# Pretrained RoBERTa checkpoint, loaded the same way.
roberta_model_name = "roberta-large-mnli"
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_model_name)
roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_model_name).eval()

class StanceDataset(Dataset):
    """Dataset that tokenizes one text per item, on access.

    Args:
        texts: Collection of strings exposing ``.tolist()``.
        labels: Collection of integer labels exposing ``.tolist()``.
        tokenizer: Callable applied to a single text; its result must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of 1.
        max_length: Length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Plain Python lists give positional access regardless of the
        # original container's index.
        self.texts = texts.tolist()
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the leading batch dimension; batching is left to the DataLoader.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Held-out tweets tokenized with the BART tokenizer, served in order in
# batches of 16.
test_dataset = StanceDataset(texts=X_test, labels=y_test, tokenizer=bart_tokenizer)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

def get_predictions(model, dataloader, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Run ``model`` over ``dataloader`` and collect argmax predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``.logits`` tensor with one row per example.
        dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the model and the inputs are moved to. The default is
            chosen once, when the function is defined.

    Returns:
        tuple: ``(predictions, true_labels)`` - two lists of ints in
        dataloader order.

    Side effects:
        The model is moved to ``device`` and left in eval mode.
    """
    model.to(device)
    # Guarantee inference behaviour (dropout off) even if the caller passes a
    # model that is still in training mode; otherwise predictions are noisy.
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # Labels are only collected, never fed to the model, so they do
            # not need a round trip to the device.
            true_labels.extend(batch["label"].cpu().tolist())

    return predictions, true_labels

# NOTE(review): both checkpoints are used as-is and their argmax class index
# is compared directly with the LabelEncoder code of Stance. This presumes the
# checkpoints' output class order matches the encoded stance order - verify
# against model.config.id2label before trusting these numbers.

# BART is evaluated on test_loader, whose dataset was tokenized with
# bart_tokenizer.
bart_preds, bart_actuals = get_predictions(bart_model, test_loader)

# RoBERTa gets a loader tokenized with its OWN tokenizer. Reusing test_loader
# would feed it ids produced by bart_tokenizer and leave roberta_tokenizer
# unused.
roberta_test_loader = DataLoader(
    StanceDataset(X_test, y_test, roberta_tokenizer),
    batch_size=16,
    shuffle=False,
)
roberta_preds, roberta_actuals = get_predictions(roberta_model, roberta_test_loader)

bart_accuracy = accuracy_score(bart_actuals, bart_preds)
print("============**********************************=====================")
print(f"BART Accuracy: {bart_accuracy:.4f}")

roberta_accuracy = accuracy_score(roberta_actuals, roberta_preds)
print("============**********************************=====================")
print(f"RoBERTa Accuracy: {roberta_accuracy:.4f}")


In [None]:
from sklearn.metrics import classification_report

# ULMFiT: y_pred holds class probabilities; argmax gives the predicted class.
ulmfit_y_pred = y_pred.argmax(dim=1).numpy()
print(
    "\nULMFiT Classification Report:\n",
    classification_report(y_true=y_test, y_pred=ulmfit_y_pred),
)

# Keras models emit one sigmoid probability per tweet; threshold it at 0.5.
lstm_y_pred = (lstm_model.predict(X_test_seq) > 0.5).astype("int32")
print(
    "\nLSTM Classification Report:\n",
    classification_report(y_true=y_test, y_pred=lstm_y_pred),
)

bilstm_y_pred = (bilstm_model.predict(X_test_seq) > 0.5).astype("int32")
print(
    "\nBiLSTM Classification Report:\n",
    classification_report(y_true=y_test, y_pred=bilstm_y_pred),
)

# Transformer models: labels and predictions were collected by get_predictions.
print(
    "\nBART Classification Report:\n",
    classification_report(y_true=bart_actuals, y_pred=bart_preds),
)

print(
    "\nRoBERTa Classification Report:\n",
    classification_report(y_true=roberta_actuals, y_pred=roberta_preds),
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Per-model classification reports as nested dicts.
ulmfit_report = classification_report(y_test, ulmfit_y_pred, output_dict=True)
lstm_report = classification_report(y_test, lstm_y_pred, output_dict=True)
bilstm_report = classification_report(y_test, bilstm_y_pred, output_dict=True)
bart_report = classification_report(bart_actuals, bart_preds, output_dict=True)
roberta_report = classification_report(roberta_actuals, roberta_preds, output_dict=True)

# Display name paired with its report, in table order.
model_reports = [
    ("ULMFiT", ulmfit_report),
    ("LSTM", lstm_report),
    ("BiLSTM", bilstm_report),
    ("BART", bart_report),
    ("RoBERTa", roberta_report),
]

# One column per support-weighted average metric, one row per model.
comparison_data = {
    "Model": [model_name for model_name, _report in model_reports],
    **{
        column: [report["weighted avg"][metric_key] for _name, report in model_reports]
        for column, metric_key in (
            ("Precision", "precision"),
            ("Recall", "recall"),
            ("F1-Score", "f1-score"),
        )
    },
}

comparison_df = pd.DataFrame(comparison_data)

print("\nModel Performance Comparison Table:")
print(comparison_df.to_string(index=False))

In [None]:
sns.set(style="whitegrid")

# Long format: one row per (model, metric) pair, as grouped bars require.
comparison_melted = comparison_df.melt(
    id_vars=["Model"],
    var_name="Metric",
    value_name="Score",
)

# Grouped bar chart: models on the x axis, one bar per metric.
cmp_fig, cmp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=comparison_melted,
    x="Model",
    y="Score",
    hue="Metric",
    palette="coolwarm",
    ax=cmp_ax,
)

cmp_ax.set_title("Model Comparison: Precision, Recall, and F1-Score", fontsize=14)
cmp_ax.set_xlabel("Model", fontsize=12)
cmp_ax.set_ylabel("Score", fontsize=12)
cmp_ax.set_ylim(0, 1)  # all three metrics live in [0, 1]
cmp_ax.legend(title="Metric")
plt.show()