<a href="https://colab.research.google.com/github/imargipatel/Explainable-Misinformation-Detector/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation and Setup


In [None]:
# Core NLP libraries
!pip install transformers torch scikit-learn scipy
!pip install spacy textblob
!python -m spacy download en_core_web_sm

In [None]:
# Data handling
!pip install pandas numpy -q
!pip install nltk -q

In [None]:
# Visualization (optional, for later)
!pip install matplotlib seaborn plotly -q

In [None]:
# Install requests (HTTP client); the FEVER dataset itself is loaded further down via `datasets`
!pip install requests

In [None]:
#Standard libraries
import json
import re
from collections import Counter
import numpy as np
import pandas as pd

In [None]:
# NLP libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk.corpus
import spacy
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger_eng', quiet=True)


In [None]:
# Machine Learning
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download NLTK data
# Each resource is fetched quietly; the order matches the original cell.
for _resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'universal_tagset',
    'stopwords',
    'wordnet',
    'punkt_tab',  # Added to fix the LookupError
):
    nltk.download(_resource, quiet=True)

In [None]:
# Load the small English spaCy pipeline that the install cell downloads
# (`python -m spacy download en_core_web_sm`).
# NOTE(review): `nlp` is not referenced again in the visible part of this
# notebook — confirm it is still needed before keeping this load.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Global plotting defaults: seaborn's white grid theme, then a wide default
# figure size and base font size for every matplotlib figure that follows.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 10,
})

In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
has_gpu = torch.cuda.is_available()
print(f"GPU Available: {has_gpu}")
gpu_name = torch.cuda.get_device_name(0) if has_gpu else 'N/A'
print(f"GPU Name: {gpu_name}")

Loading FEVER Dataset

In [None]:
!pip install datasets==2.19.0

In [None]:
from datasets import load_dataset
# Load the "v1.0" configuration of the "fever" dataset.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to execute locally — presumably required for this script-based dataset with
# the pinned datasets==2.19.0; confirm before changing the `datasets` version.
fever = load_dataset("fever", "v1.0", trust_remote_code=True)
# Printing the returned object shows what was loaded; the next cell indexes it
# by 'train', 'paper_dev' and 'paper_test'.
print(fever)



In [None]:
# Pull out the three splits used in the rest of the notebook:
# training data plus the "paper" dev and test splits.
train_data, dev_data, test_data = (
    fever[split_name] for split_name in ('train', 'paper_dev', 'paper_test')
)

In [None]:
#Convert to pandas for easier handling
import pandas as pd
# One DataFrame per split, built in the same order the splits were extracted.
train_df, dev_df, test_df = (
    pd.DataFrame(split) for split in (train_data, dev_data, test_data)
)

# Quick sanity report: row counts per split and the column layout.
n_train, n_dev, n_test = len(train_df), len(dev_df), len(test_df)
print(f"Training: {n_train} | Dev: {n_dev} | Test: {n_test}")
print(f"Columns: {train_df.columns.tolist()}")

In [None]:
# Per-split claim counts followed by the overall total.
n_train, n_dev, n_test = (len(frame) for frame in (train_df, dev_df, test_df))
print(f"Training set: {n_train} claims")
print(f"Dev set: {n_dev} claims")
print(f"Test set: {n_test} claims")
print(f"Total: {n_train + n_dev + n_test} claims")

In [None]:
# Show the column names and the first training row to illustrate the record layout.
print(f"\n Dataset columns: {train_df.columns.tolist()}")
# The Series repr spans several lines, so it starts on its own line to keep
# the first field aligned with the rest.
print(f"\n Sample row structure:\n{train_df.iloc[0]}")


Data Cleaning and Preprocessing

In [None]:
def clean_text(text):
    """Normalise a raw claim string.

    Steps, in order: lowercase; remove URL-like tokens (anything starting with
    ``http`` or ``www`` up to the next whitespace); drop every character that
    is not a word character, whitespace, or one of ``. ! ? - ' " ;``; collapse
    runs of ``.``, ``!`` or ``?`` to a single character; squeeze all whitespace
    to single spaces and trim the ends.

    Args:
        text: Raw text. Missing values (``None``, ``NaN``, ``pd.NA``) produce
            an empty string; other non-string scalars are converted with
            ``str`` before cleaning.

    Returns:
        The cleaned, lowercase, single-spaced string.
    """
    if pd.isna(text):
        return ""

    # Non-string scalars (e.g. numbers) would otherwise fail on .lower().
    text = str(text).lower()

    # Remove URLs. `http\S+` already covers https links.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters but keep the punctuation listed in the
    # docstring. Commas, colons and brackets are dropped.
    text = re.sub(r'[^\w\s.!?\-\'";]', '', text)

    # Collapse repeated sentence punctuation ("!!!" -> "!", "..." -> ".").
    text = re.sub(r'\.{2,}|!{2,}|\?{2,}', lambda m: m.group(0)[0], text)

    # split()/join squeezes every whitespace run and trims both ends, so this
    # single pass also cleans up the gaps left by the removals above.
    return ' '.join(text.split())

In [None]:
from logging import critical
def preprocess_dataframe(df, name="Dataset"):
    """Clean one dataset split and attach simple length features.

    Operates on a copy of ``df``. Rows with a missing claim (or label, when a
    ``label`` column exists) are dropped, repeated claims keep only their first
    row, claims are normalised with ``clean_text`` into ``claim_clean``, rows
    whose cleaned claim is empty are removed, and three columns are added:
    ``claim_length`` (whitespace-separated words), ``claim_char_length``
    (characters) and ``num_sentences`` (``sent_tokenize`` count). A short
    report is printed along the way.

    Args:
        df: DataFrame with a ``claim`` column and, optionally, ``label``.
        name: Label used in the printed report.

    Returns:
        The processed copy of ``df``.
    """
    frame = df.copy()

    print(f"\nProcessing {name}:")
    rows_before = len(frame)

    # A row is unusable without a claim (and without a label, if labels exist).
    required_cols = ['claim', 'label'] if 'label' in frame.columns else ['claim']
    frame = frame.dropna(subset=required_cols)
    n_missing = rows_before - len(frame)
    if n_missing > 0:
        print(f"  • Dropped {n_missing} rows with missing values")

    # Count repeats before dropping them so the report reflects what was removed.
    n_repeats = frame.duplicated(subset=['claim']).sum()
    frame = frame.drop_duplicates(subset=['claim'], keep='first')
    if n_repeats > 0:
        print(f"  • Removed {n_repeats} duplicate claims")

    # Normalised text lives in its own column; the raw claim is kept.
    frame['claim_clean'] = frame['claim'].apply(clean_text)

    # Cleaning can reduce a claim to nothing; such rows are discarded.
    cleaned_len = frame['claim_clean'].str.len()
    n_empty = (cleaned_len == 0).sum()
    frame = frame[cleaned_len > 0]
    if n_empty > 0:
        print(f"  • Removed {n_empty} empty claims after cleaning")

    # Basic size features derived from the cleaned text.
    frame['claim_length'] = frame['claim_clean'].str.split().str.len()
    frame['claim_char_length'] = frame['claim_clean'].str.len()
    frame['num_sentences'] = frame['claim_clean'].apply(lambda claim: len(sent_tokenize(claim)))

    word_counts = frame['claim_length']
    print(f"  Final size: {len(frame)} claims")
    print(f"  Claim length - Mean: {word_counts.mean():.2f}, Median: {word_counts.median():.0f}")

    return frame

In [None]:
# Run the same preprocessing over every split, in train / dev / test order.
train_df, dev_df, test_df = (
    preprocess_dataframe(frame, title)
    for frame, title in (
        (train_df, "Train Set"),
        (dev_df, "Dev Set"),
        (test_df, "Test Set"),
    )
)

Exploratory Data Analysis (EDA)

In [None]:
# Label distribution
# Display names for the labels. Integer ids and the string labels that later
# cells filter on ('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO') are both mapped,
# so the lookup succeeds whichever encoding the `label` column uses. The map is
# built once instead of on every inner-loop iteration.
label_names = {
    0: "SUPPORTED",
    1: "REFUTED",
    2: "NOT_ENOUGH_INFO",
    "SUPPORTS": "SUPPORTED",
    "REFUTES": "REFUTED",
    "NOT ENOUGH INFO": "NOT_ENOUGH_INFO",
}

print("\nLABEL DISTRIBUTION")
for name, df in [("Train", train_df), ("Dev", dev_df)]:
    print(f"\n{name} Set:")
    label_dist = df['label'].value_counts().sort_index()
    total = len(df)
    for label, count in label_dist.items():
        pct = (count / total) * 100
        # Unknown labels fall back to their raw value.
        print(f"  • {label_names.get(label, label)}: {count:,} ({pct:.1f}%)")

In [None]:
# Claim statistics
print("\nCLAIM STATISTICS")
for name, df in [("Train", train_df), ("Dev", dev_df), ("Test", test_df)]:
    print(f"\n{name} Set:")
    print(f"  • Claim length (words):")
    print(f"    - Mean: {df['claim_length'].mean():.2f}")
    print(f"    - Median: {df['claim_length'].median():.0f}")
    print(f"    - Std: {df['claim_length'].std():.2f}")
    print(f"    - Min-Max: {df['claim_length'].min():.0f}-{df['claim_length'].max():.0f}")
    print(f"  Number of sentences - Mean: {df['num_sentences'].mean():.2f}")

In [None]:

# Missing values check
# For each split, list only the columns that still contain nulls.
print("\n MISSING VALUES CHECK ")
for split_name, split_df in (("Train", train_df), ("Dev", dev_df), ("Test", test_df)):
    print(f"\n{split_name} Set:")
    null_counts = split_df.isnull().sum()
    with_nulls = null_counts[null_counts > 0]
    if with_nulls.empty:
        print("No missing values")
        continue
    for col, count in with_nulls.items():
        print(f"  • {col}: {count} ({count/len(split_df)*100:.2f}%)")

In [None]:
# Sample claims
# Print the first raw training claim for each label, in order of first appearance.
print("\nSample Claims by Label:")
for label in train_df['label'].unique():
    first_claim = train_df.loc[train_df['label'] == label, 'claim'].iloc[0]
    print(f"Label {label}: {first_claim}")

Visualization


In [None]:
#Label Distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
label_names = {0: "SUPPORTED", 1: "REFUTED", 2: "NOT_ENOUGH_INFO"}

# Colours are looked up per label rather than assigned by bar position.
# After sort_index() string labels come out alphabetically (NOT ENOUGH INFO,
# REFUTES, SUPPORTS), so a positional colour list would paint the wrong classes.
# Both integer ids and string labels are covered.
label_colors = {
    0: '#2ecc71', 'SUPPORTS': '#2ecc71',
    1: '#e74c3c', 'REFUTES': '#e74c3c',
    2: '#95a5a6', 'NOT ENOUGH INFO': '#95a5a6',
}
fallback_color = '#95a5a6'

for idx, (name, df) in enumerate([("Train", train_df), ("Dev", dev_df)]):
    counts = df['label'].value_counts().sort_index()
    # Readable tick text: mapped name for integer ids, raw value otherwise.
    tick_labels = [str(label_names.get(label, label)) for label in counts.index]
    bar_colors = [label_colors.get(label, fallback_color) for label in counts.index]
    axes[idx].bar(tick_labels, counts.values, color=bar_colors)
    axes[idx].set_title(f'{name} - Label Distribution')
    axes[idx].set_xlabel('Label')
    axes[idx].set_ylabel('Count')

plt.tight_layout()
plt.savefig('01_labels.png', dpi=150)
plt.show()

In [None]:
#Claim Length Distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (name, df) in zip(axes, [("Train", train_df), ("Dev", dev_df)]):
    # `claim_length` is already produced by preprocess_dataframe, so it is read
    # here rather than recomputed; a plotting cell should not mutate the
    # shared DataFrames.
    lengths = df['claim_length']
    mean_length = lengths.mean()
    ax.hist(lengths, bins=30, color='#3498db', edgecolor='black')
    ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
    ax.set_title(f'{name} - Claim Length')
    ax.set_xlabel('Words')
    ax.set_ylabel('Count')
    ax.legend()

plt.tight_layout()
plt.savefig('02_length.png', dpi=150)
plt.show()

Extracted Linguistic Features

In [None]:
from nltk.tbl import feature
from textblob.en import polarity
def extract_features(text):
  """Compute simple linguistic features for one claim.

  Args:
    text: Claim text. ``None``, ``NaN``, ``pd.NA`` and the empty string are
      all treated as "no text". Anything else is converted with ``str``.

  Returns:
    dict with the keys ``num_words`` (number of tokens returned by
    ``word_tokenize``), ``num_sentences`` (``sent_tokenize`` count),
    ``num_adj`` / ``num_adv`` (tokens tagged ADJ / ADV in the universal
    tagset), ``num_modal`` (tokens found in a fixed list of modal verbs),
    ``polarity`` and ``subjectivity`` (TextBlob sentiment). Every value is 0
    when there is no text.
  """
  feature_names = ['num_words', 'num_sentences', 'num_adj', 'num_adv',
                   'num_modal', 'polarity', 'subjectivity']

  # `text is pd.NA` has to be tested before `not text`: truth-testing pd.NA
  # raises TypeError, which previously made the function crash on that value.
  if text is None or text is pd.NA or not text or pd.isna(text):
    return {f: 0 for f in feature_names}

  text = str(text)
  words = word_tokenize(text)
  pos_tags = nltk.pos_tag(words, tagset='universal')

  #Count Features
  num_words = len(words)
  num_sentences = len(sent_tokenize(text))
  num_adj = sum(1 for _, tag in pos_tags if tag == 'ADJ')
  num_adv = sum(1 for _, tag in pos_tags if tag == 'ADV')

  #Modal Verbs
  modals = {'can', 'could', 'may', 'might', 'must', 'should', 'will', 'would'}
  num_modal = sum(1 for word in words if word.lower() in modals)

  #Sentiment
  # Kept in a local named `sentiment` so that no local shadows the
  # module-level name `polarity` imported above this function.
  sentiment = TextBlob(text).sentiment

  return {
      'num_words': num_words,
      'num_sentences': num_sentences,
      'num_adj': num_adj,
      'num_adv': num_adv,
      'num_modal': num_modal,
      'polarity': sentiment.polarity,
      'subjectivity': sentiment.subjectivity
  }

In [None]:
# Extract features from a sample
print("Extracting features from sample:")

# Only the first `sample_size` training rows are processed.
sample_size = min(2000, len(train_df))
sample_rows = train_df.iloc[:sample_size]

# One feature dict per claim, with the claim's label appended as the last key.
features_list = [
    {**extract_features(claim_text), 'label': claim_label}
    for claim_text, claim_label in zip(sample_rows['claim_clean'], sample_rows['label'])
]

features_df = pd.DataFrame(features_list)
print(f"Features extracted from {len(features_df)} claims")
print(f"\n Features Statistics:")
print(features_df.describe())

These bar charts will display the average value of each linguistic feature for each claim label (SUPPORTS, REFUTES, NOT ENOUGH INFO), along with error bars to show the variability around that average.

This provides a very clear and intuitive way to compare how these features differ across the claim types.

Comparing Features by Label: These plots clearly show the average value of each feature (like num_words, polarity, subjectivity, etc.) for 'SUPPORTS', 'REFUTES', and 'NOT ENOUGH INFO' claims.

Ease of Interpretation: This format makes it very straightforward to see at a glance if one label typically has, for example, more words, higher subjectivity, or a different sentiment than another. The error bars (showing standard deviation) give you an idea of how much variation there is around that average.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Label text shown on the x-axis. The mapping is currently an identity over the
# three label strings; any other label value maps to NaN.
plot_label_names = {"SUPPORTS": "SUPPORTS", "REFUTES": "REFUTES", "NOT ENOUGH INFO": "NOT ENOUGH INFO"}
features_df['label_display'] = features_df['label'].map(plot_label_names)

features_to_plot = ['num_words', 'num_sentences', 'num_adj', 'num_adv', 'num_modal', 'polarity', 'subjectivity']

# 2x4 grid flattened into a simple list of axes, one per feature.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

# zip() pairs each feature with an axis and stops at the shorter sequence.
for ax, feature in zip(axes, features_to_plot):
    pretty_name = feature.replace("_", " ").title()
    sns.barplot(
        x='label_display', y=feature, data=features_df, ax=ax,
        palette='viridis', errorbar='sd', capsize=0.1,  # error bars show one standard deviation
    )
    ax.set_title(f'Mean {pretty_name} by Label', fontsize=12)
    ax.set_xlabel('Claim Label', fontsize=10)
    ax.set_ylabel(f'Mean {pretty_name}', fontsize=10)
    ax.tick_params(axis='x', rotation=30)
    ax.grid(axis='y', alpha=0.3)

# Remove the grid cells that did not receive a feature.
for spare_ax in axes[len(features_to_plot):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.suptitle('Average Linguistic Features Across Claim Labels', y=1.02, fontsize=16, fontweight='bold')
plt.savefig('04_mean_features_bar_chart.png', dpi=150, bbox_inches='tight')
plt.show()

print("Saved: 04_mean_features_bar_chart.png")

Vocabulary Analysis

In [None]:
# Lower-cased, purely alphabetic tokens from every cleaned training claim.
all_words = [
    token.lower()
    for claim in train_df['claim_clean']
    for token in word_tokenize(claim)
    if token.isalpha()
]

word_counts = Counter(all_words)
total_tokens = len(all_words)
vocab_size = len(word_counts)
print(f"Total words: {total_tokens:,}")
print(f"Unique words: {vocab_size:,}")
# Type-token ratio = distinct words / total words.
print(f"Type-Token Ratio: {vocab_size/total_tokens:.4f}")
print(f"\nTop 15 words:")
for word, count in word_counts.most_common(15):
  print(f"{word}: {count:,}")



In [None]:
#Visualize top words
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Train counts come from `word_counts`; dev counts are built here with the
# same token filter (alphabetic tokens, lower-cased).
all_words_dev = [w.lower() for claim in dev_df['claim_clean'] for w in word_tokenize(claim) if w.isalpha()]
word_counts_dev = Counter(all_words_dev)

top_words_train = dict(word_counts.most_common(20))
top_words_dev = dict(word_counts_dev.most_common(20))

# One horizontal bar panel per split; the y-axis is inverted so the most
# frequent word is drawn at the top.
panels = (
    (axes[0], top_words_train, 'Top 20 Words - Train', '#e74c3c'),
    (axes[1], top_words_dev, 'Top 20 Words - Dev', '#3498db'),
)
for ax, top_words, title, bar_color in panels:
    ax.barh(list(top_words.keys()), list(top_words.values()), color=bar_color)
    ax.set_title(title)
    ax.invert_yaxis()

plt.tight_layout()
plt.savefig('04_vocabulary.png', dpi=150)
plt.show()

Statistical Analysis (SUPPORTED vs REFUTED)

In [None]:
from scipy import stats

features_to_test = ['num_adj', 'num_adv', 'num_modal', 'polarity', 'subjectivity']

# Compare SUPPORTS and REFUTES claims on each feature with Welch's t-test
# (equal_var=False, i.e. no equal-variance assumption) and collect one summary
# row per feature.
results = []
for feature in features_to_test:
  supported = features_df.loc[features_df['label'] == 'SUPPORTS', feature]
  refuted = features_df.loc[features_df['label'] == 'REFUTES', feature]

  # The test needs observations in both groups.
  if supported.empty or refuted.empty:
    print(f"\n{feature}: Not enough data for comparison (one or both labels are empty).")
    continue

  supported_mean = supported.mean()
  refuted_mean = refuted.mean()
  t_stat, p_value = stats.ttest_ind(supported, refuted, equal_var=False)

  # Relative change of REFUTED versus the SUPPORTED baseline. Dividing by the
  # magnitude of the baseline keeps the sign equal to the sign of
  # (refuted - supported) even when the baseline mean is negative, which can
  # happen for signed features such as polarity. A zero baseline reports 0.
  if supported_mean != 0:
    diff_pct = (refuted_mean - supported_mean) / abs(supported_mean) * 100
  else:
    diff_pct = 0
  is_significant = p_value < 0.05

  print(f"\n{feature}")
  print(f" SUPPORTED: {supported_mean:.3f}")
  print(f" REFUTED: {refuted_mean:.3f}")
  print(f" Difference: {diff_pct:+.1f}%")
  print(f" p-value: {p_value:.2e} {'Significant' if is_significant else 'Not Significant'}")

  results.append({
      'Feature': feature,
      'SUPPORTED MEAN': supported_mean,
      'REFUTED MEAN': refuted_mean,
      'DIFFERENCE': diff_pct,
      'p-value': p_value,
      'Significant': 'Yes' if is_significant else 'No'
  })

stats_df = pd.DataFrame(results)

In [None]:
# Write the processed splits and the derived tables as CSV files in the
# working directory, without the index column.
csv_outputs = {
    'train_processed.csv': train_df,
    'dev_processed.csv': dev_df,
    'test_processed.csv': test_df,
    'linguistic_features.csv': features_df,
    'linguistic_stats.csv': stats_df,
}
for csv_path, table in csv_outputs.items():
    table.to_csv(csv_path, index=False)

In [None]:
!pip install nbstripout



In [None]:
!nbstripout NLP_Project.ipynb
