<a href="https://colab.research.google.com/github/kalamkaar9404/InfoCrucible/blob/main/backend/main_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install azure-cosmos azure-storage-blob azure-identity



In [None]:
!pip install python-dotenv



In [None]:
# Import standard libraries
import os, io
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Azure Cosmos DB client
from azure.cosmos import CosmosClient

# Azure Blob Storage client
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient

# NLP libraries
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

# Deep learning libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Input, concatenate
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image

import torch
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Ensure NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from dotenv import dotenv_values


config    = dotenv_values("/content/.env")
COSMOS_URL = config["COSMOS_ENDPOINT"]
COSMOS_KEY = config["COSMOS_KEY"]
DATABASE_NAME = config["COSMOS_DATABASE"]
CONTAINER_NAME = config["COSMOS_CONTAINER"]

# Connect to Cosmos DB
cosmos_client = CosmosClient(COSMOS_URL, credential=COSMOS_KEY)
database = cosmos_client.get_database_client(DATABASE_NAME)
container = database.get_container_client(CONTAINER_NAME)

# Query all items (select relevant fields)
query = "SELECT c.id, c.statement, c.image, c.web, c.category, c.date, c.label FROM c"
items = list(container.query_items(
    query=query,
    enable_cross_partition_query=True
))
df = pd.DataFrame(items)
print(f"Loaded {len(df)} records from Cosmos DB.")
df.head()

Loaded 26599 records from Cosmos DB.


Unnamed: 0,id
0,0
1,1
2,3
3,4
4,5


In [None]:
query = "SELECT c.Statement, c.image_path FROM c"
items = list(container.query_items(query=query, enable_cross_partition_query=True))

In [None]:
statements = []
image_paths = []
for item in items:
    statements.append(item.get("Statement", ""))
    image_paths.append(item.get("image_path", ""))

In [None]:
#Text Preprocessing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string as st
import re
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

In [None]:
# Remove punctuations

def remove_punct(text):
    return ("".join([ch for ch in text if ch not in st.punctuation]))

In [None]:
no_punc = [ remove_punct(s) for s in statements ]

In [None]:
#convert to lower case chunks(tokens)
def tokenize(text):
    text = re.split('\s+' ,text)
    return [x.lower() for x in text]

In [None]:
tokenized = [ tokenize(s) for s in no_punc ]

In [None]:
# remove stopwords
def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]
cleaned = [ remove_stopwords(toks) for toks in tokenized ]

In [None]:
# Apply stemming to get root words
def stemming(text):
    ps = PorterStemmer()
    return [ps.stem(word) for word in text]


In [None]:
stemmed_words = [stemming(text) for text in cleaned]

In [None]:
# lemmatization
def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
lemmatized_words = [lemmatize(text) for text in stemmed_words]

In [None]:
query = "SELECT c.Label FROM c"
items = list(container.query_items(query=query, enable_cross_partition_query=True))
labels = [record['Label'] for record in items]  # assuming your query result is in `items`

# Now convert to binary:
binary_labels = [1 if label == 'real' else 0 for label in labels]

print(binary_labels)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [None]:
sentences=[return_sentences(text) for text in lemmatized_words]

In [None]:
for i, toks in enumerate(sentences):
    print(f"Statement #{i}: {toks}")

Statement #0: prais india aarogya setu app say help identifi covid19 cluster
Statement #1: delhi deputi u secretari state stephen biegun pitch pax indopacifica
Statement #2: india sign 250 document space cooper 59 countri isro chief
Statement #3: tamil nadu chief minist mother pas away 93
Statement #4: bihar assembl elect 2020 tej pratap shift mahua hasanpur 
Statement #5: hathra case cbi reach victim villag visit crime scene
Statement #6: rajasthan crime news karauli anoth elderli beaten death sikar five youth custodi
Statement #7: mumbai bmc book penalis peopl step without face mask
Statement #8: covid19 india singleday spike drop 55342 talli approach 72 lakh
Statement #9: amid stubbl burn delhi air qualiti deterior poor
Statement #10: bihar assembl elect bjp expel nine rebel contest elect nda candid
Statement #11: pm modi releas balasaheb vikh patil autobiographi
Statement #12: post offic recruit 2020 big vacanc 1371 post 10th pas check elig pay scale
Statement #13: tamil nadu covid

In [None]:
#Images Preprocessing
import pandas as pd
import matplotlib.pyplot as plt

import requests
from PIL import Image
from io import BytesIO
import os
import urllib.request

import torchvision.models as models
import torch.nn as nn
import torch
from torchvision import transforms
from PIL import ImageOps

In [None]:
model = models.efficientnet_b3(weights='EfficientNet_B3_Weights.IMAGENET1K_V1')

In [None]:
model.classifier = nn.Sequential (
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1536, out_features=1000, bias=True),
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1000, out_features=768, bias=True)
)

In [None]:
# 1. Connect to Azure Blob Storage
from azure.storage.blob import BlobServiceClient
import io
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from torchvision.transforms import InterpolationMode
from dotenv import dotenv_values
connection_string = config["STORAGE_CONN_STR"]
container_name = config["STORAGE_CONTAINER"]

# Create client to interact with the container
service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = service_client.get_container_client(container_name)

# 2. Define image preprocessing for EfficientNet-B3
preprocess = transforms.Compose([
    transforms.Resize(320, interpolation=InterpolationMode.BICUBIC),
    transforms.CenterCrop(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# 3. Load pretrained EfficientNet-B3 and replace classifier
from torchvision.models import EfficientNet_B3_Weights

model = models.efficientnet_b3(weights=EfficientNet_B3_Weights.IMAGENET1K_V1)
model.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1536, out_features=1000, bias=True),
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1000, out_features=768, bias=True)
)
model.eval()

# Move model to GPU if available for speed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 4. Iterate blobs, download images, extract features
features_dict = {}
for blob_props in container_client.list_blobs():
    blob_name = blob_props.name
    try:
        # Download blob into memory
        blob_client = container_client.get_blob_client(blob_name)
        download_stream = blob_client.download_blob()
        image_data = download_stream.readall()

        # Open image and preprocess
        img = Image.open(io.BytesIO(image_data)).convert('RGB')
        input_tensor = preprocess(img).unsqueeze(0).to(device)  # shape: [1, 3, 300, 300]

        # Extract features (no gradients for efficiency)
        with torch.no_grad():
            output = model(input_tensor)
        feature_vector = output.squeeze(0).cpu().numpy()  # shape: [768]

        # Store the feature vector in the dictionary
        features_dict[blob_name] = feature_vector

    except Exception as e:
        # If the blob isn't an image or an error occurs, skip it
        print(f"Skipping blob {blob_name}: {e}")
        continue

# 'features_dict' now maps each image blob name to a 768-dim NumPy feature vector
print(f"Extracted features for {len(features_dict)} images.")



In [None]:
lemmatized_words

[['prais',
  'india',
  'aarogya',
  'setu',
  'app',
  'say',
  'help',
  'identifi',
  'covid19',
  'cluster'],
 ['delhi',
  'deputi',
  'u',
  'secretari',
  'state',
  'stephen',
  'biegun',
  'pitch',
  'pax',
  'indopacifica'],
 ['india',
  'sign',
  '250',
  'document',
  'space',
  'cooper',
  '59',
  'countri',
  'isro',
  'chief'],
 ['tamil', 'nadu', 'chief', 'minist', 'mother', 'pas', 'away', '93'],
 ['bihar',
  'assembl',
  'elect',
  '2020',
  'tej',
  'pratap',
  'shift',
  'mahua',
  'hasanpur',
  ''],
 ['hathra',
  'case',
  'cbi',
  'reach',
  'victim',
  'villag',
  'visit',
  'crime',
  'scene'],
 ['rajasthan',
  'crime',
  'news',
  'karauli',
  'anoth',
  'elderli',
  'beaten',
  'death',
  'sikar',
  'five',
  'youth',
  'custodi'],
 ['mumbai',
  'bmc',
  'book',
  'penalis',
  'peopl',
  'step',
  'without',
  'face',
  'mask'],
 ['covid19',
  'india',
  'singleday',
  'spike',
  'drop',
  '55342',
  'talli',
  'approach',
  '72',
  'lakh'],
 ['amid', 'stubbl', '

In [None]:
import pandas as pd

# 1. Load your CSV
df = pd.read_csv("/content/IFND.csv",encoding="latin1")

# 2. Normalize the label strings (just in case of extra whitespace/case)
df["Label"] = df["Label"].str.strip().str.upper()

# 3. Replace with binary
#    Here TRUE → 1, FAKE → 0
df["Label"] = df["Label"].replace({"TRUE": 1, "Fake": 0})


# 5. Verify
print(df["Label"].value_counts())


Label
1       37800
FAKE    18914
Name: count, dtype: int64


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import AdamW
from transformers import DistilBertTokenizerFast, DistilBertModel
import gc

# 1. Prepare your data
# `lemmatized_words` and `labels` should already be defined
documents = [" ".join(tokens) for tokens in lemmatized_words]

# 2. TF-IDF & CountVectorizer with limited vocab to save memory
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
tfidf_feats = tfidf_vectorizer.fit_transform(documents)

count_vectorizer = CountVectorizer(max_features=3000)
count_feats = count_vectorizer.fit_transform(documents)

# 3. LDA with fewer topics
nlda = LatentDirichletAllocation(n_components=10, random_state=42)
lda_feats = nlda.fit_transform(count_feats)

# 4. Combine features sparsely
numeric_feats = hstack([tfidf_feats, lda_feats])  # scipy sparse matrix
numeric_dim = numeric_feats.shape[1]

# 5. Train-test split
X_train_text, X_val_text, X_train_num, X_val_num, y_train, y_val = train_test_split(
    documents, numeric_feats, labels, test_size=0.2, random_state=42
)

# 6. Tokenizer & Dataset
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class MultimodalDataset(Dataset):
    def __init__(self, texts, numeric_matrix, labels, tokenizer, max_length=128):
        self.texts = texts
        self.numeric = numeric_matrix
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # convert sparse row -> dense
        num = self.numeric[idx]
        if hasattr(num, 'toarray'):
            num = num.toarray().squeeze(0)
        numeric_tensor = torch.tensor(num, dtype=torch.float32)

        return {
            'input_ids': enc.input_ids.squeeze(0),
            'attention_mask': enc.attention_mask.squeeze(0),
            'numeric_feats': numeric_tensor,
            'label': torch.tensor(label2id[self.labels[idx]], dtype=torch.long)

        }

train_dataset = MultimodalDataset(X_train_text, X_train_num, y_train, tokenizer)
val_dataset   = MultimodalDataset(X_val_text,   X_val_num,   y_val,   tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=8)

# 7. Model definition
class MultimodalClassifier(nn.Module):
    def __init__(self, numeric_dim, n_classes=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size + numeric_dim, n_classes)

    def forward(self, input_ids, attention_mask, numeric_feats):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]
        combined = torch.cat([cls_emb, numeric_feats], dim=1)
        x = self.dropout(combined)
        return self.classifier(x)

# 8. Training loop with cache clearing
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalClassifier(numeric_dim=numeric_dim).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numeric = batch['numeric_feats'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numeric)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # free memory
        gc.collect()
        if torch.cuda.is_available(): torch.cuda.empty_cache()

    print(f"Epoch {epoch+1}/{epochs} - Train loss: {total_loss/len(train_loader):.4f}")

    # validation\ nmodel.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numeric = batch['numeric_feats'].to(device)
            labels = batch['label'].to(device)

            preds = model(input_ids, attention_mask, numeric).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    print(f"Validation acc: {correct/total:.4f}")

print("Training complete.")


In [None]:
# Merge image features into dataframe (for statements with images)
df_augmented['img_feat'] = df_augmented['image'].map(lambda fname: img_features.get(fname))
# Fill missing image features with zeros (for simplicity)
no_image_vec = np.zeros(resnet.output_shape[-1])
df_augmented['img_feat'] = df_augmented['img_feat'].apply(lambda x: x if isinstance(x, np.ndarray) else no_image_vec)

# Feature matrices
X_text_tfidf = X_tfidf  # (n_samples, n_features)
X_text_emb = X_emb     # (n_samples, emb_dim)
X_image = np.stack(df_augmented['img_feat'].values)
y = (df_augmented['label'] == 'fake').astype(int).values  # binary labels

# Combine TF-IDF and image for one set, embeddings and image for another, etc.
X_tfidf_img = np.hstack([X_text_tfidf, X_image])
X_emb_img = np.hstack([X_text_emb, X_image])

# Train/test split
X_tfidf_img_train, X_tfidf_img_test, y_train, y_test = train_test_split(X_tfidf_img, y, test_size=0.2, random_state=42)
X_emb_img_train, X_emb_img_test, _, _ = train_test_split(X_emb_img, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Logistic Regression on TF-IDF alone
lr = LogisticRegression(max_iter=1000)
lr.fit(X_text_tfidf[y_train.index], y_train)   # text only (if we separated earlier)
y_pred_lr = lr.predict(X_text_tfidf[y_test.index])
print("Logistic Regression (text only):", accuracy_score(y_test, y_pred_lr))

# Decision Tree on TF-IDF + image
dt = DecisionTreeClassifier()
dt.fit(X_tfidf_img_train, y_train)
y_pred_dt = dt.predict(X_tfidf_img_test)
print("Decision Tree (TF-IDF+image):", accuracy_score(y_test, y_pred_dt))

# Naive Bayes on TF-IDF
nb = MultinomialNB()
nb.fit(X_tfidf_img_train, y_train)  # Note: MultinomialNB expects non-negative features
y_pred_nb = nb.predict(X_tfidf_img_test)
print("Naive Bayes (TF-IDF+image):", accuracy_score(y_test, y_pred_nb))


In [None]:
# Prepare text sequences for LSTM
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df_augmented['clean_statement'])
seqs = tokenizer.texts_to_sequences(df_augmented['clean_statement'])
seqs_padded = pad_sequences(seqs, maxlen=100)
X_seq = seqs_padded
X_seq_train, X_seq_test, _, _ = train_test_split(X_seq, y, test_size=0.2, random_state=42)

# LSTM model
input_layer = Input(shape=(100,))
x = Embedding(input_dim=10000, output_dim=128)(input_layer)
x = LSTM(64)(x)
x = Dropout(0.5)(x)
output_layer = Dense(1, activation='sigmoid')(x)
lstm_model = Model(input_layer, output_layer)
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_seq_train, y_train, epochs=2, batch_size=32, validation_split=0.1)  # brief training
y_pred_lstm = (lstm_model.predict(X_seq_test) > 0.5).astype(int)
print("LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm))


In [None]:
# Bi-LSTM model
input_layer = Input(shape=(100,))
x = Embedding(input_dim=10000, output_dim=128)(input_layer)
x = Bidirectional(LSTM(64))(x)
x = Dropout(0.5)(x)
output_layer = Dense(1, activation='sigmoid')(x)
bilstm_model = Model(input_layer, output_layer)
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_seq_train, y_train, epochs=2, batch_size=32, validation_split=0.1)
y_pred_bilstm = (bilstm_model.predict(X_seq_test) > 0.5).astype(int)
print("Bi-LSTM Accuracy:", accuracy_score(y_test, y_pred_bilstm))


In [None]:
# DistilBERT fine-tuning (using Trainer API)
tokenizer_distil = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
inputs = tokenizer_distil(list(df_augmented['clean_statement']), padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(y)
dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
train_size = int(0.8 * len(dataset))
train_ds, test_ds = torch.utils.data.random_split(dataset, [train_size, len(dataset)-train_size])

model_distil = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, per_device_train_batch_size=8)
trainer = Trainer(model=model_distil, args=training_args, train_dataset=train_ds)
trainer.train()
preds = trainer.predict(test_ds)
y_pred_distil = np.argmax(preds.predictions, axis=1)
print("DistilBERT Accuracy:", accuracy_score(y_test, y_pred_distil))


In [None]:
# Example: Use pre-trained VGG16 for image classification
from tensorflow.keras.applications.vgg16 import VGG16
vgg = VGG16(weights='imagenet', include_top=False, pooling='avg')
# Extract features for images and train a small neural network classifier

In [None]:
from tensorflow.keras.layers import Concatenate

# Example MLP on combined text embedding + image feature
text_input = Input(shape=(X_emb.shape[1],))
img_input = Input(shape=(X_image.shape[1],))
merged = Concatenate()([text_input, img_input])
x = Dense(256, activation='relu')(merged)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)
mlp = Model([text_input, img_input], output)
mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Prepare inputs for fusion model
X_comb_train = [X_text_emb[y_train.index], X_image[y_train.index]]
X_comb_test = [X_text_emb[y_test.index], X_image[y_test.index]]
mlp.fit(X_comb_train, y_train, epochs=2, batch_size=32, validation_split=0.1)
y_pred_mlp = (mlp.predict(X_comb_test) > 0.5).astype(int)
print("Fusion MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))


In [None]:
from sklearn.metrics import classification_report

print("Classification Report for Fusion MLP:")
print(classification_report(y_test, y_pred_mlp))
cm = confusion_matrix(y_test, y_pred_mlp)
print("Confusion Matrix:\n", cm)


In [None]:
# (Pseudo-code) Insert documents into Azure Cosmos Vector DB
from azure.cosmos import CosmosClient
from azure.cosmos.partition_key import PartitionKey

# Create a container with vector indexing enabled (requires Azure setup)
vector_container = database.get_container_client("vector_store")
for idx, row in df_augmented.iterrows():
    embedding = st_model.encode(row['clean_statement']).tolist()
    vector_container.upsert_item({
        'id': str(row['id']),
        'statement': row['clean_statement'],
        'label': row['label'],
        'embedding': embedding  # stored as a vector type
    })

# (Pseudo-code) RAG Query: Given a statement, retrieve similar docs and call LLM
def rag_fact_check(query_text):
    query_emb = st_model.encode([query_text]).tolist()[0]
    sql_query = {
        'query': "SELECT TOP 3 c.statement FROM c ORDER BY ST_DISTANCE(c.embedding, @q) ASC",
        'parameters': [{'name': '@q', 'value': query_emb}]
    }
    docs = list(vector_container.query_items(query=sql_query, enable_cross_partition_query=True))
    context = "\n\n".join([d['statement'] for d in docs])
    # Use OpenAI Completion with context
    from openai import OpenAI
    client = OpenAI(api_key="<YOUR_OPENAI_KEY>")
    response = client.chat.completions.create(
        model="gpt-4o-preview",
        messages=[
            {"role":"system", "content": "Check the statement using the context."},
            {"role":"user", "content": f"Statement: {query_text}\nContext:{context}"}
        ]
    )
    return response.choices[0].message.content
