<a href="https://colab.research.google.com/github/jahnavirishikesh/contradiction_detection/blob/main/contradiction_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!python --version

Python 3.10.12


In [30]:
!pip install pandas numpy tensorflow keras



In [37]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout, GlobalMaxPooling1D, Dot
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Bidirectional
import re

# Setting random seed for reproducibility
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Load dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Prepare training and testing data
X_train = train_data[['SENTENCE A', 'SENTENCE B']]
y_train = train_data['label']
X_test = test_data[['Question', 'Answer 1', 'Answer 2']]
y_test = test_data['label']

# Handle missing values
X_test = X_test.fillna('')

# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Clean the text data
X_train['SENTENCE A'] = X_train['SENTENCE A'].apply(clean_text)
X_train['SENTENCE B'] = X_train['SENTENCE B'].apply(clean_text)
X_test['Question'] = X_test['Question'].apply(clean_text)
X_test['Answer 1'] = X_test['Answer 1'].apply(clean_text)

# Tokenizing and padding data
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['SENTENCE A'].tolist() + X_train['SENTENCE B'].tolist())

X_train_seq_A = tokenizer.texts_to_sequences(X_train['SENTENCE A'])
X_train_seq_B = tokenizer.texts_to_sequences(X_train['SENTENCE B'])
X_test_seq_A = tokenizer.texts_to_sequences(X_test['Question'])
X_test_seq_B = tokenizer.texts_to_sequences(X_test['Answer 1'])

X_train_padded_A = pad_sequences(X_train_seq_A, maxlen=max_sequence_length)
X_train_padded_B = pad_sequences(X_train_seq_B, maxlen=max_sequence_length)
X_test_padded_A = pad_sequences(X_test_seq_A, maxlen=max_sequence_length)
X_test_padded_B = pad_sequences(X_test_seq_B, maxlen=max_sequence_length)

# Defining model architecture
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

input_A = Input(shape=(max_sequence_length,))
input_B = Input(shape=(max_sequence_length,))

# Attention mechanism function
def attention_mechanism(inputs):
    attention_scores = Dense(max_sequence_length, activation='softmax')(inputs)
    context_vector = Dot(axes=[1, 1])([attention_scores, inputs])
    return context_vector

# Embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

embedded_A = embedding_layer(input_A)
embedded_B = embedding_layer(input_B)

# Bidirectional LSTM layer
lstm_output_A = Bidirectional(LSTM(128, return_sequences=True))(embedded_A)
lstm_output_B = Bidirectional(LSTM(128, return_sequences=True))(embedded_B)

# Applying attention mechanism to LSTM outputs
attention_A = attention_mechanism(lstm_output_A)
attention_B = attention_mechanism(lstm_output_B)

# Global max pooling for feature extraction
pooled_A = GlobalMaxPooling1D()(attention_A)
pooled_B = GlobalMaxPooling1D()(attention_B)

# Adding dropout layers to prevent overfitting
pooled_A = Dropout(0.5)(pooled_A)
pooled_B = Dropout(0.5)(pooled_B)

# Concatenating pooled outputs from both sentences
concatenated_output = concatenate([pooled_A, pooled_B], axis=-1)

# Adding Dropout after concatenation for regularization
concatenated_output = Dropout(0.5)(concatenated_output)

# Dense layers for feature extraction and dropout for regularization
dense_layer_1 = Dense(128, activation='relu')(concatenated_output)
dense_layer_1 = Dropout(0.5)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(dense_layer_1)
output = Dense(1, activation='sigmoid')(dense_layer_2)

# Define and compile model
model = Model(inputs=[input_A, input_B], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Implementing learning rate reduction on plateau and early stopping callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training the model with different batch sizes
batch_sizes = [32, 64]
for batch_size in batch_sizes:
    print(f'Training with batch size: {batch_size}')
    model.fit(x=[X_train_padded_A, X_train_padded_B], y=y_train,
              epochs=100,
              batch_size=batch_size,
              validation_split=0.2,
              verbose=1,
              callbacks=[reduce_lr])

# Evaluating the model on test data and printing results
loss, accuracy = model.evaluate(x=[X_test_padded_A, X_test_padded_B], y=y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SENTENCE A'] = X_train['SENTENCE A'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SENTENCE B'] = X_train['SENTENCE B'].apply(clean_text)


Training with batch size: 32
Epoch 1/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - accuracy: 0.6757 - loss: 0.6395 - val_accuracy: 0.8555 - val_loss: 0.4837 - learning_rate: 0.0010
Epoch 2/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.6863 - loss: 0.6164 - val_accuracy: 0.8534 - val_loss: 0.4299 - learning_rate: 0.0010
Epoch 3/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.7535 - loss: 0.5139 - val_accuracy: 0.8637 - val_loss: 0.3260 - learning_rate: 0.0010
Epoch 4/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.8397 - loss: 0.3833 - val_accuracy: 0.8657 - val_loss: 0.2977 - learning_rate: 0.0010
Epoch 5/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.8515 - loss: 0.3393 - val_accuracy: 0.8596 - val_loss: 0.3230 - learning_rate: 0.0010
Epoch 6/100
[1m184/184[0m [

In [44]:
# Two sentences for testing
sentence_A = "The speed is fast"
sentence_B = "The speed is slow"

# Tokenizing and padding the test sentences
test_seq_A = tokenizer.texts_to_sequences([sentence_A])
test_seq_B = tokenizer.texts_to_sequences([sentence_B])
test_padded_A = pad_sequences(test_seq_A, maxlen=max_sequence_length)
test_padded_B = pad_sequences(test_seq_B, maxlen=max_sequence_length)

# Making predictions
predictions = model.predict([test_padded_A, test_padded_B])

probability = predictions[0][0]
print(f"Sentence A: '{sentence_A}'")
print(f"Sentence B: '{sentence_B}'")
print(f"Prediction (probability of contradiction): {probability:.4f}")

# Determine prediction
threshold = 0.4
if probability > threshold:
    prediction_result = "Contradiction"
else:
    prediction_result = "No Contradiction"

print(f"Prediction: {prediction_result}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Sentence A: 'The speed is fast'
Sentence B: 'The speed is slow'
Prediction (probability of contradiction): 0.9395
Prediction: Contradiction
