<a href="https://colab.research.google.com/github/jahnavirishikesh/contradiction_detection/blob/main/contradiction_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing import sequence

In [9]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# For training data
X_train = train_data[['SENTENCE A', 'SENTENCE B']]
y_train = train_data['label']

# For testing data
X_test = test_data[['Question', 'Answer 1', 'Answer 2']]
y_test = test_data['label']

# Handle missing values in the test dataset
X_test = X_test.fillna('')

In [10]:
# Tokenize the text data and pad the sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['SENTENCE A'] + X_train['SENTENCE B'])
X_train_seq_A = tokenizer.texts_to_sequences(X_train['SENTENCE A'])
X_train_seq_B = tokenizer.texts_to_sequences(X_train['SENTENCE B'])

X_test_seq = tokenizer.texts_to_sequences(X_test['Question'] + X_test['Answer 1'] + X_test['Answer 2'])

# Pad sequences to a fixed length
max_sequence_length = 100
X_train_padded_A = pad_sequences(X_train_seq_A, maxlen=max_sequence_length, padding='post')
X_train_padded_B = pad_sequences(X_train_seq_B, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')

In [11]:
# Build the LSTM model
model = Sequential()
embedding_dim = 100

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, concatenate

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# For training data
X_train = train_data[['SENTENCE A', 'SENTENCE B']]
y_train = train_data['label']

# For testing data
X_test = test_data[['Question', 'Answer 1', 'Answer 2']]
y_test = test_data['label']

# Handle missing values in the test dataset
X_test = X_test.fillna('')

# Tokenize and pad the text data
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['SENTENCE A'] + X_train['SENTENCE B'])

X_train_seq_A = tokenizer.texts_to_sequences(X_train['SENTENCE A'])
X_train_seq_B = tokenizer.texts_to_sequences(X_train['SENTENCE B'])
X_test_seq_A = tokenizer.texts_to_sequences(X_test['Question'])
X_test_seq_B = tokenizer.texts_to_sequences(X_test['Answer 1'])

X_train_padded_A = pad_sequences(X_train_seq_A, maxlen=max_sequence_length)
X_train_padded_B = pad_sequences(X_train_seq_B, maxlen=max_sequence_length)
X_test_padded_A = pad_sequences(X_test_seq_A, maxlen=max_sequence_length)
X_test_padded_B = pad_sequences(X_test_seq_B, maxlen=max_sequence_length)

# Define the model using Functional API
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

input_A = Input(shape=(max_sequence_length,))
input_B = Input(shape=(max_sequence_length,))

embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

embedded_A = embedding_layer(input_A)
embedded_B = embedding_layer(input_B)

lstm_layer = LSTM(64)

lstm_output_A = lstm_layer(embedded_A)
lstm_output_B = lstm_layer(embedded_B)

concatenated_output = concatenate([lstm_output_A, lstm_output_B], axis=-1)

dense_layer = Dense(64, activation='relu')(concatenated_output)
output = Dense(1, activation='sigmoid')(dense_layer)

model = Model(inputs=[input_A, input_B], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Train the model
epochs = 50
batch_size = 64
model.fit(x=[X_train_padded_A, X_train_padded_B], y=y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(x=[X_test_padded_A, X_test_padded_B], y=y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 100, 100)             453900    ['input_5[0][0]',             
                                                                     'input_6[0][0]']             
                                                                                                  
 lstm_4 (LSTM)               (None, 64)                   42240     ['embedding_3[0][0]',   

In [25]:
# Assuming X_test_padded_A and X_test_padded_B are defined from previous preprocessing

# Choose two sentences for testing
sentence_A = "The sky is blue."
sentence_B = "The sky is red."

# Tokenize and pad the test sentences
test_seq_A = tokenizer.texts_to_sequences([sentence_A])
test_seq_B = tokenizer.texts_to_sequences([sentence_B])
test_padded_A = pad_sequences(test_seq_A, maxlen=max_sequence_length)
test_padded_B = pad_sequences(test_seq_B, maxlen=max_sequence_length)

# Make predictions
predictions = model.predict([test_padded_A, test_padded_B])

# Interpret the predictions
for i in range(len(predictions)):
    print(f"Sentence A: '{sentence_A}'")
    print(f"Sentence B: '{sentence_B}'")
    print(f"Prediction (probability of contradiction): {predictions[i][0]:.4f}")
    if predictions[i][0] > 0.5:
        print("Prediction: Contradiction")
    else:
        print("Prediction: No Contradiction")
    print()


Sentence A: 'The sky is blue.'
Sentence B: 'The sky is red.'
Prediction (probability of contradiction): 0.0001
Prediction: No Contradiction



In [14]:
print (X_train_padded_A)

[[   3 1380  526 ...    0    0    0]
 [1027 2196  649 ...    0    0    0]
 [ 390  771 1924 ...    0    0    0]
 ...
 [   1   22    2 ...    0    0    0]
 [   3   18   91 ...    0    0    0]
 [   1    4    2 ...    0    0    0]]


In [15]:
# Train the model
epochs = 10
batch_size = 64
model.fit(x=list([X_train_padded_A, X_train_padded_B], y=y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Epoch 1/10


ValueError: ignored