In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Read the Task 1 training and test splits from the Kaggle input directory.
train_data, test_data = map(
    pd.read_csv,
    (
        "/kaggle/input/ai-for-social-good-aries-iitd-x-kaizen-24/Public/Task 1/train.csv",
        "/kaggle/input/ai-for-social-good-aries-iitd-x-kaizen-24/Public/Task 1/test.csv",
    ),
)

# Build the vocabulary from the training tweets only; the test tweets are
# encoded below with this same fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['tweet_text'])


def _to_padded_ids(texts):
    """Encode texts as integer sequences and pad/truncate them to length 100."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=100)


# Model inputs for both splits, plus the 'claim' column as the training target.
X_train = _to_padded_ids(train_data['tweet_text'])
X_test = _to_padded_ids(test_data['tweet_text'])
y_train = train_data['claim']

# Assemble the binary classifier layer by layer: embedding -> BiLSTM -> sigmoid.
# The vocabulary size is the number of known words plus one, since the
# tokenizer's word indices start at 1 and index 0 is used for padding.
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit the model; Keras holds out a 20% slice of the training data for
# validation via `validation_split`.
training_options = {
    'epochs': 10,
    'batch_size': 32,
    'validation_split': 0.2,
}
model.fit(X_train, y_train, **training_options)

# Make predictions on test data and convert them to hard class labels.
# The final Dense(1, sigmoid) layer yields one probability per row, so
# rounding thresholds at 0.5 and flatten() collapses the result to 1-D.
# Casting to int makes the labels 0/1 rather than the floats 0.0/1.0, so the
# `claim` column of the submission holds integer classes.
test_predictions = model.predict(X_test)
test_predictions = np.round(test_predictions).flatten().astype(int)

# Create submission DataFrame.
# Use the test set's dedicated 'ID' column when it exists; otherwise fall back
# to the tweet text, which is what this script previously always used.
# NOTE(review): confirm which identifier column the competition expects.
id_column = 'ID' if 'ID' in test_data.columns else 'tweet_text'
submission_df = pd.DataFrame({
    'ID': test_data[id_column],
    'text': test_data['tweet_text'],
    'claim': test_predictions,
    # Constant placeholder values for the span fields.
    # NOTE(review): -1 / -2 look like "no span" sentinels — confirm against
    # the competition's submission format.
    'span_start_index': [-1] * len(test_data),
    'span_end_index': [-2] * len(test_data),
    'task': [1] * len(test_data)
})

# Save predictions to a CSV file (without the DataFrame index column).
submission_df.to_csv('submission.csv', index=False)
