In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found beneath it (nothing is printed for empty directories).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [3]:
# Read the training and test CSV files (paths are relative to the directory
# the notebook is run from); the training file is read first.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Preview the first ten training rows whose target equals 1.
train_df.loc[train_df['target'].eq(1)].head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
# Lazily-built NLTK resources shared by every preprocess_tweet call, so the
# stop-word corpus is read and the lemmatizer is constructed only once instead
# of once per tweet.
_PREPROCESS_RESOURCES = {}

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _default_stop_words():
    """Return the cached NLTK English stop-word set, building it on first use."""
    if 'stop_words' not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['stop_words']


def _default_lemmatizer():
    """Return the cached WordNetLemmatizer, constructing it on first use."""
    if 'lemmatizer' not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_tweet(text, stop_words=None, lemmatizer=None):
    """Normalise a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: strip URLs, strip digits, lowercase, split on whitespace,
    drop stop words, strip ASCII punctuation, lemmatize, re-join with spaces.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example a NaN
            from a missing CSV cell) yields an empty string instead of raising.
        stop_words: Optional collection of lowercase stop words. Defaults to
            NLTK's English stop-word list.
        lemmatizer: Optional object exposing ``lemmatize(word) -> str``.
            Defaults to a shared ``WordNetLemmatizer``.

    Returns:
        The cleaned tweet as a single string; may be empty.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = _default_lemmatizer()

    # Remove URLs. The dot after "www" is escaped so that ordinary words
    # containing a run of w's (e.g. "awwwww") are no longer deleted.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Lowercase
    text = text.lower()

    tokens = []
    for raw_token in text.split():
        # Check the stop list BEFORE deleting inner punctuation: contractions
        # are listed with their apostrophe ("don't"), so stripping punctuation
        # first would turn them into "dont" and let them slip through.
        if raw_token.strip(string.punctuation) in stop_words:
            continue
        word = raw_token.translate(_PUNCTUATION_TABLE)
        # Tokens made only of punctuation collapse to '' and are dropped.
        if word and word not in stop_words:
            tokens.append(lemmatizer.lemmatize(word))

    # Join back to string
    return ' '.join(tokens)

In [9]:
# Add a cleaned_text column to both frames by running every raw tweet
# through preprocess_tweet element-wise.
train_df['cleaned_text'] = train_df['text'].map(preprocess_tweet)
test_df['cleaned_text'] = test_df['text'].map(preprocess_tweet)

In [10]:
# Spot-check the first ten cleaned tweets produced by preprocess_tweet.
train_df['cleaned_text'].head(10)

0           deed reason earthquake may allah forgive u
1                forest fire near la ronge sask canada
2    resident asked shelter place notified officer ...
3    people receive wildfire evacuation order calif...
4    got sent photo ruby alaska smoke wildfire pour...
5    rockyfire update california hwy closed directi...
6    flood disaster heavy rain cause flash flooding...
7                            im top hill see fire wood
8    there emergency evacuation happening building ...
9                        im afraid tornado coming area
Name: cleaned_text, dtype: object

In [15]:
import spacy

# Module-level spaCy pipeline used by get_tweet_vector to embed each tweet.
# NOTE(review): downstream code assumes the vectors are 300-dimensional
# (GRUClassifier's default input_dim) — confirm for the installed model version.
nlp = spacy.load('en_core_web_md')  # Load the medium-sized English model

def get_tweet_vector(text):
    """Return the spaCy document vector for ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    resulting ``Doc.vector`` is returned unchanged.

    NOTE(review): previously described as a "300-dimensional GloVe vector";
    the dimensionality and the origin of the vectors depend on the installed
    model — confirm.
    """
    return nlp(text).vector

# Embed every cleaned training tweet and stack the per-tweet vectors row-wise
# into one 2-D array (one row per tweet, in the frame's row order).
tweet_vectors = np.vstack([get_tweet_vector(tweet) for tweet in train_df['cleaned_text']])

In [19]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# Fix the RNG seed so that the train/validation split, the batch shuffling and
# any later weight initialisation / dropout are reproducible on a fresh
# "Restart & Run All". Without it every run reports different metrics.
SEED = 42
torch.manual_seed(SEED)

# Features: one embedding row per training tweet. Labels: the target column.
# Labels are kept as float32 here; the training loop casts them with .long()
# before handing them to CrossEntropyLoss.
X = torch.tensor(tweet_vectors, dtype=torch.float32)
y = torch.tensor(train_df['target'].values, dtype=torch.float32)

# 80/20 train/validation split; val_size takes the remainder so the two
# sizes always sum to len(dataset).
dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    # Dedicated generator: the split stays the same even if other cells
    # consume random numbers from the global RNG before this one runs.
    generator=torch.Generator().manual_seed(SEED),
)

batch_size = 64
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),  # reproducible shuffle order
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [27]:
# Count rows per target value to check the class balance of the training set.
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [31]:
# Define the GRU Model
class GRUClassifier(nn.Module):
    """Bidirectional multi-layer GRU followed by dropout and a linear head.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Hidden size of the GRU, per direction.
        output_dim: Number of output logits (classes).
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability used between GRU layers and before the
            linear head.

    ``forward`` accepts either a batch of single vectors shaped
    ``(batch, input_dim)`` — treated as sequences of length 1 — or a batch of
    sequences shaped ``(batch, seq_len, input_dim)``. It returns raw logits
    shaped ``(batch, output_dim)``.
    """

    def __init__(self, input_dim=300, hidden_dim=256, output_dim=2,
                 num_layers=2, dropout=0.3):
        super().__init__()
        self.gru = nn.GRU(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            # nn.GRU applies dropout only between stacked layers; passing a
            # non-zero value with a single layer just triggers a warning.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # x2 for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class logits for a batch of vectors or sequences.

        Raises:
            ValueError: If ``x`` is neither 2-D nor 3-D.
        """
        if x.dim() == 2:
            # Add sequence dimension (batch_size, seq_len=1, features)
            x = x.unsqueeze(1)
        elif x.dim() != 3:
            raise ValueError(
                f'expected input of shape (batch, features) or '
                f'(batch, seq_len, features), got {tuple(x.shape)}'
            )

        _, h_n = self.gru(x)
        # h_n stacks the final states as (num_layers * 2, batch, hidden); the
        # last two entries are the top layer's forward and backward states.
        # Slicing the output at the last timestep would give the backward
        # direction after it has seen only ONE token; h_n is correct for any
        # sequence length and identical to that slice when seq_len == 1.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(self.dropout(summary))


In [32]:
# Initialize model, loss, and optimizer
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the classifier with its default sizes and move its parameters to the
# chosen device (Module.to moves the module in place).
model = GRUClassifier()
model.to(device)

# Cross-entropy over the two output logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [33]:
# Puts the model in training mode and echoes its architecture as the cell
# output. Redundant for training itself: the training loop calls
# model.train() again at the start of every epoch.
model.train()


GRUClassifier(
  (gru): GRU(300, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [34]:
# Bare expression: only displays the DataLoader's repr as the cell output.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1f11b3b54b0>

In [35]:
# Training loop
num_epochs = 20

# Keep a copy of the weights from the epoch with the lowest validation loss.
# Validation loss can bottom out early and then climb while training loss
# keeps falling; without this, inference would use the final, overfit weights.
best_val_loss = float('inf')
best_epoch = 0
best_state = None

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_seen = 0

    # Training phase
    for inputs, labels in train_loader:
        # CrossEntropyLoss needs integer class indices; labels are stored as floats.
        inputs, labels = inputs.to(device), labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch's mean loss by its size so that a smaller final
        # batch does not skew the epoch average.
        train_loss += loss.item() * labels.size(0)
        train_seen += labels.size(0)

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device).long()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # max(..., 1) only guards against an empty loader.
    epoch_train_loss = train_loss / max(train_seen, 1)
    epoch_val_loss = val_loss / max(total, 1)
    epoch_val_acc = 100 * correct / max(total, 1)

    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        best_epoch = epoch + 1
        # state_dict() returns references to the live tensors, so clone them;
        # otherwise later optimizer steps would overwrite the snapshot.
        best_state = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}

    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {epoch_train_loss:.4f}')
    print(f'Val Loss: {epoch_val_loss:.4f}')
    print(f'Val Accuracy: {epoch_val_acc:.2f}%')
    print('--------------------------')

# Roll back to the best checkpoint; the model stays in eval mode.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f'Restored weights from epoch {best_epoch} (Val Loss: {best_val_loss:.4f})')

Epoch 1/20
Train Loss: 0.5366
Val Loss: 0.5190
Val Accuracy: 76.03%
--------------------------
Epoch 2/20
Train Loss: 0.4948
Val Loss: 0.4760
Val Accuracy: 78.00%
--------------------------
Epoch 3/20
Train Loss: 0.4775
Val Loss: 0.4863
Val Accuracy: 78.53%
--------------------------
Epoch 4/20
Train Loss: 0.4721
Val Loss: 0.4610
Val Accuracy: 78.59%
--------------------------
Epoch 5/20
Train Loss: 0.4592
Val Loss: 0.4658
Val Accuracy: 78.33%
--------------------------
Epoch 6/20
Train Loss: 0.4509
Val Loss: 0.4638
Val Accuracy: 79.58%
--------------------------
Epoch 7/20
Train Loss: 0.4445
Val Loss: 0.4748
Val Accuracy: 78.40%
--------------------------
Epoch 8/20
Train Loss: 0.4333
Val Loss: 0.4853
Val Accuracy: 77.87%
--------------------------
Epoch 9/20
Train Loss: 0.4315
Val Loss: 0.5025
Val Accuracy: 77.35%
--------------------------
Epoch 10/20
Train Loss: 0.4215
Val Loss: 0.4880
Val Accuracy: 79.25%
--------------------------
Epoch 11/20
Train Loss: 0.4170
Val Loss: 0.4735
V

In [41]:
# Embed the cleaned test tweets the same way as the training tweets and move
# the resulting feature matrix to the model's device.
tweet_vectors_test = test_df['cleaned_text'].apply(get_tweet_vector)
tweet_vectors_test = np.vstack(tweet_vectors_test.values)
X_test = torch.tensor(tweet_vectors_test, dtype=torch.float32).to(device)

# Switch to evaluation mode explicitly so dropout is disabled. Previously this
# cell relied on whichever mode an earlier cell happened to leave the model
# in, which made predictions random if model.train() had been run last.
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    # Predicted class = index of the largest logit.
    preds = torch.argmax(outputs, dim=1)

In [43]:
# Bring the predicted class indices back to host memory as a NumPy array and
# pair them with the test ids in a two-column (id, target) frame.
pred_labels = preds.cpu().numpy()
submission_df = test_df[['id']].assign(target=pred_labels)

In [45]:
# Write the predictions to submission.csv in the current working directory,
# omitting the DataFrame index so only the id and target columns are saved.
submission_df.to_csv('submission.csv', index=False)