In [None]:
#Run this commands in Google Colab
#!pip install torch torchvision torchaudio re
#!pip install scikit-learn pandas

In [1]:
import pandas as pd
import re
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

# Prepare data

## Read input

In [None]:
#Run this cell to access data in google drive on google colab
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
input_file = './data.csv'
#input_file = '/content/drive/MyDrive/data.csv' google colab
column_names = ['target','id','date','flag','user','text']
input_df = pd.read_csv(input_file, names=column_names,encoding='latin-1')[["id","target", "text"]] \
    .sample(100)

## Download stopwords

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/weronikaskiba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Clean input

In [4]:
transformed_df = input_df \
    .dropna() 
transformed_df['target'] = transformed_df['target'].map({0: 0, 4: 1}) # Normalize target values

## Stemming

In [5]:
def stemming_transform(text):
    stemmer = PorterStemmer()
    with_removed_non_letter_signs = re.sub('[^a-zA-Z]', ' ', text)
    with_lower_case = with_removed_non_letter_signs.lower()
    with_word_tokenization = with_lower_case.split()
    with_stemmed = [stemmer.stem(word) for word in with_word_tokenization if not word in set(stopwords.words('english'))]
    output = ' '.join(with_stemmed)
    return output

In [6]:
transformed_df['processed_text'] = transformed_df['text'].apply(stemming_transform)

## Indexing

In [7]:
X = transformed_df['processed_text'].values
Y = transformed_df['target'].values

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

## Dataset

In [9]:
class TweetDataset(Dataset):
    def __init__(self,x,y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self,index):
        return self.x[index],self.y[index]

In [10]:
test_dataset = TweetDataset(X_test,Y_test)
train_dataset = TweetDataset(X_train,Y_train)   

In [11]:
#train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [13]:
#device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [12]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        output = self.fc(lstm_out[:, -1, :])
        return output


In [281]:
batch_size = 16
hidden_size = 64
output_size = 2
num_layers = 1
num_epochs = 10
learning_rate = 0.01
input_size = X_train.shape[1]

k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True) #random_state=42

criterion = torch.nn.CrossEntropyLoss().to(device) 

In [282]:
best_val_loss = float('inf')
best_model_params = None  

for fold, (train_indices, val_indices) in enumerate(kf.split(train_dataset)):
    #print(f"Fold {fold+1}/{k_folds}")
    
    train_subset = Subset(train_dataset, train_indices)
    val_subset = Subset(train_dataset, val_indices)
    
    train_dataloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
    
    model = LSTM(input_size, hidden_size, output_size, num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        for batch_features, batch_labels in train_dataloader:
            batch_features = batch_features.unsqueeze(1).to(torch.float32).to(device)
            batch_labels = batch_labels.to(torch.long).to(device)
            
            predictions = model(batch_features)
            loss = criterion(predictions, batch_labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        #print(f"Fold {fold+1}, Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss/len(train_dataloader):.4f}")
    
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_labels in val_dataloader:
            batch_features = batch_features.unsqueeze(1).to(torch.float32).to(device)
            batch_labels = batch_labels.to(torch.long).to(device)
            
            predictions = model(batch_features)
            loss = criterion(predictions, batch_labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(predictions, 1)
            correct += (predicted == batch_labels).sum().item()
            total += batch_labels.size(0)
    
    val_accuracy = correct / total
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Fold {fold+1}, Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_params = model.state_dict().copy()  # Save the best model parameters

model.load_state_dict(best_model_params)
print(f"Best model loaded with Validation Loss: {best_val_loss:.4f}")


Fold 1, Validation Loss: 0.9460, Accuracy: 0.6250
Fold 2, Validation Loss: 0.9126, Accuracy: 0.3125
Fold 3, Validation Loss: 0.4190, Accuracy: 0.8125
Fold 4, Validation Loss: 0.7999, Accuracy: 0.5625
Fold 5, Validation Loss: 0.6749, Accuracy: 0.6875
Best model loaded with Validation Loss: 0.4190


In [283]:
torch.save(model.state_dict(), 'model.pth')
print("Model saved to 'model.pth'")

Model saved to 'model.pth'


In [284]:
model.eval() 
test_loss = 0.0
correct_predictions = 0
total_predictions = 0
all_predictions = []
all_labels = []

with torch.no_grad(): 
    for batch_features, batch_labels in test_dataloader:
        batch_features = batch_features.unsqueeze(1).to(torch.float32).to(device)
        batch_labels = batch_labels.to(torch.long).to(device)

        predictions = model(batch_features)

        loss = criterion(predictions, batch_labels)
        test_loss += loss.item()

        _, predicted_classes = torch.max(predictions, dim=1)

        all_predictions.extend(predicted_classes.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

        correct_predictions += (predicted_classes == batch_labels).sum().item()
        total_predictions += batch_labels.size(0)

avg_loss = test_loss / len(test_dataloader)

accuracy = correct_predictions / total_predictions

precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
f1 = f1_score(all_labels, all_predictions, average='weighted')

print(f"Test Loss: {avg_loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


Test Loss: 0.5808
Test Accuracy: 55.00%
Test Precision: 0.5550
Test Recall: 0.5500
Test F1 Score: 0.5511
