In [20]:

import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, TensorDataset

# Prepare data

## Read input

In [6]:
input_file = './data.csv'
column_names = ['target', 'id','date','flag','user','text']
input_df = pd.read_csv(input_file, names=column_names,encoding='latin-1')[["id","target", "text"]] \
    .sample(100)

## Download stopwords

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/weronikaskiba/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Clean input

In [8]:
transformed_df = input_df \
    .dropna() 
transformed_df['target'] = transformed_df['target'].map({0: 0, 4: 1}) # Normalize target values

## Stemming

In [9]:
def steeming_transform(text):
    stemmer = PorterStemmer()
    with_removed_non_letter_signs = re.sub('[^a-zA-Z]', ' ', text)
    with_lower_case = with_removed_non_letter_signs.lower()
    with_word_tokenization = with_lower_case.split()
    with_stemmed = [stemmer.stem(word) for word in with_word_tokenization if not word in set(stopwords.words('english'))]
    output = ' '.join(with_stemmed)
    return output

In [10]:
transformed_df['processed_text'] = transformed_df['text'].apply(steeming_transform)

## Indexing

In [11]:
X = transformed_df['processed_text'].values
Y = transformed_df['target'].values

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

## Dataset

In [13]:
class TweetDataset(Dataset):
    
    def __init__(self,x,y):
        self.x = x
        self.y = y
        
    
    def __len__(self):
        return len(self.x)
    
    
    def __getitem__(self,index):
        return self.x[index],self.y[index]

In [14]:
test_dataset = TweetDataset(X_test,Y_test)
train_dataset = TweetDataset(X_train,Y_train)   

In [15]:
train_dataloader = DataLoader(train_dataset, batch_size=1)
test_dataloader = DataLoader(test_dataset, batch_size=1)

In [40]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_out = lstm_out[:, -1, :] 
        out = self.fc(last_out)
        output = self.sigmoid(out)
        return output


In [49]:
input_size = 1
hidden_size = 256
output_size = 2   
num_layers = 2     

model = LSTM(input_size, hidden_size, output_size, num_layers)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [51]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for batch in train_dataloader:
        inputs, targets = batch
        inputs = inputs.float()
        inputs = inputs.unsqueeze(-1)
        
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_dataloader):.4f}")


Epoch [1/10], Loss: 0.6952
Epoch [2/10], Loss: 0.6918
Epoch [3/10], Loss: 0.6905
Epoch [4/10], Loss: 0.6895
Epoch [5/10], Loss: 0.6889
Epoch [6/10], Loss: 0.6884
Epoch [7/10], Loss: 0.6880
Epoch [8/10], Loss: 0.6878
Epoch [9/10], Loss: 0.6876
Epoch [10/10], Loss: 0.6874


In [53]:

def evaluate(model, dataloader):
    model.eval() 
    correct = 0
    total = 0
    epoch_loss = 0

    with torch.no_grad(): 
        for batch in dataloader:
            inputs, targets = batch
            inputs = inputs.float()
            inputs = inputs.unsqueeze(-1) 
            
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            epoch_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    avg_loss = epoch_loss / len(dataloader)

    print(f"Evaluation Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")
    return avg_loss, accuracy

test_loss, test_accuracy = evaluate(model, test_dataloader) 


Evaluation Loss: 0.6881, Accuracy: 55.00%
