In [None]:
# Peek at the first lines of the raw CSV file with the shell's `head`.
! head data/twitter_training.csv

In [None]:
import pandas as pd
# The file is read with header=None, so the column names are supplied explicitly.
column_names = ['id', 'category', 'polarity', 'text']
twits_raw = pd.read_csv(
    "data/twitter_training.csv",
    names=column_names,
    header=None,
)
twits_raw.head(5)

In [None]:
# Drop rows with missing values and renumber the index from 0, so that later
# positional structures (NumPy arrays, freshly built DataFrames) line up with
# `twits_raw` row-for-row. The shape is printed before and after to show how
# many rows were removed.
print(twits_raw.shape)
twits_raw = twits_raw.dropna().reset_index(drop=True)
print(twits_raw.shape)

In [None]:
# Take the first remaining tweet by position. A label lookup (`.loc[0, 'text']`)
# would raise KeyError if the row labelled 0 had been removed by `dropna()`.
some_text = twits_raw['text'].iloc[0]
some_text

In [None]:
import re

def split_and_clean(text):
    """Tokenize `text` into lowercase, letters-only words.

    The text is split on runs of whitespace; every character that is not an
    ASCII letter is removed from each piece; pieces that end up empty are
    discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase words, in their original order.
    """
    tokens = []
    for chunk in re.split(r'\s+', text):
        letters_only = re.sub(r'[^a-zA-Z]', '', chunk)
        if letters_only:
            tokens.append(letters_only.lower())
    return tokens

# Try the tokenizer on the sample tweet.
split_and_clean(some_text)

In [None]:
# Flat list of every cleaned word from every tweet (duplicates included).
vocabulary = []
for twit_text in twits_raw.text:
    vocabulary.extend(split_and_clean(twit_text))
vocabulary[:10]

In [None]:
from collections import Counter
# Tally how often each cleaned word occurs, then show the 20 most frequent.
voc_counter = Counter()
voc_counter.update(vocabulary)
voc_counter.most_common(20)

In [None]:
# Keep only the words that occur more than 10 times (this rebinds `vocabulary`
# from the full token list to the list of distinct frequent words).
vocabulary = [
    token
    for token, occurrences in voc_counter.items()
    if occurrences > 10
]
vocabulary[:10]

In [None]:
# Number of entries in `vocabulary`.
len(vocabulary)

In [None]:
# Quick look at the first rows of the tweets table.
twits_raw.head(3)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts: a token is a run of ASCII letters, and min_df=20 drops
# tokens that appear in fewer than 20 tweets.
vectorizer = CountVectorizer(
    token_pattern=r'\b[a-zA-Z]+\b',
    min_df=20,
)
X = vectorizer.fit_transform(twits_raw.text)

In [None]:
# Convert the count matrix to a dense DataFrame with one column per
# vocabulary term, labelled by the vectorizer's feature names.
feature_names = vectorizer.get_feature_names_out()
bag_of_words_df = pd.DataFrame(X.toarray(), columns=feature_names)
bag_of_words_df.head(10)

In [None]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the bag-of-words table and project every tweet
# onto those two components (used below for a 2-D scatter plot).
pca = PCA(n_components=2)  # n_components = how many principal components to keep
principal_components = pca.fit_transform(bag_of_words_df)

In [None]:
# Fraction of the total variance explained by each fitted component.
pca.explained_variance_ratio_

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Put the two principal components in a DataFrame for plotting.
principal_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# Attach the labels by position, not by index. `principal_df` has a fresh
# 0..n-1 index, whereas `twits_raw` may carry a non-contiguous index (e.g.
# after `dropna()`); assigning the Series directly would align on index labels
# and pair points with the wrong polarity (or NaN).
principal_df['polarity'] = twits_raw['polarity'].to_numpy()

sns.scatterplot(data=principal_df, x='PC1', y='PC2', hue='polarity', palette='viridis')

In [None]:
# Number of principal components to keep; also used as the input width of the
# neural network defined later.
PCA_COMPONENTS = 100

# Refit PCA with the larger component count. This rebinds `pca` and
# `principal_components` from the earlier 2-component fit.
pca = PCA(n_components=PCA_COMPONENTS)  # n_components = how many principal components to keep
principal_components = pca.fit_transform(bag_of_words_df)
pca.explained_variance_ratio_

In [None]:
import numpy as np
# Total share of variance retained by all fitted components together.
np.sum(pca.explained_variance_ratio_)

Let's build a dataset for training a neural network on this data.

In [None]:
import torch
import torch.nn as nn

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Use the PCA projection as the model input. This rebinds `X`, which earlier
# held the count matrix returned by the vectorizer.
X = principal_components
X.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the polarity labels as integer class ids for use as training targets.
label_encoder = LabelEncoder()
polarity_labels = twits_raw['polarity']
y = label_encoder.fit_transform(polarity_labels)
y[:6]

In [None]:
# Shape of the encoded label array.
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
# NOTE: the vectorizer and the PCA were fitted on all rows before this split,
# so the test features are not fully independent of the training data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

# Mini-batch size shared by the training and test loaders.
BATCH_SIZE = 100

# Wrap the NumPy splits as tensors: float features and int64 (`long`) class
# ids, the target dtype used with `nn.CrossEntropyLoss` for class indices.
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float),
    torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(
    torch.tensor(X_test, dtype=torch.float),
    torch.tensor(y_test, dtype=torch.long))

# Shuffle only the training data. Accuracy does not depend on evaluation
# order, so the test loader iterates in a fixed, repeatable order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Pull a single batch from the training loader to inspect its tensor shapes.
first_batch = next(iter(train_loader))
data, label = first_batch
data.shape, label.shape

In [None]:
# Define the custom model class
class MyModel(nn.Module):
    """Small fully connected classifier: two tanh hidden layers, then logits.

    Args:
        n_features: Size of each input vector. When None (the default), the
            notebook-level ``PCA_COMPONENTS`` value is used.
        n_hidden: Width of both hidden layers.
        n_classes: Number of output logits (one per class).
    """

    def __init__(self, n_features=None, n_hidden=20, n_classes=4):
        super().__init__()
        if n_features is None:
            # Looked up at instantiation time, as in the original hard-coded version.
            n_features = PCA_COMPONENTS
        self.layer1 = nn.Linear(n_features, n_hidden)
        self.layer2 = nn.Linear(n_hidden, n_hidden)
        self.layer3 = nn.Linear(n_hidden, n_classes)

    def forward(self, xs):
        """Return raw class logits for a batch of input vectors.

        No softmax is applied here: the output is fed to
        ``nn.CrossEntropyLoss``, which takes unnormalised logits.
        """
        x = self.layer1(xs).tanh()
        x = self.layer2(x).tanh()
        x = self.layer3(x)
        return x
    
# Seed PyTorch's global RNG so that weight initialisation and the subsequent
# DataLoader shuffling are repeatable when the notebook is run from a fresh kernel.
torch.manual_seed(42)
mymodel = MyModel().to(device)
# Total number of scalar parameters in the network.
sum(p.numel() for p in mymodel.parameters())

In [None]:
# Forward pass on the sample batch to check the shape of the model output.
predicted = mymodel(data.to(device))
predicted.shape

In [None]:
# Cross-entropy of the sample predictions against the sample labels
# (a sanity check that the loss can be computed on the model output).
nn.CrossEntropyLoss()(predicted, label.to(device))

In [None]:
import torch.optim as optim

# Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(mymodel.parameters(), lr=0.01)

# Training loop
num_epochs = 30
# Report the loss roughly ten times per run. `max(1, ...)` guards against
# num_epochs < 10, where `num_epochs // 10` is 0 and the modulo below would
# raise ZeroDivisionError.
log_every = max(1, num_epochs // 10)
for epoch in range(num_epochs):
    mymodel.train()
    running_loss = 0.0  # sum of per-batch losses within this epoch
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # forward pass
        outputs = mymodel(X_batch)
        loss = loss_fn(outputs, y_batch)

        # backward pass (clear stale gradients first)
        optimizer.zero_grad()
        loss.backward()

        # update
        optimizer.step()
        running_loss += loss.item()
    if epoch % log_every == 0:
        # Mean loss over the batches of this epoch.
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

print("Training complete")


In [None]:
# Evaluation loop: classification accuracy on the held-out test set.
mymodel.eval()
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = mymodel(X_batch)
        # Predicted class = index of the largest logit. `argmax` replaces
        # `torch.max(outputs.data, 1)`, which relied on the legacy `.data`
        # attribute and bound `_` (shadowing IPython's last-output variable).
        predicted = outputs.argmax(dim=1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"Accuracy: {100 * correct / total}%")

You can play with all the parameters, but the accuracy is unlikely to improve significantly. Processing text therefore needs other ideas in order to perform well.