In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import numpy as np
import pandas as pd
import torch.nn.functional as F
import gensim
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

In [3]:
# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print("Using GPU:")
    print(torch.cuda.get_device_name(0))  # 0 is the index of the GPU
else:
    print("Using CPU")

Using GPU:
Quadro T1000 with Max-Q Design


In [4]:
df_news = pd.read_csv('../input/News.csv')

In [5]:
# load the saved word2vec model and produce document vectors
def document_vector(model, doc):
    doc = [word for word in doc if word in model.wv.index_to_key]
    # If no words are in the vocabulary, return a vector of zeros
    if len(doc) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

In [8]:
model_w2v = gensim.models.Word2Vec.load("..\models\word2vec_model.model")
df_news['tokenized_text'] = df_news['text'].apply(gensim.utils.simple_preprocess)

In [9]:
# remove stopwords to make the doc vector creation faster
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

def remove_stopwords_from_tokens(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

# # Apply the function to the 'text' column
df_news['no_stopwords'] = df_news['tokenized_text'].apply(remove_stopwords_from_tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jlari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jlari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jlari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
df_news['tokenized_text_2'] = df_news['no_stopwords'].apply(gensim.utils.simple_preprocess)

In [11]:
df_news['vector'] = [document_vector(model_w2v, doc) for doc in df_news.tokenized_text_2]

In [12]:
df_news.to_pickle('df_news.pkl')

## Real ML Part

In [13]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [14]:
class VectorizedTextDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature, label

In [16]:
df_training = df_news[['vector', 'subject', 'Real']]

# Convert features to numpy arrays
X = np.array(df_training['vector'].tolist()) # feature
y = np.array(df_training['Real'].tolist()) # label

# Split data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Dataset instances
train_dataset = VectorizedTextDataset(X_train, y_train) # get the data in a structured dataset (abstract class that wraps around). 
test_dataset = VectorizedTextDataset(X_test, y_test)

# Create DataLoader instances
# this is for loading the data onto the GPU
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [17]:
# nn: neural network model in pytorch

class TextClassifier(nn.Module):
    def __init__(self, input_dim):
        super(TextClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.fc(x)
        x = self.sigmoid(x)
        return x
    
# Model, Loss, Optimizer
input_dim = X_train.shape[1]  # Number of features
model = TextClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(5):
    model.train()
    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.45559924840927124
Epoch 2, Loss: 0.04193436726927757
Epoch 3, Loss: 0.04429787024855614
Epoch 4, Loss: 0.5187382102012634
Epoch 5, Loss: 0.0037550206761807203


In [18]:
def evaluate_model(loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in loader:
            outputs = model(features)
            predicted = (outputs.squeeze() > 0.5).long()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    return accuracy

print(f'Train Accuracy: {evaluate_model(train_loader)}')
print(f'Test Accuracy: {evaluate_model(test_loader)}')

Train Accuracy: 0.9567069435937413
Test Accuracy: 0.9580178173719376


In [25]:
test_loader

<torch.utils.data.dataloader.DataLoader at 0x24b58258550>

In [24]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Get predictions and true labels
y_true = [label for _, label in test_loader]
y_pred = [model(features).squeeze().round() for features, _ in test_loader]

# Compute Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_true, y_pred)

# Plot Precision-Recall curve
plt.figure()
plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

ValueError: multilabel-indicator format is not supported

## Comparison with tree classifier

Pros: faster to implement


In [19]:
df_news.shape

(44898, 9)

TODO: make plots, make a separate validation, maybe make it a bit prettier, 

## Comparison with BERT 