In [1]:
import pymongo
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["story_database"]  # Database name
collection = db["short_stories"]

documents = collection.find()

df_raw = pd.DataFrame(list(documents))
df_raw = df_raw[["text", "label"]]
df_raw.head(5)

Unnamed: 0,text,label
0,Transcriber's Note:\nEvery effort has been mad...,0
1,"“In a few moments Marianne, Solomin, Paul,\n\n...",0
2,“Marianne knelt beside the sofa.… Nezhdanof\n\...,0
3,"“Solomin raised Marianne's hand, her head\n\nl...",0
4,"“He now was no longer, but the hands of\n\nSol...",0


In [2]:
# Define the cleaning function
def clean_text(text):

    text = text.replace('\n', ' ') 
    text = re.sub(r'=', '', text)  

    # Keep only English alphabet characters and spaces 
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

df_raw['cleaned_text'] = df_raw['text'].apply(clean_text)

df = df_raw[["cleaned_text", "label"]]

# Vectorize the data


In [3]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

df_shuffled = df.sample(frac=1).reset_index(drop=True)


X = df_shuffled["cleaned_text"]
y = df_shuffled["label"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


clf = LogisticRegression(max_iter=1000)  # max_iter to ensure convergence
clf.fit(X_train, y_train)
        
y_pred = clf.predict(X_test)

cleaned_data = (classification_report(y_test, y_pred))
print(cleaned_data)

              precision    recall  f1-score   support

           0       1.00      0.74      0.85        42
           1       0.80      1.00      0.89        45

    accuracy                           0.87        87
   macro avg       0.90      0.87      0.87        87
weighted avg       0.90      0.87      0.87        87



Wow this was a bit anticlimactic, lets mess the data up see if we get the same results

# Lets not clean the data!

In [4]:
df_shuffled = df_raw.sample(frac=1).reset_index(drop=True)

X = df_shuffled["cleaned_text"]
y = df_shuffled["label"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


clf = LogisticRegression(max_iter=1000)  # max_iter to ensure convergence
clf.fit(X_train, y_train)
        
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.88      0.93        40
           1       0.90      1.00      0.95        47

    accuracy                           0.94        87
   macro avg       0.95      0.94      0.94        87
weighted avg       0.95      0.94      0.94        87



# Lets add data, fakenews!

In [5]:
fakenews = pd.read_csv("data/fake_news.csv")
fakenews.columns = ["cleaned_text", "label"]

In [6]:
data = pd.concat([fakenews, df], ignore_index=True)


In [7]:
df_shuffled = data.sample(frac=1).reset_index(drop=True)

X = df_shuffled["cleaned_text"]
y = df_shuffled["label"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


clf = LogisticRegression(max_iter=1000)  # max_iter to ensure convergence
clf.fit(X_train, y_train)
        
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.97      0.99        34
           1       0.86      1.00      0.93        50
           2       0.98      0.85      0.91        53

    accuracy                           0.93       137
   macro avg       0.95      0.94      0.94       137
weighted avg       0.94      0.93      0.93       137



In [8]:
confusion_matrix(y_test, y_pred)

array([[33,  0,  1],
       [ 0, 50,  0],
       [ 0,  8, 45]], dtype=int64)

## Is the vectorizer that good?

In [9]:
bbc_news = pd.read_csv("data/bbc_news.csv")
bbc = bbc_news[["cleaned_text", "category_encoded"]]
bbc.columns =  ["cleaned_text", "label"]

In [10]:
data = pd.concat([bbc, data], ignore_index=True)

In [11]:
data["label"].value_counts()

label
3    100
6    100
5    100
7    100
4    100
2    100
1    100
0     74
Name: count, dtype: int64

In [12]:
df_shuffled = data.sample(frac=1).reset_index(drop=True)

X = df_shuffled["cleaned_text"]
y = df_shuffled["label"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


clf = LogisticRegression(max_iter=1000)  # max_iter to ensure convergence
clf.fit(X_train, y_train)
        
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.86      0.93        22
           1       0.76      0.90      0.82        31
           2       0.88      0.77      0.82        30
           3       0.92      0.85      0.88        27
           4       0.89      1.00      0.94        24
           5       0.93      0.91      0.92        43
           6       0.86      0.92      0.89        26
           7       1.00      0.97      0.98        30

    accuracy                           0.90       233
   macro avg       0.90      0.90      0.90       233
weighted avg       0.90      0.90      0.90       233



In [13]:
confusion_matrix(y_test, y_pred)

array([[19,  2,  1,  0,  0,  0,  0,  0],
       [ 0, 28,  0,  0,  1,  0,  2,  0],
       [ 0,  4, 23,  1,  1,  1,  0,  0],
       [ 0,  2,  1, 23,  0,  1,  0,  0],
       [ 0,  0,  0,  0, 24,  0,  0,  0],
       [ 0,  0,  1,  1,  0, 39,  2,  0],
       [ 0,  0,  0,  0,  1,  1, 24,  0],
       [ 0,  1,  0,  0,  0,  0,  0, 29]], dtype=int64)

## Increasing the datas dimensions by adding more data as well as adding more things to classify is instead of worsening the model its getting improved?
The TfidfVectorizer proves to be an excellent tool for vectorizing text data, as it does a near-perfect job of capturing the essence of the information. However, it's important to note that a vectorizer like TfidfVectorizer thrives on large datasets. While adding more labels was part of the improvement, it was the accompanying increase in data that enabled the creation of a more complex sparse matrix, allowing the model to perform more efficiently and effectively.

## Lets create a transformer

In [14]:
df_deeplearning = df_shuffled

In [15]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Use tokenizer to encode
df_deeplearning['input_ids'] = df_deeplearning['cleaned_text'].apply(
    lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=128)
)

In [16]:
df_deeplearning

Unnamed: 0,cleaned_text,label,input_ids
0,Koubek suspended after drugs test Stefan Koub...,7,"[101, 12849, 12083, 5937, 6731, 2044, 5850, 32..."
1,Uganda bans Vagina Monologues Ugandas authori...,6,"[101, 10031, 7221, 2015, 12436, 20876, 18847, ..."
2,Trust ye therefore your heart ere you trust yo...,0,"[101, 3404, 6300, 3568, 2115, 2540, 9413, 2063..."
3,BBC web search aids odd queries The BBCs onli...,4,"[101, 4035, 4773, 3945, 8387, 5976, 10861, 513..."
4,Ruthenium hexafluoride also rutheniumVI fluori...,1,"[101, 7920, 18595, 2819, 2002, 18684, 10258, 1..."
...,...,...,...
769,LOS ANGELES (Reuters) - Editors Note: Attentio...,2,"[101, 3050, 3349, 1006, 26665, 1007, 1011, 101..."
770,Pupils to get anti-piracy lessons Lessons on ...,6,"[101, 7391, 2000, 2131, 3424, 1011, 24386, 822..."
771,Russia WTO talks make progress Talks on Russi...,3,"[101, 3607, 1059, 3406, 7566, 2191, 5082, 7566..."
772,iTunes now selling Band Aid song Ipod owners ...,6,"[101, 11943, 2085, 4855, 2316, 4681, 2299, 263..."


In [26]:
input_ids = torch.tensor(df_deeplearning['input_ids'].tolist(), dtype=torch.long)
labels = torch.tensor(df_deeplearning['label'].tolist(), dtype=torch.float)  # Assuming multi-label

# Create torch dataset
dataset = TensorDataset(input_ids, labels)

train_length=int(0.7* len(dataset))

test_length=len(dataset)-train_length

train_dataset, test_dataset=torch.utils.data.random_split(dataset,(train_length,test_length))


train_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [18]:
class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_units, output_size):
        super(SimpleModel, self).__init__()
        self.layer1 = nn.Sequential(nn.Linear(input_size, hidden_units))
        self.layer2 = nn.Linear(hidden_units, output_size)


    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)  
        return x
    

model_0 = SimpleModel(input_size=128, hidden_units=16, output_size=8)

In [19]:
input_ids.size()

torch.Size([774, 128])

In [20]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_0.parameters(), lr = 0.01)

In [21]:
def training_loop(loss_fn, optimizer, model,epochs=100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        train_loss = 0
        for batch, (X, y) in enumerate(train_dataloader):
            model.train()
            

            X, y = X.to(device), y.to(device)

            X = X.float() 
            y = y.long()  
            
            y_pred = model_0(X)

            loss = loss_fn(y_pred, y)
            train_loss += loss.item()

            optimizer.zero_grad()

            loss.backward()
            optimizer.step()

        
        train_loss /= len(train_dataloader)
        
        if epoch % 20 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {train_loss:.4f}")


# Call the training loop
training_loop(loss_fn=loss_fn, optimizer=optimizer, model=model_0, epochs=100)


Epoch 1/100, Average Training Loss: 142.1564
Epoch 21/100, Average Training Loss: 2.0779
Epoch 41/100, Average Training Loss: 2.0767
Epoch 61/100, Average Training Loss: 2.0772
Epoch 81/100, Average Training Loss: 2.0766


In [22]:
class MoreAdvancedModel(nn.Module):
    def __init__(self, input_size, hidden_units, output_size):
        super().__init__()
        self.layer1 = nn.Sequential(nn.Linear(input_size, hidden_units))
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, output_size)


    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.layer3(x)  
        return x
    

model_1 = MoreAdvancedModel(input_size=128, hidden_units=512, output_size=8)

In [23]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_1.parameters(), lr = 0.01)

In [24]:
training_loop(loss_fn=loss_fn, optimizer=optimizer, model=model_1, epochs=100)

Epoch 1/100, Average Training Loss: 2.0749
Epoch 21/100, Average Training Loss: 2.0749
Epoch 41/100, Average Training Loss: 2.0755
Epoch 61/100, Average Training Loss: 2.0756
Epoch 81/100, Average Training Loss: 2.0755


In [39]:
all_preds = []
true_preds = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_1.to(device)
model_1.eval()
with torch.no_grad():
    for data in test_loader:
        inputs = data[0].to(device).float()  
        outputs = model_1(inputs)
        preds = torch.argmax(outputs, dim=1)  # or outputs if you want raw logits/probs
        all_preds.append(preds)
        true_preds.append(data[1])

true_preds = torch.cat(true_preds).tolist()
final_preds = torch.cat(all_preds).tolist()

In [40]:
final_preds, true_preds

([4,
  7,
  5,
  7,
  0,
  7,
  7,
  7,
  7,
  5,
  1,
  6,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  5,
  7,
  1,
  4,
  3,
  7,
  7,
  7,
  1,
  7,
  1,
  1,
  7,
  7,
  7,
  7,
  1,
  7,
  1,
  7,
  7,
  7,
  7,
  1,
  7,
  0,
  7,
  7,
  1,
  3,
  1,
  7,
  0,
  1,
  1,
  0,
  7,
  5,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  6,
  7,
  7,
  7,
  7,
  7,
  7,
  5,
  0,
  5,
  7,
  7,
  5,
  7,
  4,
  7,
  5,
  7,
  7,
  0,
  7,
  7,
  7,
  7,
  7,
  7,
  1,
  7,
  7,
  1,
  5,
  0,
  0,
  5,
  7,
  6,
  7,
  5,
  7,
  4,
  7,
  1,
  6,
  1,
  7,
  7,
  7,
  7,
  5,
  0,
  0,
  7,
  7,
  7,
  1,
  7,
  5,
  1,
  7,
  7,
  7,
  7,
  7,
  0,
  7,
  1,
  7,
  7,
  7,
  1,
  6,
  7,
  1,
  7,
  7,
  7,
  7,
  1,
  7,
  0,
  7,
  7,
  7,
  7,
  1,
  3,
  7,
  1,
  5,
  7,
  5,
  7,
  7,
  7,
  1,
  1,
  7,
  1,
  7,
  7,
  1,
  4,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  6,
  7,
  7,
  7,
  7,
  7,
  3,
  7,
  7,
  4,
  7,
  7,
  7,
  1,
  6,
  7,
  3,
  5,
  4,
  7,
  6,
  7,
