In [None]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
import numpy as np
from textblob import TextBlob
import nltk
from sklearn.ensemble import RandomForestClassifier
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torch import Tensor
from torchtext.data import get_tokenizer
import math

# Attach Google Drive to the Colab runtime so the CSV below is reachable.
drive.mount('/content/drive')

# Read the feature table, discard the 'Unnamed: 0' column (presumably an index
# saved by an earlier export) and keep only rows without missing values.
df = (
    pd.read_csv("drive/MyDrive/feature_set_new.csv")
    .drop(columns='Unnamed: 0')
    .dropna()
)
df

Mounted at /content/drive


Unnamed: 0,orig_str,label,word_count,avg_sentence_length,sentiment_polarity,stopwords_count,punctuation_count,flesch_reading_ease,flesch_kincaid_grade,perplexity,unique_pos_tags,std_unique_words,personal_pronoun_count,sentiment_subjectivity,quotation_marks_count
0,"Hey there! So, I'm super stoked to be writing ...",1,322,13.583333,0.209305,157,71,73.78,6.5,17.585706,30,1.873248,25,0.640802,2
1,Phones & Driving\n\nThere is a growing discuss...,0,401,31.153846,0.012037,195,46,56.93,13.0,35.936166,32,2.492033,20,0.540278,6
2,It Cs a common belief that setting a goal hig...,1,401,16.833333,0.223518,165,43,71.34,7.5,10.100890,28,3.626228,16,0.568167,0
3,In the current society of rapid developments a...,1,416,18.954545,0.202367,195,41,60.65,9.5,15.344154,32,2.664656,11,0.532405,0
4,Dear Principle.\n\nI think having the phone on...,0,208,42.400000,-0.026786,119,13,43.06,20.4,33.218544,27,1.737501,26,0.569643,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362929,The Electoral College has been used for years....,0,485,21.260870,0.224405,267,59,75.24,8.1,26.300501,34,3.275596,49,0.590546,2
362930,Requiring students a summer project to extend ...,0,360,22.687500,0.177562,173,42,56.89,11.0,37.513801,31,2.019458,31,0.518107,0
362931,"In the ""challenge of exploring Venus"" the auth...",0,430,35.833333,0.098493,225,24,60.52,13.7,54.269715,32,2.795667,20,0.467035,6
362932,"In the modern day, technology has become an in...",1,349,24.928571,0.058025,119,38,31.72,14.4,31.893595,31,0.935812,13,0.449903,0


In [None]:
def predict_with_random_forest(row):
  """Classify one DataFrame row with the module-level ``rf_model``.

  Args:
    row: a pandas Series as produced by ``DataFrame.apply(..., axis=1)``.
      Every entry except ``'label'`` and ``'orig_str'`` is passed to the
      model as a feature, in the row's existing order.

  Returns:
    The first (and only) element of ``rf_model.predict`` for this row.
  """
  # `row` is a Series, so entries must be removed by label: Series.drop
  # ignores `columns=`, which previously forced a positional `[2:]` slice that
  # was only correct while 'orig_str' and 'label' were the first two columns.
  features = row.drop(labels=['label', 'orig_str'], errors='ignore')

  # scikit-learn expects a 2-D array: one row, one column per feature.
  input_data = features.values.reshape(1, -1)
  prediction = rf_model.predict(input_data)[0]
  return prediction

# Restore the pickled random-forest model object from Drive.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
rf_model_path = 'drive/MyDrive/random_forest_model.pkl'
with open(rf_model_path, 'rb') as model_file:
  rf_model = pickle.load(model_file)

# Last expression of the cell: show the loaded estimator.
rf_model

In [None]:
def collate_fn_padding(batch):
    """Collate samples into a zero-padded id batch and a label column.

    Args:
        batch: iterable of mappings, each with an ``'input_id'`` sequence
            (assumed to be 1-D token ids) and a ``'label'`` value.

    Returns:
        Tuple ``(padded_input_ids, labels)``: the stacked, right-padded long
        tensors and the labels as a float tensor with a trailing dimension
        of size 1. Both are placed on the module-level ``device``.
    """
    # One long tensor per sample, already moved to the target device.
    sequences = []
    for sample in batch:
        sequences.append(torch.tensor(sample['input_id']).long().to(device))

    # Labels as floats on the same device.
    targets = torch.tensor([sample['label'] for sample in batch]).float().to(device)

    # The longest sequence in this batch decides the padded width.
    longest = max(len(seq) for seq in sequences)

    # Right-pad every sequence with zeros up to that width, then stack them.
    padded_rows = []
    for seq in sequences:
        padded_rows.append(
            F.pad(input=seq, pad=(0, longest - len(seq)), mode='constant', value=0)
        )
    padded_input_ids = torch.stack(padded_rows)

    # Add a trailing dimension so the labels line up with a one-unit output.
    targets = targets.unsqueeze(1)

    return padded_input_ids, targets

class PositionalEncoding(nn.Module):
    """Add sinusoidal positional encodings to a batch of embeddings.

    The encoding is recomputed on every forward pass instead of being stored
    on the module, so the only attributes an instance carries are ``d_model``
    and ``dropout``.

    Args:
        d_model: size of the last (feature) dimension of the input.
        dropout: dropout probability applied after the encoding is added.
        max_len: accepted for signature compatibility; not used, because the
            encoding is sized from the input's sequence length.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)


    # forward pass
    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_length, embedding_dim]``

        Returns:
            Tensor of the same shape: ``x`` plus the positional encoding,
            followed by dropout.
        """
        # obtain sequence length from the input tensor
        _, seq_length, _ = x.size()

        # positions 0..seq_length-1 as a column, built on the input's device
        # so the module does not depend on a global ``device`` variable
        position = torch.arange(seq_length, device=x.device).unsqueeze(-1)

        # calculate divisor term for sine and cosine functions
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, device=x.device)
            * (-math.log(10000.0) / self.d_model)
        )

        # angle for every (position, frequency) pair: [seq_length, ceil(d_model / 2)]
        angles = position * div_term

        # a single [1, seq_length, d_model] table that broadcasts over the batch;
        # filling only index 0 of a batch-sized tensor left every other sample
        # in the batch without any positional information
        pe = torch.zeros(1, seq_length, self.d_model, device=x.device)

        # sine for even feature indices, cosine for odd ones; the slice keeps
        # the cosine assignment valid when d_model is odd
        pe[0, :, 0::2] = torch.sin(angles)
        pe[0, :, 1::2] = torch.cos(angles[:, : self.d_model // 2])

        # add positional encoding to every sample in the batch
        x = x + pe

        # apply dropout and return
        return self.dropout(x)



# define class to build custom transformer model
class TransformerModel(nn.Module):
    """Transformer-encoder classifier that emits one sigmoid score per sequence.

    Pipeline: token embedding -> positional encoding -> stacked batch-first
    ``nn.TransformerEncoderLayer`` blocks -> mean over the sequence dimension
    -> linear layer with a single output -> sigmoid.

    Raises:
        ValueError: if ``embedding_dim`` and ``d_model`` differ. The embedding
            output is fed straight into layers sized by ``d_model``, so the
            two must match.
    """

    def __init__(
        self,
        num_embeddings, # size of vocab
        embedding_dim, # embedding dimensions
        d_model, # number of expected input features
        nhead, # number of heads for multi-attention
        dim_feedforward, # dimension of feedforward network
        dropout, # dropout value
        activation, # intermediate layer activation function
        num_layers # number of transformer layers
    ):
        super().__init__()

        # fail fast with a clear message instead of a shape error at forward time
        if embedding_dim != d_model:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal d_model ({d_model})"
            )

        # embedding layer
        self.embedding_layer = nn.Embedding(num_embeddings, embedding_dim)

        # positional encoding module
        # NOTE* - Module use obtained from:
        # https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # transformer layers
        self.transformer_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model = d_model, # number of expected input features
                nhead = nhead, # number of heads for multi-attention
                dim_feedforward=dim_feedforward, # dimension of feedforward network
                dropout=dropout, # dropout value
                activation=activation, # intermediate layer activation function
                batch_first=True # (batch, seq, feature) format

            ),
            num_layers=num_layers # number of transformer layers
        )

        # predictive layer, 1 output for binary classification
        self.predictive_layer = nn.Linear(embedding_dim, 1)

        # sigmoid activation function for output between 0 and 1
        self.sigmoid_activation = nn.Sigmoid()


    # function for forward pass
    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, expected to be
                ``[batch_size, seq_length]`` (the positional encoder unpacks
                three dimensions from the embedded result).

        Returns:
            Tensor ``[batch_size, 1]`` of sigmoid outputs.
        """
        x = self.embedding_layer(x)
        x = self.pos_encoder(x)
        x = self.transformer_layers(x)
        # mean pooling over every sequence position; no padding mask is applied
        x = torch.mean(x, dim=1)
        x = self.predictive_layer(x)
        x = self.sigmoid_activation(x)
        return x

# Toggle: True restores the saved model from Drive, False skips loading.
load_model = True

# Location of the serialized model on Drive.
model_path = 'drive/MyDrive/custom transformer/custom_model.pt'

if load_model:
    # With CUDA available keep torch.load's default placement (map_location=None);
    # otherwise remap the stored tensors onto the CPU.
    cpu_fallback = None if torch.cuda.is_available() else torch.device('cpu')
    transformer_model = torch.load(model_path, map_location=cpu_fallback)

# Put the model in eval mode; as the cell's last expression this also
# displays the module structure.
transformer_model.eval()

TransformerModel(
  (embedding_layer): Embedding(208251, 768)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_layers): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (predictive_layer): Linear(in_features=768, out_features=1, bias=True)
  (sigmoid_activation): Sigmoid()
)

In [None]:
# Restore the pickled vocabulary (used below as a token -> id lookup).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
vocab_path = 'drive/MyDrive/custom transformer/vocab.pkl'
with open(vocab_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)


# torchtext's "basic_english" tokenizer splits raw text into tokens.
tokenizer = get_tokenizer("basic_english")

# Use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def predict_with_transformer(model, text, tokenizer, vocab):
    """Return the model's scalar output for one piece of text.

    Args:
        model: callable taking a ``[1, seq_length]`` long tensor of token ids
            and returning a single-element tensor.
        text: raw input string.
        tokenizer: callable mapping ``text`` to a list of tokens.
        vocab: token -> id lookup supporting ``in`` and ``[]``.

    Returns:
        The model output as a Python float (``Tensor.item()``).
    """
    # tokenize the text
    tokenized_text = tokenizer(text)

    # generate input_ids using vocab; tokens missing from it map to id 0
    input_ids = [vocab[token] if token in vocab else 0 for token in tokenized_text]

    # an empty sequence cannot be embedded and pooled; fall back to the same
    # id 0 that stands in for unknown tokens
    if not input_ids:
        input_ids = [0]

    # put the input where the model's weights live rather than relying on a
    # global device variable; models without parameters are fed on the CPU
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # explicit long dtype: embedding lookups require integer indices
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=target_device)

    # inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        prediction = model(input_tensor)
    return prediction.item()

# Quick check on the row labelled 0: score its text, then turn the score
# into a 0/1 class by rounding.
sample_text = df["orig_str"][0]
raw_pred = predict_with_transformer(transformer_model, sample_text, tokenizer, vocab)
pred = round(raw_pred)
print(pred)

1


In [None]:
# Rows per class and the seed that makes the draw repeatable across runs.
SAMPLE_SIZE_PER_CLASS = 50000
SAMPLE_SEED = 42

# Split the frame by label.
df_zeros = df[df['label'] == 0]
df_ones = df[df['label'] == 1]

# Draw the same number of rows from each class (seeded for reproducibility).
df_zeros_sample = df_zeros.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=SAMPLE_SEED)
df_ones_sample = df_ones.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=SAMPLE_SEED)

# Combine both classes and shuffle the rows, again with a fixed seed.
df_sample = pd.concat([df_zeros_sample, df_ones_sample])
df_sample = df_sample.sample(frac=1, random_state=SAMPLE_SEED)


1

In [None]:
# Columns that are not model inputs: the raw text, the target and, when this
# cell is re-run, the prediction columns written below.
non_feature_columns = ['label', 'orig_str', 'rf_prediction', 'transformer_prediction']
rf_features = df.drop(columns=non_feature_columns, errors='ignore').to_numpy()

# Score every row in one predict() call instead of one call per row; the
# features are passed as a plain array, matching the per-row helper.
df['rf_prediction'] = rf_model.predict(rf_features)

# The transformer consumes raw text, so it is still applied one string at a time.
df['transformer_prediction'] = df['orig_str'].apply(
    lambda text: predict_with_transformer(transformer_model, text, tokenizer, vocab)
)
df

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Unnamed: 0,orig_str,label,word_count,avg_sentence_length,sentiment_polarity,stopwords_count,punctuation_count,flesch_reading_ease,flesch_kincaid_grade,perplexity,unique_pos_tags,std_unique_words,personal_pronoun_count,sentiment_subjectivity,quotation_marks_count,rf_prediction,transformer_prediction
0,"Hey there! So, I'm super stoked to be writing ...",1,322,13.583333,0.209305,157,71,73.78,6.5,17.585706,30,1.873248,25,0.640802,2,1,9.999505e-01
1,Phones & Driving\n\nThere is a growing discuss...,0,401,31.153846,0.012037,195,46,56.93,13.0,35.936166,32,2.492033,20,0.540278,6,0,2.101504e-04
2,It Cs a common belief that setting a goal hig...,1,401,16.833333,0.223518,165,43,71.34,7.5,10.100890,28,3.626228,16,0.568167,0,1,6.012301e-01
3,In the current society of rapid developments a...,1,416,18.954545,0.202367,195,41,60.65,9.5,15.344154,32,2.664656,11,0.532405,0,1,1.250233e-06
4,Dear Principle.\n\nI think having the phone on...,0,208,42.400000,-0.026786,119,13,43.06,20.4,33.218544,27,1.737501,26,0.569643,0,0,1.732698e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362929,The Electoral College has been used for years....,0,485,21.260870,0.224405,267,59,75.24,8.1,26.300501,34,3.275596,49,0.590546,2,0,2.046388e-06
362930,Requiring students a summer project to extend ...,0,360,22.687500,0.177562,173,42,56.89,11.0,37.513801,31,2.019458,31,0.518107,0,0,1.249591e-03
362931,"In the ""challenge of exploring Venus"" the auth...",0,430,35.833333,0.098493,225,24,60.52,13.7,54.269715,32,2.795667,20,0.467035,6,0,3.241227e-07
362932,"In the modern day, technology has become an in...",1,349,24.928571,0.058025,119,38,31.72,14.4,31.893595,31,0.935812,13,0.449903,0,1,6.251166e-02


In [None]:
# df_sample was drawn from df before the prediction columns were added, so on
# a fresh top-to-bottom run it does not contain them. Attach whichever ones
# are missing, aligned on the shared row index, before saving.
prediction_columns = ['rf_prediction', 'transformer_prediction']
missing_columns = [col for col in prediction_columns if col not in df_sample.columns]
if missing_columns:
    df_sample = df_sample.join(df[missing_columns])

df_sample.to_csv("drive/MyDrive/sample_with_preds.csv")

In [None]:
# Turn both prediction columns into 0/1 classes by rounding each value.
for pred_col in ("transformer_prediction", "rf_prediction"):
    df_sample[pred_col] = df_sample[pred_col].apply(round)

# Accuracy = share of rows whose prediction equals the label.
true_labels = df_sample['label']
rf_accuracy = true_labels.eq(df_sample['rf_prediction']).mean()
transformer_accuracy = true_labels.eq(df_sample['transformer_prediction']).mean()

# Report both scores as percentages.
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print(f"Transformer Accuracy: {transformer_accuracy * 100:.2f}%")

Random Forest Accuracy: 99.56%
Transformer Accuracy: 83.89%


In [None]:
# Flag the rows where the transformer matches the label while the random
# forest does not.
transformer_is_right = df_sample['transformer_prediction'].eq(df_sample['label'])
rf_is_wrong = df_sample['rf_prediction'].ne(df_sample['label'])
df_sample['transformer_correct_rf_incorrect'] = transformer_is_right & rf_is_wrong

# Summing the boolean flag counts the flagged rows.
num_rows_transformer_correct_rf_incorrect = df_sample['transformer_correct_rf_incorrect'].sum()

# Report the count.
print(f"Number of rows where the transformer prediction is correct but the random forest prediction is incorrect: {num_rows_transformer_correct_rf_incorrect}")


Number of rows where the transformer prediction is correct but the random forest prediction is incorrect: 283


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Stack the two base models: their predictions are the only features.
X = df_sample[['rf_prediction', 'transformer_prediction']]
y = df_sample['label']

# Hold out 20% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree; the seed fixes how ties between equally good
# splits are broken, so repeated runs produce the same tree.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions on the held-out rows.
y_pred_dt = dt_model.predict(X_test)

# Calculate accuracy, precision, and recall.
accuracy = accuracy_score(y_test, y_pred_dt)
precision = precision_score(y_test, y_pred_dt)
recall = recall_score(y_test, y_pred_dt)

# Print the metrics.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")


Accuracy: 99.54%
Precision: 99.65%
Recall: 99.44%


In [None]:
# Persist the full frame, including the two prediction columns, to Drive.
feature_set_with_preds_path = "drive/MyDrive/feature_set_with_preds.csv"
df.to_csv(feature_set_with_preds_path)

In [None]:
# Reload the saved predictions from Drive.
fswp = pd.read_csv("drive/MyDrive/feature_set_with_preds.csv")

# Keep only the text, both predictions and the label. The explicit copy makes
# new_df an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
new_df = fswp[["orig_str", "rf_prediction", "transformer_prediction", "label"]].copy()
new_df

Unnamed: 0,orig_str,rf_prediction,transformer_prediction,label
0,"Hey there! So, I'm super stoked to be writing ...",1,9.999505e-01,1
1,Phones & Driving\n\nThere is a growing discuss...,0,2.101504e-04,0
2,It Cs a common belief that setting a goal hig...,1,6.012301e-01,1
3,In the current society of rapid developments a...,1,1.250233e-06,1
4,Dear Principle.\n\nI think having the phone on...,0,1.732698e-07,0
...,...,...,...,...
362927,The Electoral College has been used for years....,0,2.046388e-06,0
362928,Requiring students a summer project to extend ...,0,1.249591e-03,0
362929,"In the ""challenge of exploring Venus"" the auth...",0,3.241227e-07,0
362930,"In the modern day, technology has become an in...",1,6.251166e-02,1


In [None]:
# Add the character length of each text as a feature. assign() returns a new
# frame, which avoids writing into a slice of another DataFrame (the cause of
# pandas' SettingWithCopyWarning).
new_df = new_df.assign(string_length=new_df['orig_str'].str.len())
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['string_length'] = new_df['orig_str'].str.len()


Unnamed: 0,orig_str,rf_prediction,transformer_prediction,label,string_length
0,"Hey there! So, I'm super stoked to be writing ...",1,9.999505e-01,1,1771
1,Phones & Driving\n\nThere is a growing discuss...,0,2.101504e-04,0,2286
2,It Cs a common belief that setting a goal hig...,1,6.012301e-01,1,2288
3,In the current society of rapid developments a...,1,1.250233e-06,1,2411
4,Dear Principle.\n\nI think having the phone on...,0,1.732698e-07,0,1072
...,...,...,...,...,...
362927,The Electoral College has been used for years....,0,2.046388e-06,0,2706
362928,Requiring students a summer project to extend ...,0,1.249591e-03,0,2221
362929,"In the ""challenge of exploring Venus"" the auth...",0,3.241227e-07,0,2292
362930,"In the modern day, technology has become an in...",1,6.251166e-02,1,2352


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Features: every column of new_df except the raw text and the target.
X = new_df.drop(['orig_str', 'label'], axis=1)
y = new_df['label']

# Hold out 20% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree; the seed fixes how ties between equally good
# splits are broken, so repeated runs produce the same tree.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions on the held-out rows.
y_pred_dt = dt_model.predict(X_test)

# Calculate accuracy, precision, and recall.
accuracy = accuracy_score(y_test, y_pred_dt)
precision = precision_score(y_test, y_pred_dt)
recall = recall_score(y_test, y_pred_dt)

# Print the metrics.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")


Accuracy: 99.27%
Precision: 99.29%
Recall: 99.25%
