In [1]:
# Mount Google Drive at /content/drive; the data and model files read below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [3]:
import numpy as np
import pandas as pd

from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
import torch

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import random
import os
import json

from sklearn.metrics import roc_auc_score
from sklearn import metrics
import matplotlib.pyplot as plt

In [4]:
# Batch size handed to the DataLoader that feeds the inference loop.
VALID_BATCH_SIZE = 128
# Fixed sequence length: every encoded tweet is truncated or zero-padded to this many token ids.
MAX_LEN = 64

In [5]:
# Load the whole tweet dump into RAM. The records are later indexed by position
# and read through their "_id" and "text" keys.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (JSON interchange text is UTF-8).
with open('/content/drive/MyDrive/BDT_BEST_MODEL/db-tweets.json', encoding='utf-8') as tweets_file:
  data = json.load(tweets_file)

# Number of records loaded (displayed as the cell output).
len(data)

1496310

In [6]:
# Tokenization: load the pretrained "bert-base-uncased" tokenizer (lower-casing enabled) used to encode tweet text.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [7]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print("Using gpu" if use_gpu else "Using Cpu")

Using gpu


In [8]:
# Load the best model checkpoint (the file holds the whole model object, not just a state dict).
# map_location remaps the stored tensors onto the device selected above, so a checkpoint
# that was saved from a GPU still loads when this notebook falls back to the CPU.
# NOTE(review): torch.load unpickles arbitrary Python objects; only load checkpoints from a trusted source.
# NOTE(review): newer PyTorch releases may need weights_only=False for a whole-model pickle; confirm for the installed version.
model = torch.load("/content/drive/MyDrive/BDT_BEST_MODEL/best-model.pt", map_location=device)

In [9]:
# Move the model onto the device selected earlier (GPU when available, otherwise CPU).
model = model.to(device)

In [10]:
# create custom data set class
class TweetDataset(Dataset):
  """Map-style dataset that turns raw tweet records into fixed-length model inputs.

  Each record is read through its "_id" key (tweet id) and its "text" key
  (tweet text). Indexing returns ``(tweet_id, input_ids, attention_mask)`` where
  both tensors are int64 and have exactly ``max_len`` elements.

  Args:
    data: Indexable, sized collection of tweet records.
    tokenizer: Optional object exposing ``encode(text, add_special_tokens=True)``
      that returns a sequence of token ids. When omitted, the module-level
      ``tokenizer`` is looked up each time an item is accessed.
    max_len: Optional fixed output length. When omitted, the module-level
      ``MAX_LEN`` is looked up each time an item is accessed.
  """

  def __init__(self, data, tokenizer=None, max_len=None):
    self.data = data
    # None means "fall back to the notebook-level global at access time".
    self._tokenizer = tokenizer
    self._max_len = max_len

  def __len__(self):
    """Return the number of tweet records."""
    return len(self.data)

  def __getitem__(self, index):
    """Return ``(tweet_id, input_ids, attention_mask)`` for the record at ``index``."""
    item = self.data[index]
    tweet_id = item["_id"]
    tweet_text = item["text"]

    text_tokenizer = tokenizer if self._tokenizer is None else self._tokenizer
    max_len = MAX_LEN if self._max_len is None else self._max_len

    # Tokenize and encode, then keep at most the first max_len ids.
    # NOTE(review): truncating after encoding presumably drops the closing special
    # token of long tweets; kept as is so inputs match what this notebook has always
    # fed the model — confirm against the training code.
    token_ids = list(text_tokenizer.encode(tweet_text, add_special_tokens=True))[:max_len]
    # Right-pad with id 0 up to the fixed length.
    token_ids.extend([0] * (max_len - len(token_ids)))
    encoded = torch.tensor(token_ids, dtype=torch.long)

    # Attention mask: 1 for real tokens, 0 for padding (id 0), so that padded
    # positions do not influence the model. Computed in one vectorized comparison
    # instead of a Python loop over individual tensor elements.
    att_mask = (encoded > 0).long()
    return tweet_id, encoded, att_mask

  



In [11]:
# Wrap the loaded tweet records in the custom Dataset so a DataLoader can batch them.
tweet_dataset = TweetDataset(data)

In [12]:
# Build the DataLoader over the tweet dataset; shuffle=False keeps the tweets in their original order.
data_loader = DataLoader(
  tweet_dataset,
  batch_size=VALID_BATCH_SIZE,
  shuffle=False,
)

In [13]:
# Run batched inference over every tweet and collect one 0/1 label per tweet id.
# (No AUC is computed here: there are no ground-truth labels for these tweets.)
predictions = []
tweet_ids = []
model.eval()  # inference mode for the model's layers
progress = tqdm.notebook.tqdm(data_loader)
with torch.no_grad():  # gradients are not needed for prediction
  for batch_ids, batch_inputs, batch_masks in progress:
    outputs = model(
      batch_inputs.to(device),
      token_type_ids=None,
      attention_mask=batch_masks.to(device),
    )
    # outputs[0] holds the logits; keep only the column for class 1.
    positive_logits = outputs[0].detach()[:, 1]
    # NOTE(review): the decision is sigmoid(class-1 logit) > 0.5, which ignores the
    # class-0 logit; for a two-logit head an argmax/softmax rule may have been
    # intended — confirm against the training notebook before changing.
    probs = torch.sigmoid(positive_logits).cpu()
    predictions.extend((probs > 0.5).int().tolist())
    tweet_ids.extend(batch_ids.tolist())

HBox(children=(FloatProgress(value=0.0, max=11690.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors





In [14]:
# Assemble the results table: one row per tweet, holding its id and predicted label.
df = pd.DataFrame({"id": tweet_ids, "label": predictions})

In [19]:
# Preview the last rows as a quick sanity check of the ids and predicted labels.
df.tail()

Unnamed: 0,id,label
1496305,188190088583331840,0
1496306,179451893536399360,0
1496307,1235831141048844289,0
1496308,1173003480757350401,1
1496309,1030150712179666944,0


In [16]:
# Frequency of each predicted label.
df["label"].value_counts()

0    1077850
1     418460
Name: label, dtype: int64

In [17]:
# Save the predicted labels (columns: id, label) to Drive as CSV, without writing the row index.
df.to_csv("/content/drive/MyDrive/BDT_BEST_MODEL/tweets-labels.csv",index=False)