In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
SOURCE_DIR = '/content/gdrive/MyDrive/mahsa_amini_data.zip'

In [None]:
!unzip '/content/gdrive/MyDrive/mahsa_amini_data.zip'

Archive:  /content/gdrive/MyDrive/mahsa_amini_data.zip
replace mahsa_amini_data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: mahsa_amini_data.csv    


In [10]:
path = '/content/mahsa_amini_data.csv'

In [3]:
!pip install transformers -q

In [4]:
!pip install datasets -q

In [27]:
import torch
import re
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM
from collections import OrderedDict
from datasets import load_dataset
from collections import Counter

In [6]:
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

In [7]:
def find_k_nearest_neighbors(word, embedding_dict, k):
  words_cosine_similarity = dict()
  for token in embedding_dict.keys():
    words_cosine_similarity[token] = cos(embedding_dict[word], embedding_dict[token]).item()
  words_cosine_similarity = dict(sorted(words_cosine_similarity.items(), key=lambda item: item[1]))
  return list(words_cosine_similarity.keys())[-k:][::-1]

def delete_hashtag_usernames(text):
  try:
    result = []
    for word in text.split():
      if word[0] not in ['@', '#']:
        result.append(word)
    return ' '.join(result)
  except:
    return ''

def delete_url(text):
  text = re.sub(r'http\S+', '', text)
  return text

In [8]:
word = "زندگی"
k = 10

#0. Data preprocessing

In [11]:
# 1. extract all tweets from files and save them in memory base on each year.
# 2. remove urls, hashtags and usernames.
corpus = []
my_file = pd.read_csv(path)['Text']

In [12]:
for line in my_file:
  temp = delete_hashtag_usernames(line)
  corpus.append(delete_url(temp))

#1. One hot encoding

In [20]:
# 1. find one hot encoding of each word for each year
# 2. find 10 nearest words from "زندگی"
distinct_words = set()
for line in corpus:
  for w in line.split(' '):
    distinct_words.add(w)

l = len(distinct_words)
embedding_dict = {}
for i, w in enumerate(distinct_words):
  v1 = np.zeros(l)
  v1[i] = 1
  v2 = torch.from_numpy(v1)
  embedding_dict[w] = v2

neighbors = find_k_nearest_neighbors(word, embedding_dict, k)
neighbors

['آزادی',
 'دادین.',
 'تحریکشون',
 '«جلاد',
 'شدین؟؟',
 'رهام',
 'نگرانم.',
 'عکس:',
 'گینسه',
 'دومیش']

###**Advantages:**
- One-Hot-Encoding has the advantage that the result is binary rather than ordinal and that everything sits in an orthogonal vector space.

###**Disadvantages:**
- The disadvantage is that for high cardinality, the feature space can really blow up quickly and you start fighting with the curse of dimensionality and the vectors is so sparse.
- If we add a document to our corpus, every vectors will change!

#2. TF-IDF

In [None]:
# 1. find the TF-IDF of all tweets.
# 2. choose one tweets randomly.
# 3. find 10 nearest tweets from chosen tweet.

n = len(corpus)
m = len(distinct_words)

word2ind = {}
df = {}
tf_idf={}
words_per_doc = []

for i, w in enumerate(distinct_words):
  word2ind[w]= i

for d in corpus:
  for w in d.split(' '):
    if w in df:
      df[w] += 1
    else:
      df[w] = 1

l = len(word2ind)

for d in corpus:
  words_per_doc = d.split(' ')
  tf_idf[d] = np.zeros(l)
  c = Counter(words_per_doc)
  word_per_doc_items = list(c.items())
  all_words = len(words_per_doc)
  for item in word_per_doc_items:
    tf = item[1] / all_words
    idf = np.log10( n / df[item[0]])  
    i = word2ind[item[0]]
    tf_idf[d][i] = tf * idf

embedding_dict = {}
for d in corpus:
  embedding_dict[d] = torch.from_numpy(tf_idf[d])

t = corpus[217]
print("Chosen Tweet:" + t + "\n")
neighbors = find_k_nearest_neighbors(t, embedding_dict, k)
neighbors

Chosen Tweet:برای اهواز



['برای اهواز',
 'ادامه اعتراضات وسط خیابان\u200cهای اهواز جمعه ۸ مهر، اعتراضات سراسری اهواز ',
 'دمتون گرم بچه های اهواز این از امشب ،امیدواریم فردا همه محلات اهواز رو درکف خیابان ببینیم ',
 'Now in مردم معترض در اهواز به خیابان\u200cها رفتند ',
 'از طرف مهدی دریس مولایی آبان ۹۸ 🖤 اهواز',
 'من تا حالا نمیدونستم حزبی به نام تضامن دموکراتیک اهواز (عربستان) با ۶۰۰ فالوئر وجود خارجی داره ولی به برکت شبکه ایران اینترنشنال باهاشون آشنا شدم ',
 'برای برای',
 'برای برای برای',
 'برای',
 'من اهل سنندج هستم من اهل اهواز هستم من اهل تهران هستم من اهل عسلویه هستم من اهل اشنویه هستم من ایرانیم ،ایرانی']

###**Advantages:**
- Easy to compute
- You have some basic metric to extract the most descriptive terms in a document
- You can easily compute the similarity between 2 documents using it

###**Disadvantages:**
- TF-IDF is based on the bag-of-words (BoW) model, therefore it does not capture position in text, semantics, co-occurrences in different documents, etc.
- For this reason, TF-IDF is only useful as a lexical level feature.
- Cannot capture semantics (e.g. as compared to topic models, word embeddings)
- Because tf-idf can experience the curse of dimensionality, it can also experience memory inefficiency. 

#3. Word2Vec

In [13]:
# 1. train a word2vec model base on all tweets for each year.
# 2. find 10 nearest words from "زندگی"

data = []
for d in corpus:
  data.append(d.split(' '))

model = Word2Vec(data, min_count = 1, size = 100, window = 5)
nearest_words = model.wv.most_similar(word)
nearest_words

[('زن', 0.994603157043457),
 ('زندگی،', 0.993238091468811),
 ('زن،', 0.9929890632629395),
 ('آزادی.', 0.9924159646034241),
 ('آزادی', 0.9920729398727417),
 ('میهن', 0.9915899038314819),
 ('،زندگی', 0.991539478302002),
 ('ازادی', 0.9906849265098572),
 ('هستیم.', 0.9897983074188232),
 ('امید', 0.9890443086624146)]

###**Advantages:**
- The idea is very intuitive, which transforms the unlabled raw corpus into labeled data (by mapping the target word to its context word), and learns the representation of words in a classification task.
- The data can be fed into the model in an online way and needs little preprocessing, thus requires little memory.
- The mapping between the target word to its context word implicitly embeds the sub-linear relationship into the vector space of words, so that relationships like “king:man as queen:woman” can be infered by word vectors.

###**Disadvantages:**
- The sub-linear relationships are not explicitly defined. There is little theoretical support behind such characteristic.
- The model could be very difficult to train if use the softmax function, since the number of categories is too large (the size of vocabulary).

#4. Contextualized embedding

In [15]:
model_checkpoint = "HooshvareLab/bert-base-parsbert-uncased"

In [16]:
def bert_text_preparation(text, tokenizer):
  marked_text = "[CLS] " + text + " [SEP]"
  tokenized_text = tokenizer.tokenize(marked_text)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  segments_ids = [1]*len(indexed_tokens)
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensor = torch.tensor([segments_ids])
  return tokenized_text, tokens_tensor, segments_tensor

In [17]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
  with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensor)
    hidden_states = outputs[2]
  token_embeddings = torch.stack(hidden_states, dim=0)
  token_embeddings = torch.squeeze(token_embeddings, dim=1)
  token_embeddings = token_embeddings.permute(1,0,2)
  token_vecs_sum = []
  for token in token_embeddings:
    sum_vec = torch.sum(token[-4:], dim=0)
    token_vecs_sum.append(sum_vec)
  return token_vecs_sum

In [28]:
# 1. fine tune a bert model base on all tweets for each year.
# 2. find 10 nearest words from "زندگی"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
context_embeddings = []
context_tokens = []
for sentence in corpus[:100]:
  tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
  list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
  tokens = OrderedDict()
  for token in tokenized_text[1:-1]:
    if token in tokens:
      tokens[token] += 1
    else:
      tokens[token] = 1
    token_indices = [i for i, t in enumerate(tokenized_text) if t == token]
    current_index = token_indices[tokens[token]-1]
    token_vec = list_token_embeddings[current_index]
    context_tokens.append(token)
    context_embeddings.append(token_vec)

In [23]:
distinct_words = set()
for w in context_tokens:
  distinct_words.add(w)
len(distinct_words)

710

In [None]:
embedding_dict = {}
for w1 in distinct_words:
  sum = 0
  c = 0
  for i, w2 in enumerate(context_tokens):
    if(w1 == w2):
      sum += context_embeddings[i]
      c += 1
  embedding_dict[w1] = sum / c

In [25]:
find_k_nearest_neighbors(word, embedding_dict, k)

['زندگی',
 'ازادی',
 'زن',
 'مبارزه',
 'شجاعت',
 'ازدواج',
 'ایران',
 'مهسا',
 'برای',
 'وطنم']

###**Advantages:**
- BERT works well for task-specific models. The state of the art model, BERT, has been trained on a large corpus, making it easier for smaller, more defined nlp tasks.

- Metrics can be fine-tuned and be used immediately.

- The accuracy of the model is outstanding because it is frequently updated. You can achieve this with successful fine-tuning training.

- The BERT model is available and pre-trained in more than 100 languages. This can be useful for projects that are not English-based.


###**Disadvantages:**
- The main drawbacks of using BERT and other big neural language models is the computational resources needed to train/fine-tune and make inferences.

- Most of the drawbacks of BERT can be linked to its size. While training the data on a large corpus significantly helps how the computer predicts and learns, there is also another side to it. They include:

- The model is large because of the training structure and corpus.

- It is slow to train because it is big and there are a lot of weights to update.

- It is expensive. It requires more computation because of its size, which comes at a cost.