In [1]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [4]:
data = pd.read_excel("full_dataset.xlsx")

In [None]:
data

#### Exploration

In [None]:
data.rId.value_counts()

In [None]:
#max no. of sentences per text
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=data, x='rId')
plt.xticks([])

plt.show()

In [None]:
df_infos = pd.DataFrame(data.rId.value_counts())

In [None]:
# Mean number of sentences per text.
# The counts column produced by value_counts() is named 'rId' on pandas < 2.0
# but 'count' on pandas >= 2.0, so it is selected by position rather than label.
df_infos.iloc[:, 0].mean()

#### Processing

In [None]:
!pip install transformers

In [5]:
data_sent = data[['_id', 'sentence', 'rId']]

In [None]:
data_sent['sentence'].isna().sum()

In [7]:
data_sent = data_sent.dropna(subset=['sentence'])

In [None]:
data_sent.shape

#### Tokenisation & encoding

In [None]:
import numpy as np
from transformers import BertTokenizer

model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = BertTokenizer.from_pretrained(model_name)

sentences = data_sent.sentence.tolist()

def tokenize(sent, tok=None):
  """Tokenize one sentence and build its sentence-start tags.

  Args:
    sent: the sentence text.
    tok: object exposing ``tokenize(text) -> list``; defaults to the
      notebook-level ``tokenizer``.

  Returns:
    ``(tokens, tags)`` where ``tags`` has one entry per token: ``'1'`` for the
    first token of the sentence and ``'0'`` for every other token.
  """
  if tok is None:
    tok = tokenizer
  sent_tokenized = tok.tokenize(sent)
  # A sentence that yields no tokens must yield no tags either; otherwise the
  # token and tag lists would have different lengths.
  if not sent_tokenized:
    return sent_tokenized, []
  sent_iob = ['1'] + ['0'] * (len(sent_tokenized) - 1)
  return sent_tokenized, sent_iob

data_sent['sent_tokenized'], data_sent['sent_iob'] = zip(*data_sent['sentence'].apply(tokenize))

In [None]:
all_pargraphs = data_sent.rId.unique().tolist()
len(all_pargraphs)

#### Preprocessing (didn't improve the results)

In [None]:
!pip install pyarabic

In [None]:
import pyarabic.araby as araby

def clean_text(text):
    """Normalise one sentence: strip diacritics, digits and Latin letters.

    Args:
        text: the raw value; it is converted with ``str()`` first.

    Returns:
        The cleaned string with runs of whitespace collapsed to one space and
        no leading/trailing whitespace.
    """
    ## Remove Tashkeel
    text = araby.strip_diacritics(str(text))
    ## Remove numbers
    text = re.sub(r'\d+', ' ', text)
    ## Remove Non-Arabic Letters
    text = re.sub(r'[A-Za-z]+', ' ', text)
    ## Remove extra whitespace last: the removals above each leave a space
    ## behind, so collapsing earlier would leave double spaces in the result.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data_sent['sentence']=data_sent['sentence'].apply(clean_text)

#### Embedding & concatenating

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

In [None]:
import torch

device = torch.device("cpu")

if torch.cuda.is_available():
   print("Training on GPU")
   device = torch.device("cuda:0")

In [None]:
from transformers import BertTokenizer, AutoTokenizer, AutoModel

model_name = "aubmindlab/bert-base-arabertv02"
model = AutoModel.from_pretrained(model_name).to(device)

#### Generate 10K paragraphs

In [None]:
# concatenation

import random
from tqdm import tqdm

# Word-piece budget per generated paragraph: 512 positions minus [CLS] and [SEP].
MAX_PARAGRAPH_TOKENS = 510
# Candidate number of sentences per generated paragraph.
CHUNK_SIZES = [3, 4, 5, 6, 7, 8, 9]

# Dedicated, seeded generator so the same paragraphs are produced on every run
# without touching the global `random` state.
chunk_rng = random.Random(42)

ids = []
sentences_concat = []
sentences_token_concat = []
iob_encoding_concat = []

# sort=False keeps the texts in their order of first appearance.
for paragraph_id, data_paragraph in tqdm(data_sent.groupby('rId', sort=False)):
    start = 0
    while start < len(data_paragraph):
        num = chunk_rng.choice(CHUNK_SIZES)
        df_paragraph = data_paragraph.iloc[start:start + num]
        num = len(df_paragraph)  # the tail of a text may hold fewer sentences

        # Drop trailing sentences until the chunk fits the token budget. The
        # length comes from the pre-computed tokens (the same tokens the labels
        # are built from), so no tokenizer or model call is needed here.
        token_counts = df_paragraph['sent_tokenized'].map(len).tolist()
        while num > 1 and sum(token_counts[:num]) > MAX_PARAGRAPH_TOKENS:
            num -= 1
        # A single over-long sentence is kept as-is (num stays 1) so the loop
        # always advances.
        df_paragraph = df_paragraph.iloc[:num]

        # Text, tokens and tags are all derived from the *final* chunk so they
        # always describe the same sentences.
        sentences_concat.append(' '.join(df_paragraph['sentence']))
        sentences_token_concat.append(
            [token for tokens in df_paragraph['sent_tokenized'] for token in tokens]
        )
        iob_encoding_concat.append(
            [tag for tags in df_paragraph['sent_iob'] for tag in tags]
        )
        ids.append(paragraph_id)

        start += num

#### Generate 400K paragraphs

In [None]:
import random
from tqdm import tqdm

# Window sizes (in sentences) generated from every start position.
MIN_SENTENCES = 3
MAX_SENTENCES = 9
# Word-piece budget per generated paragraph: 512 positions minus [CLS] and [SEP].
MAX_PARAGRAPH_TOKENS = 510

ids = []
sentences_concat = []
sentences_token_concat = []
# Reset here as well: this cell appends to it, so leftovers from another cell
# would give the label column a different length than the text column.
iob_encoding_concat = []

# sort=False keeps the texts in their order of first appearance.
for paragraph_id, data_paragraph in tqdm(data_sent.groupby('rId', sort=False)):
    sentences = data_paragraph['sentence'].tolist()
    tokens_per_sentence = data_paragraph['sent_tokenized'].tolist()
    tags_per_sentence = data_paragraph['sent_iob'].tolist()
    n_sentences = len(sentences)

    # Slide the window start over the text while at least MIN_SENTENCES remain.
    for start in range(n_sentences - MIN_SENTENCES + 1):
        # Never ask for more sentences than are left; otherwise every larger
        # window size would repeat the same (shorter) paragraph.
        largest = min(MAX_SENTENCES, n_sentences - start)
        for num in range(MIN_SENTENCES, largest + 1):
            stop = start + num
            iob_list = [tag for tags in tags_per_sentence[start:stop] for tag in tags]

            # Windows only grow with num, so once one is too long all larger
            # ones from this start are too long as well.
            if len(iob_list) > MAX_PARAGRAPH_TOKENS:
                break

            sentences_concat.append(' '.join(sentences[start:stop]))
            sentences_token_concat.append(
                [token for tokens in tokens_per_sentence[start:stop] for token in tokens]
            )
            iob_encoding_concat.append(iob_list)
            ids.append(paragraph_id)

In [None]:
# Gather the generated paragraphs, their tokens, their tags and the id of the
# text they come from into a single frame (one row per generated paragraph).
data_sent_concat = pd.DataFrame(
    {
        'sent_concat': sentences_concat,
        'sent_token_concat': sentences_token_concat,
        'iob_concat': iob_encoding_concat,
        'rId': ids,
    }
)

In [None]:
data_sent_concat.head()

#### Train Bi-LSTM (Pytorch)

In [1]:
import ast


def parse_iob(value):
    """Return the tag sequence `value` as a list of ints.

    `value` may be the string form of a list (e.g. "['1', '0']") or an
    already-parsed sequence, so applying this more than once is harmless.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    return [int(tag) for tag in value]


data_sent_concat['iob_concat'] = data_sent_concat['iob_concat'].apply(parse_iob)

In [13]:
data_sent_concat["sent_concat"] = data_sent_concat["sent_concat"].astype(str)
X_padded = data_sent_concat['sent_concat'].to_list()

In [None]:
from transformers import AutoModel

model_name="aubmindlab/bert-base-arabertv02"
bert = AutoModel.from_pretrained(model_name)
maxlen = bert.config.max_position_embeddings - 2 # 2 for the special tokens [CLS] and [SEP]

#maxlen = 510

In [None]:
maxlen

In [None]:
#data_sent_concat.head()

In [15]:
import ast
# Convert every tag sequence to a list of ints. Only string values are parsed
# with literal_eval; sequences that are already lists are converted directly,
# so re-running this cell does not raise.
data_sent_concat['iob_concat'] = data_sent_concat['iob_concat'].apply(
    lambda x: [int(tag) for tag in (ast.literal_eval(x) if isinstance(x, str) else x)]
)

In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad (and, if needed, cut) every tag sequence to `maxlen` at the END.
# truncating='post' is required: pad_sequences truncates from the front by
# default, whereas the tokenizer used for the inputs truncates at the end, so
# the default would shift the labels of over-long examples.
y_padded = pad_sequences(
    data_sent_concat['iob_concat'],
    padding='post',
    truncating='post',
    dtype='int32',
    maxlen=maxlen,
)
# Add a trailing axis of size 1: one value per token position.
y_padded = np.expand_dims(y_padded, axis=-1)

In [None]:
y_padded.shape

In [32]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X_padded, y_padded, test_size=0.2, random_state=42)

# Ordered 90/10 hold-out: the first 90% of the examples are used for training
# and the remainder for testing (no shuffling).
len_x = len(X_padded)
num = int(len_x * 0.9)
X_train, X_test = X_padded[:num], X_padded[num:]
y_train, y_test = y_padded[:num], y_padded[num:]

In [None]:
len(X_train)

In [None]:
len(X_test)

In [None]:
X_padded[0]

In [22]:
import torch
# from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
tokenizer_kwargs = {
    "return_tensors": "pt",
    "padding": "max_length",
    "truncation": True,
    "max_length": 512
}

class CustomDataset(Dataset):
    """Dataset that tokenizes raw texts on access and pairs them with labels.

    Both access paths return labels with a leading batch axis so that
    `collate_fn` can concatenate them with `torch.cat(..., dim=0)`.

    Args:
        X: indexable collection of texts.
        y: label tensor whose first axis is aligned with `X`.
        tokenizer: callable applied to a text or a list of texts.
        tokenizer_kwargs: keyword arguments forwarded to `tokenizer`.
    """

    def __init__(self, X, y, tokenizer, tokenizer_kwargs):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return `(encoded_text, label)` for one example.

        The label gets a leading axis of size 1 (a tensor, not a Python list)
        so it has the same layout as the batched labels of `__getitems__`.
        """
        inputs = self.tokenizer(self.X[idx], **self.tokenizer_kwargs)
        return inputs, self.y[idx].unsqueeze(0)

    def __getitems__(self, idxs):
        """Return a one-element list holding the whole batch for `idxs`.

        All texts are tokenized in a single call; the batch is wrapped in a
        list because `collate_fn` iterates over `(inputs, labels)` pairs.
        """
        inputs = self.tokenizer([self.X[idx] for idx in idxs], **self.tokenizer_kwargs)
        return [(inputs, self.y[idxs])]


def collate_fn(batch):
    """Merge `(encoding, labels)` pairs into one batch.

    Every encoding must provide "input_ids", "attention_mask" and
    "token_type_ids"; each field, and the labels, are concatenated along
    their first axis.

    Returns:
        `(X, y)`: a dict with the three concatenated fields and the
        concatenated labels.
    """
    field_names = ("input_ids", "attention_mask", "token_type_ids")
    collected = {name: [] for name in field_names}
    labels = []

    for encoding, label in batch:
        for name in field_names:
            collected[name].append(encoding[name])
        labels.append(label)

    X = {name: torch.cat(collected[name], dim=0) for name in field_names}
    y = torch.cat(labels, dim=0)
    return X, y

# Convert data to tensors
# X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
# X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoaders
# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_dataset = CustomDataset(X_train, y_train_tensor, tokenizer, tokenizer_kwargs)
test_dataset = CustomDataset(X_test, y_test_tensor, tokenizer, tokenizer_kwargs)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn) #batch_size=16

In [23]:
import torch.nn as nn

from transformers import BertTokenizer, AutoTokenizer, AutoModel



class BiLSTMModel(nn.Module):
    """Frozen BERT encoder followed by a BiLSTM per-token binary classifier.

    Args:
        input_dim: size of the encoder's hidden states (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        model_name: name or path of the pretrained encoder to load.
    """

    def __init__(self, input_dim, hidden_dim, model_name="aubmindlab/bert-base-arabertv02"):
        super(BiLSTMModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # The encoder is only a feature extractor (forward runs it under
        # no_grad), so mark its weights as non-trainable and keep it in
        # inference mode.
        for param in self.bert.parameters():
            param.requires_grad_(False)
        self.bert.eval()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def train(self, mode=True):
        """Switch the LSTM head between train/eval; the encoder stays in eval.

        Without this, `model.train()` would also enable dropout inside the
        frozen encoder, so training would see noisier embeddings than
        evaluation does.
        """
        super(BiLSTMModel, self).train(mode)
        self.bert.eval()
        return self

    def forward(self, x):
        """Return one probability per token position.

        Args:
            x: mapping of encoder keyword arguments (tokenizer output).

        Returns:
            Tensor of shape (batch, seq_len - 2, 1): the first and last
            positions of the encoder output are dropped before the LSTM.
        """
        with torch.no_grad():
            outputs = self.bert(**x)
            x = outputs.last_hidden_state[:, 1:-1, :]  # embeddings
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out)
        return self.sigmoid(output)


# input_dim = X_train.shape[2]  # Assuming that X_train has the form (batch_size, seq_length, input_dim)
from transformers import AutoModel

# Read the encoder's hidden size; it is used below as the LSTM input width.
# NOTE(review): this loads a full copy of the encoder only to read its config,
# while BiLSTMModel loads its own copy in __init__ — consider reading the size
# from the model config alone.
model_name="aubmindlab/bert-base-arabertv02"
bert = AutoModel.from_pretrained(model_name)
input_dim = bert.config.hidden_size


#input_dim = 768
# LSTM hidden size per direction (the classifier head sees 2 * hidden_dim).
hidden_dim = 32
lstm_model = BiLSTMModel(input_dim, hidden_dim)


In [None]:
input_dim

In [None]:
import torch

device = torch.device("cpu")

if torch.cuda.is_available():
   print("Training on GPU")
   device = torch.device("cuda:0")

In [None]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm_model.to(device)

In [None]:

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)

num_epochs = 5

for epoch in range(num_epochs):
    lstm_model.train()
    # Accumulated over the epoch so the printed value is the epoch mean and
    # not just the loss of whichever mini-batch happened to come last.
    epoch_loss = 0.0
    n_batches = 0
    for batch_X, batch_y in tqdm(train_loader):
        batch_X = {k: v.to(device) for k, v in batch_X.items()}
        batch_y = batch_y.to(device)

        # Forward pass. Only the trailing size-1 axis is dropped, so a batch
        # of one example keeps its batch axis.
        outputs = lstm_model(batch_X).squeeze(-1)
        loss = criterion(outputs, batch_y.squeeze(-1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    mean_loss = epoch_loss / max(n_batches, 1)  # guard against an empty loader
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}")

In [None]:
# Save model:
torch.save(lstm_model, '/content/gdrive/MyDrive/model_lstm.pth')

In [None]:
torch.cuda.empty_cache()
#gc.collect()
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:5"

In [None]:
# The checkpoint is a whole pickled module (it was written with
# torch.save(lstm_model, ...)), not a state_dict:
#   - weights_only=False is needed on PyTorch versions whose torch.load
#     defaults to weights-only loading; only use it on checkpoints you trust,
#     since unpickling can execute arbitrary code.
#   - map_location=device lets a checkpoint saved on GPU load on a CPU-only
#     runtime.
lstm_model = torch.load(
    '/content/gdrive/MyDrive/model_lstm.pth',
    map_location=device,
    weights_only=False,
)

In [None]:
lstm_model.to(device)

### Splitting score

In [None]:
from tqdm import tqdm

# Per-example counts used for the splitting score:
#   all_splits  - number of true sentence boundaries (first sentence excluded)
#   all_matches - number of those boundaries that were also predicted
all_splits = []
all_matches = []

# Switching to inference mode is loop-invariant, so do it once.
lstm_model.eval()

for index in tqdm(range(len(X_test))) :
  single_example = tokenizer(X_test[index], **tokenizer_kwargs)
  single_example = {k: v.to(device) for k, v in single_example.items()}

  with torch.no_grad():
      output = lstm_model(single_example)
      prediction = (output > 0.5).float()  # Binarize the prediction

  true_label = y_test_tensor[index]
  labels = true_label.numpy()

  # Every example starts with a boundary tag; it is not a split.
  splits = np.sum(labels) - 1

  # Ignore first element
  y_true = labels[1:]
  y_pred = prediction.cpu().numpy()[0][1:]

  # Calculate the number of matches of 1
  matches = np.sum((y_true == 1) & (y_pred == 1))

  all_splits.append(splits)
  all_matches.append(matches)

In [None]:
sum(all_matches)/sum(all_splits)

## Inference



In [None]:
# !pip install transformers

#### Import some test phrases

In [None]:
# import data
import pandas as pd
data = pd.read_excel("full_dataset.xlsx")
data_sent = data[['_id', 'sentence', 'rId']]
data_sent = data_sent.dropna(subset=['sentence'])
data_sent.head()

In [None]:
# GPU:
import torch

device = torch.device("cpu")

if torch.cuda.is_available():
   print("Training on GPU")
   device = torch.device("cuda:0")

In [None]:
# Earlier draft of split_sent, kept for reference only. It had been disabled
# with an opening ''' that was never closed, which makes the cell a
# SyntaxError; it is commented out line by line instead.
#
# def split_sent(sent) :
#   # Encode:
#   inputs = tokenizer(sent, **tokenizer_kwargs).to(device)
#
#   #Predict lstm:
#   with torch.no_grad():
#       output = lstm_model(inputs)
#       prediction = (output > 0.5).float()
#
#
#   # Split and decode each segment:
#   indices = (prediction == 1).nonzero(as_tuple=True)[1].to(device)
#
#   indices = torch.cat((indices, torch.tensor([inputs["input_ids"].shape[0]]).to(device)))
#
#   segments = []
#   for i in range(len(indices) - 1):
#       start_index = indices[i]
#       end_index = indices[i+1]
#       segment_ids = inputs["input_ids"][0][start_index:end_index]
#       segments.append(tokenizer.decode(segment_ids, skip_special_tokens=True))
#
#   return(segments)

In [None]:
def split_sent(sent) :
  """Split a text into sentences using the trained boundary model.

  The text is encoded with the notebook-level `tokenizer` and
  `tokenizer_kwargs` and passed to `lstm_model` as a mapping, which is what
  its forward pass unpacks into the encoder.

  Args:
    sent: the text to split.

  Returns:
    List of decoded segments in order; an empty list when the text yields no
    tokens.
  """
  # Encode:
  encoded = tokenizer(sent, **tokenizer_kwargs)
  inputs = {k: v.to(device) for k, v in encoded.items()}

  # Predict lstm:
  lstm_model.eval()
  with torch.no_grad():
      output = lstm_model(inputs)

  # Real word pieces only: attended positions minus the two special tokens.
  n_tokens = int(inputs["attention_mask"][0].sum().item()) - 2
  if n_tokens <= 0:
      return []

  # The model drops the first encoder position, so prediction j belongs to
  # input_ids position j + 1; slicing the ids from position 1 re-aligns them.
  token_ids = inputs["input_ids"][0, 1:n_tokens + 1].tolist()
  starts = torch.nonzero(output[0, :n_tokens, 0] > 0.5).flatten().tolist()

  # The text always begins a segment, even if the model missed that boundary;
  # otherwise the tokens before the first predicted boundary would be lost.
  if not starts or starts[0] != 0:
      starts.insert(0, 0)
  bounds = starts + [n_tokens]

  # Split and decode each segment:
  segments = []
  for start_index, end_index in zip(bounds[:-1], bounds[1:]):
      segment_ids = token_ids[start_index:end_index]
      segments.append(tokenizer.decode(segment_ids, skip_special_tokens=True))

  return(segments)

In [None]:
test_seq = data_sent_concat['sent_concat'][0]
print(test_seq)

وتسويق المؤثرين هو شكل من أشكال التعاون الذي يحدث عندما تشترك العلامات التجارية مع المؤثرين للترويج لمنتج أو خدمة أو حملة. ويعمل هذا النوع من التسويق بشكل فعال؛ لأن المؤثرين يسيطرون على جمهورك المستهدف؛ حيث إن متابعيهم يثقون بهم بالفعل ويتطلعون إليهم للحصول على توصيات. وبمجرد العثور على المؤثر الذي يشارك المحتوى ذا الصلة ويبدو أنه مناسب لعلامتك التجارية يجب عليك بعد ذلك إقناعه بالعمل معك. وتشير التقديرات الحالية إلى أن سوق تسويق المؤثرين قد يصل إلى بين 5 و10 مليارات دولار بنهاية عام 2020. اقرأ أيضًا: 3 قواعد لتسويق مشروع ناجح.. كيف ترضي عملاءك؟ الفرق بين التسويق الإلكتروني وتسويق المؤثرين يمكننا تلخيص الفرق بين التسويق الإلكتروني وتسويق المؤثرين كما يلي: التسويق الإلكتروني أعم التسويق الإلكتروني جهد عام وشاق، فهو يشمل التسويق عبر محركات البحث، وعبر البريد الإلكتروني، ومن خلال وسائل التواصل الاجتماعي، بل عبر المؤثرين أنفسهم.


In [None]:
# Predicted phrases:
split_sent(test_seq)