<a href="https://colab.research.google.com/github/liliDev-ing/testRepo/blob/master/poetry_wasaj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create custom Tokenizer

In [None]:
!pip install datasets
from datasets import load_dataset
import datasets
datasets.list_datasets()
#use a sample of 1000 words from the arabic oscar dataset
dataset = load_dataset(
   'oscar', 
   'unshuffled_deduplicated_ar' ,
    split='train[:1000]'
   )


In [None]:
import os
os.mkdir('./source_poetry')
os.mkdir('./image')
os.mkdir('./topics')

In [None]:
#download the sample dataset into local folder

from tqdm.auto import tqdm
text_data=[]
file_count=0
for sample in tqdm(dataset):
  sample= sample['text'].replace('\n', ' ')
  text_data.append(sample)
  #create .txt files from the oscar dataset
  if len(text_data)== 1_000:
    with open(f'./source_poetry_{file_count}.txt','w', encoding='utf-8')as fp:
      fp.write('\n'.join(text_data))
    text_data=[]
    file_count+=1

In [None]:
from pathlib import Path
paths= [str(x) for x in Path('./source_poetry').glob('**/*.txt')]

In [None]:
!pip install tokenizers
from tokenizers import BertWordPieceTokenizer

In [None]:
#create the Berttokenizer
tokenizer= BertWordPieceTokenizer(
    clean_text= True,
    handle_chinese_chars= False,
    strip_accents= False
)

In [None]:
#train the tokenizer and save it
tokenizer.train( files= paths,
                vocab_size= 10_000,
                min_frequency= 2,
                special_tokens= ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
                limit_alphabet=1000,
                wordpieces_prefix='##')
os.mkdir('./new_tokenizer')
tokenizer.save_model('./new_tokenizer')

In [None]:
#testing the tokenizer
!pip install transformers
from transformers import BertTokenizer
tokenizer= BertTokenizer.from_pretrained('./new_tokenizer')
tokenizer('ما بال عينك منها الماء ينسكب.')

# similarity between sentences

In [None]:
#use the created tokenizer with pytorch to compute similarity between sentences
#here is an example
sentences =["أَراني إِذا هَوَّمتُ يا مَيُّ زُرتِني فَيا نِعمَتا لَو أَنَّ رُؤيايَ تَصدُقُ",
            "لكل شيءٍ إذا ما تم نقصانُ فلا يُغرُّ بطيب العيش إنسانُ هي الأيامُ كما شاهدتها دُولٌ مَن سَرَّهُ زَمنٌ ساءَتهُ أزمانُ",
            "وكنتُ أَرَى من وَجْهِ مَيّةَ لَمحةً فأَبْرَقُ مَغشيًّا علـيَّ مكانيـا وأَسمــعُ منــها نَبـأةً فكأنّـما أَصابَ بها",
            "سَهمٌ طَريرٌ فؤاديا وَأَنصِبُ وَجهي نَحوَ مَكَّةَ بِالضُحى إِذا كانَ مِن فَرطِ اللَيالي بَدا ليا أُصلّي فما ",
            "مرحبا بالجميع في هذا المكان الجميل",
            "أدري إذا ما ذكرتُها أثنتينِ صلّيتُ الضُّحى أم ثمانيا"]

In [None]:
from transformers import AutoTokenizer, AutoModel, BertModel
import torch

In [None]:

model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

# initialize dictionary that will contain tokenized sentences
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    # tokenize sentence and append to dictionary lists
    new_tokens = tokenizer.encode_plus(sentence, max_length=128, truncation=True,
                                       padding='max_length', return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
tokens['input_ids'].shape

In [None]:
outputs = model(**tokens)

In [None]:
embeddings = outputs.last_hidden_state
attention_mask = tokens['attention_mask']
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
masked_embeddings = embeddings * mask
summed = torch.sum(masked_embeddings, 1)
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
mean_pooled = summed / summed_mask

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
res = cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
    )

In [None]:
res

array([[0.83476675, 0.83850056, 0.8610432 , 0.83421755, 0.7608243 ]],
      dtype=float32)

# Link words with figures

In [35]:
import nltk

In [None]:
nltk.download('punkt')

In [37]:
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer
import glob

In [None]:

st = ISRIStemmer()
#word_list = "ما بالُ عَينِكَ مِنها الماءُ يَنسَكِبُ كَأَنَّهُ مِن كُلى مَفرِيَّة سَرِبُ  "
word_list = input()

def filter(word_list):
    wordsfilter=[]
    for a in word_tokenize(word_list):
        stem = st.stem(a)
        wordsfilter.append(stem)
    return wordsfilter


mm =filter(word_list)



 
images = [os.path.split(file)[1][:-4] for file in glob.glob('image/*.jpg')]
gg = []
for l in mm:
    if l in images:
        gg.append('image\\'+l+'.jpg')
        
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
for i in gg:
    img = mpimg.imread(i)
    imgplot = plt.imshow(img)
    plt.show()
    print(os.path.split(i)[1][:-4])