# Imports

In [112]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt

In [41]:
import pyarabic.araby as araby

before_filter="اللَّهمَّ اغْفِرْ لنَا ولوالدِينَا"
after_filter = araby.strip_diacritics(before_filter)

print(after_filter)
# will print : اللهم اغفر لنا ولوالدينا

In [29]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'`` using pickle protocol 4.

    Args:
        obj: Any picklable object.
        name: Target path without the ``.pkl`` extension.

    The file is opened in a ``with`` block so the handle is flushed and
    closed even if pickling raises; the previous version left the handle
    open until garbage collection.
    """
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, protocol=4)
    
def load_obj(name):
    """Read back the object pickled in ``name + '.pkl'``.

    Args:
        name: Path of the pickle file without the ``.pkl`` extension.

    Returns:
        The unpickled object.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only use
    this on pickles you created yourself.
    """
    path = name + '.pkl'
    with open(path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Load data

In [2]:
# Relation-name table (semicolon-delimited). The separator is passed by
# keyword: pandas >= 2.0 accepts only the path positionally.
ds_rel_names = pd.read_csv('ar_rel_names.csv', sep=';')
ds_rel_names

In [3]:
# Word table (semicolon-delimited). The separator is passed by keyword:
# pandas >= 2.0 accepts only the path positionally.
ds_words = pd.read_csv('ar_words.csv', sep=';')
ds_words.head()

In [4]:
# Relation edges (semicolon-delimited). The separator is passed by keyword:
# pandas >= 2.0 accepts only the path positionally.
ds_relations = pd.read_csv('ar_relations.csv', sep=';')
ds_relations.head()

# Get related words

In [5]:
# Lookup table: word id -> word text (later duplicates of an id win).
dic_words = dict(zip(ds_words.id.values, ds_words.text.values))

In [7]:
# Lookup table: relation id -> relation name (later duplicates of an id win).
dic_rels = dict(zip(ds_rel_names.id.values, ds_rel_names.name.values))

In [8]:
# Resolve ids to text. Word ids missing from dic_words become '' so the rows
# can be filtered out later; an unknown relation id raises KeyError.
ds_relations['start_word'] = [dic_words.get(word_id, '') for word_id in ds_relations.start_id.values]
ds_relations['end_word'] = [dic_words.get(word_id, '') for word_id in ds_relations.end_id.values]
ds_relations['relation'] = [dic_rels[rel_id] for rel_id in ds_relations.relation_id.values]
ds_relations.head()

In [10]:
ds_relations.shape

In [11]:
ds_relations[ds_relations.end_word != ''].shape

In [12]:
# Keep only rows whose end word was resolved. The explicit .copy() makes this
# an independent frame, so the later column assignment
# (ds_word_rel_clean['cos_dist'] = ...) neither triggers
# SettingWithCopyWarning nor depends on view/copy semantics.
ds_word_rel_clean = ds_relations.loc[
    ds_relations.end_word != '', ['start_word', 'end_word', 'relation']
].copy()
ds_word_rel_clean

In [19]:
# Separator passed by keyword: positional arguments after the path are
# deprecated in DataFrame.to_csv.
ds_word_rel_clean.to_csv('arabic_word_relations.csv', sep=';', index=False, encoding='utf8')

In [13]:

Counter(ds_word_rel_clean.relation.values)

# Build classifier

## Compress classes

In [15]:
# Relation types that are merged into a single 'minor' class.
minor_cls = [
    'has_subevent', 'desires', 'causes_desire', 'not_desires',
    'motivated_by_goal', 'has_first_subevent', 'symbol_of', 'is_a',
    'distinct_from', 'similar_to', 'has_property', 'external_url',
]
new_relations = ['minor' if rel in minor_cls else rel
                 for rel in ds_word_rel_clean.relation.values]
new_relations

In [23]:
Counter(new_relations)

## Vectorize

In [17]:
import fasttext
import fasttext.util

# load the pre-trained Arabic fastText model (cc.ar.300.bin must already be on disk)
model = fasttext.load_model('cc.ar.300.bin')

In [19]:
X = [np.concatenate((model.get_word_vector(sw), model.get_word_vector(ew))) for sw, ew in zip(ds_word_rel_clean.start_word.values, ds_word_rel_clean.end_word.values)]

In [20]:
X[0].shape

In [21]:
y = new_relations

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Balance Dataset

In [26]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 

In [27]:
print('Original dataset shape %s' % Counter(y_train))
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
print()
print('Resampled dataset shape %s' % Counter(y_res))

## Build model

In [28]:
from catboost import CatBoostClassifier

In [29]:
cb_clf = CatBoostClassifier()
cb_clf.fit(X_res, y_res)

In [30]:
cb_clf.score(X_test, y_test)

In [31]:
from sklearn.metrics import f1_score
f1_score(y_test, cb_clf.predict(X_test), average='weighted')

In [24]:
from sklearn.metrics import confusion_matrix

In [32]:
y_pred = cb_clf.predict(X_test)

In [37]:
confusion_matrix(y_test, y_pred, labels=list(set(y_test)))

In [38]:
list(set(y_test))

In [39]:
Counter(y_test)

In [43]:
from sklearn.metrics import classification_report
# sorted() returns the new list; list.sort() sorts in place and returns None,
# so the original always passed target_names=None.
# The same list is passed as `labels` so names and rows stay aligned even if
# the classifier predicts a class that is absent from y_test.
target_names = sorted(set(y_test))
print(classification_report(y_test, y_pred, labels=target_names, target_names=target_names))

# Word distances

In [21]:
import fasttext
import fasttext.util

# load the pre-trained Arabic fastText model (cc.ar.300.bin must already be on disk)
model = fasttext.load_model('cc.ar.300.bin')

In [44]:
from scipy.spatial import distance


In [46]:
embedding_1 = [1, 0, 1]
embedding_2 = [0, 1, 0]

In [47]:
distance.cosine(embedding_1, embedding_2)

In [48]:
distance.euclidean(embedding_1, embedding_2)

In [49]:
cos_dists = [distance.cosine(model.get_sentence_vector(sw), model.get_sentence_vector(ew)) for sw, ew in zip(tqdm(ds_word_rel_clean.start_word.values), ds_word_rel_clean.end_word.values)]
ds_word_rel_clean['cos_dist'] = cos_dists

In [50]:
ds_word_rel_clean.describe()

In [51]:
# Restrict to the numeric column: on pandas >= 2.0 a bare .mean() raises
# TypeError because start_word / end_word are strings.
ds_word_rel_clean.groupby('relation')[['cos_dist']].mean()

In [52]:
# Restrict to the numeric column: on pandas >= 2.0 a bare .std() raises
# TypeError because start_word / end_word are strings.
ds_word_rel_clean.groupby('relation')[['cos_dist']].std()

In [62]:
ds_word_rel_clean[ds_word_rel_clean.relation=='synonym'].cos_dist.hist()

In [63]:
ds_word_rel_clean[ds_word_rel_clean.relation=='antonym'].cos_dist.hist()

In [84]:
def get_relations(d):
    """Label a cosine distance with fixed, hand-picked cut-offs.

    Args:
        d: Cosine distance between two word embeddings.

    Returns:
        'synonym' when d <= 0.1, 'antonym' when 0.9 <= d <= 1.2,
        otherwise 'unknown' (this includes NaN).
    """
    # Earlier experiments used 0.238098 + 0.376707 as the synonym bound and
    # 0.779119 +/- 0.182528 as the antonym band.
    if d <= 0.1:
        return 'synonym'
    if 0.9 <= d <= 1.2:
        return 'antonym'
    return 'unknown'

## Test statistical approach

In [54]:
ds_word_rel_clean

In [85]:
true_values = [r if r in ['antonym','synonym'] else 'unknown' for r in tqdm(ds_word_rel_clean.relation.values)]
predicts = [get_relations(d) for d in tqdm(ds_word_rel_clean.cos_dist.values)]

In [86]:
sum(np.array(true_values) == np.array(predicts))

In [87]:
# sorted() returns the new list; list.sort() sorts in place and returns None,
# so the original always passed target_names=None.
# The same list is passed as `labels` so names and rows stay aligned.
target_names = sorted(set(true_values))
print(classification_report(true_values, predicts, labels=target_names, target_names=target_names))

# Voting

In [93]:
y_pred_3val = [r[0] if r in ['antonym','synonym'] else 'unknown' for r in tqdm(list(y_pred))]
y_pred_3val

In [96]:
len(y_pred_3val)

In [98]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(ds_word_rel_clean.cos_dist.values, ds_word_rel_clean.relation.values, test_size=0.33, random_state=42)

In [99]:
predicts2 = [get_relations(d) for d in tqdm(X_test2)]

In [104]:
fltr = (np.array(predicts2) == np.array(y_test))&(np.array(y_pred_3val) == np.array(y_test))
sum(fltr)

In [110]:
Counter(np.array(predicts2)[fltr])

In [207]:
ds_antonyms = ds_word_rel_clean[ds_word_rel_clean.relation=='antonym']
ds_antonyms

# HLPL

In [1]:
from hlpl_arabic_words_synonym_antonym import arabic_words_synonym_antonym
hlpl_list_n,hlpl_list_v,conn=arabic_words_synonym_antonym.get()

In [2]:
hlpl_list_v[0][2].split(',')

## Assemble a dictionary

### Verbs

In [23]:
# Flatten the HLPL verb entries into [word, related_word, relation] triples.
# Each entry is indexed as: [0] headword, [1] comma-separated synonyms,
# [2] comma-separated antonyms. Empty items are skipped.
ant_syn_dic = []
for entry in tqdm(hlpl_list_v):
    word = entry[0].strip()
    print('word:', word)

    synonyms = [item.strip() for item in entry[1].split(',')]
    print('synonyms:', synonyms)
    ant_syn_dic.extend([word, syn, 'synonym'] for syn in synonyms if syn != '')

    antonyms = [item.strip() for item in entry[2].split(',')]
    print('antonyms:', antonyms)
    ant_syn_dic.extend([word, ant, 'antonym'] for ant in antonyms if ant != '')
        

### Nouns

In [24]:
# Append the HLPL noun entries to ant_syn_dic as
# [word, related_word, relation] triples. Each entry is indexed as:
# [0] headword, [1] comma-separated synonyms, [2] comma-separated antonyms.
for entry in tqdm(hlpl_list_n):
    word = entry[0].strip()
    print('word:', word)

    synonyms = [item.strip() for item in entry[1].split(',')]
    print('synonyms:', synonyms)
    ant_syn_dic.extend([word, syn, 'synonym'] for syn in synonyms if syn != '')

    antonyms = [item.strip() for item in entry[2].split(',')]
    print('antonyms:', antonyms)
    ant_syn_dic.extend([word, ant, 'antonym'] for ant in antonyms if ant != '')


In [None]:
ds_ant_syn_dic = pd.DataFrame(ant_syn_dic, columns=['start_word', 'end_word', 'relation'])

In [55]:
# clean the words of diacritics
ds_ant_syn_dic['start_word_cln'] = [araby.strip_diacritics(w) for w in tqdm(ds_ant_syn_dic.start_word.values)]
ds_ant_syn_dic['end_word_cln'] = [araby.strip_diacritics(w) for w in tqdm(ds_ant_syn_dic.end_word.values)]

# word lengths
ds_ant_syn_dic['sw_len'] = [len(w) for w in tqdm(ds_ant_syn_dic.start_word_cln.values)]
ds_ant_syn_dic['ew_len'] = [len(w) for w in tqdm(ds_ant_syn_dic.end_word_cln.values)]

In [56]:
ds_ant_syn_dic.head()

In [57]:
ds_ant_syn_dic.shape

In [59]:
ds_ant_syn_dic = ds_ant_syn_dic[(ds_ant_syn_dic.sw_len <= 12)&(ds_ant_syn_dic.ew_len <= 12)]
print(ds_ant_syn_dic.shape)
ds_ant_syn_dic.head()

## Stem the words

In [30]:
#TODO clean unneeded imports
import pandas as pd
import numpy as np
from nltk import word_tokenize
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#from sklearn.linear_model import LogisticRegression, LinearRegression, Lars, RidgeCV
#from sklearn.svm import SVC, SVR
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score

from tqdm import tnrange, tqdm_notebook
from time import sleep
import gc

from matplotlib import pyplot as plt
import os

from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# def get_lang(lang_code):
#     ds=pd.read_csv('lang_codes_dic.csv',';')
#     try:
#         return ds[ds.Code==lang_code].iloc[0,1]
#     except:
#         return 'Undefined'


#vec encoding of words
def alpha_vec2(w, mx, max_word_len, dic):
    """Encode a word as the flattened stack of its letters' rows from ``mx``.

    Args:
        w: The word to encode.
        mx: 2-D array; row ``k`` is the vector for ``dic[k]``.
        max_word_len: Number of character positions encoded; longer words
            are truncated and shorter ones are zero-padded.
        dic: List of known characters; a character's row in ``mx`` is found
            with ``dic.index``.

    Returns:
        1-D float16 array of length ``max_word_len * len(dic)``. Characters
        not in ``dic`` leave their position as zeros, and +/-inf entries
        are replaced by 0.
    """
    used_len = min(len(w), max_word_len)
    rows = np.zeros((max_word_len, len(dic)))
    for pos, ch in enumerate(w[:used_len]):
        if ch in dic:
            rows[pos] = mx[dic.index(ch)]

    flat = rows.astype('float16').flatten()
    # Zero out both +inf and -inf entries.
    flat[np.isinf(flat)] = 0
    return flat


#ordinal encoding of words
def alpha_vec2ord(w, max_word_len):
    """Encode a word as a fixed-length vector of Unicode code points.

    Args:
        w: The word to encode.
        max_word_len: Length of the output vector.

    Returns:
        Integer array of length ``max_word_len`` holding ``ord`` of each
        character, zero-padded on the right. Words longer than
        ``max_word_len`` are truncated.
    """
    vec = np.zeros(max_word_len)
    # Truncating here avoids the IndexError the unbounded loop raised for
    # words longer than max_word_len.
    for i in range(min(len(w), max_word_len)):
        vec[i] = ord(w[i])
    return vec.astype('int')


#ordinal decoding of words
def decode_vec(vec):
    """Turn a vector of code points back into a string.

    Zero entries are treated as padding and skipped wherever they occur;
    the result has leading and trailing whitespace removed.
    """
    chars = []
    for code in vec:
        if code != 0:
            chars.append(chr(int(code)))
    return ''.join(chars).strip()


def stemm_model():
    """Train and evaluate a random-forest model that predicts a word's root.

    Reads the module-level DataFrame ``ds``, which is not defined in this
    function and must provide string columns ``FORM_ar`` (word forms, the
    inputs) and ``ROOT_ar`` (roots, the targets).

    Word forms are encoded with ``alpha_vec2`` using a letter co-occurrence
    matrix built from character TF-IDF; roots are encoded with
    ``alpha_vec2ord``. A prediction counts as correct only when every
    position of the encoded root matches.

    Returns:
        Tuple ``(test_acc, train_acc, max_word_len, num_letters,
        number_words, lang, X_test, y_test, predicts_test, best_model)``.
    """
    lang = 'AR'
    print('Language: ', lang)
    number_words = ds.shape[0]

    X_lex = ds['FORM_ar'].str.strip()
    X_lex = X_lex.values

    Y_lex = ds['ROOT_ar'].str.strip()
    Y_lex = Y_lex.values

    X_train, X_test, y_train, y_test = train_test_split(X_lex, Y_lex, test_size=0.1, random_state=42)

    # Longest string among forms and roots; fixes the encoding width.
    max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

    # Char2vec model
    vectorizer = TfidfVectorizer(lowercase=False, analyzer='char')
    X = vectorizer.fit_transform(X_lex)
    # Letter dictionary ordered by feature index. Built from vocabulary_
    # because get_feature_names() was removed in scikit-learn 1.2; the
    # result is the same plain list, which alpha_vec2 needs for dic.index().
    dic = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    num_letters = len(dic)
    mx = X.T.dot(X)  # letter co-occurrence matrix
    mx = mx.toarray()

    # Vectorize
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    Y_lex_vec_train = [alpha_vec2ord(w, max_word_len) for w in y_train]

    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]
    Y_lex_vec_test = [alpha_vec2ord(w, max_word_len) for w in y_test]

    # Build model
    best_model = RandomForestClassifier(n_estimators=10, n_jobs=5, criterion='entropy', bootstrap=True, random_state=42)

    best_model.fit(X_lex_vec_train, Y_lex_vec_train)

    # Test: a word is correct only if all max_word_len positions match.
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)
    test_acc = sum([sum(p == y) == max_word_len for p, y in zip(predicts_test, Y_lex_vec_test)]) / len(predicts_test)
    train_acc = sum([sum(p == y) == max_word_len for p, y in zip(predicts_train, Y_lex_vec_train)]) / len(predicts_train)

    # Return results
    return test_acc, train_acc, max_word_len, num_letters, number_words, lang, X_test, y_test, predicts_test, best_model

In [38]:
# Load the pickled stemming artefacts: the model, the letter matrix `mx` and
# the letter list `dic` consumed by alpha_vec2.
# NOTE(review): this rebinds the global name `model`, which the fastText cells
# also use (model.get_word_vector / model.get_sentence_vector). Re-run the
# fastText load cell before any embedding cell executed after this one.
model = load_obj('ar_stem_model_RandForest')
mx = load_obj('ar_mx')
dic = load_obj('ar_dic')
# Same bound as the sw_len / ew_len <= 12 filter applied to ds_ant_syn_dic.
max_word_len = 12

In [39]:
dic

In [60]:
#Vectorize
X_start_word = [alpha_vec2(w, mx, max_word_len, dic) for w in tqdm(ds_ant_syn_dic.start_word_cln.values)]
X_end_word = [alpha_vec2(w, mx, max_word_len, dic) for w in tqdm(ds_ant_syn_dic.end_word_cln.values)]

In [61]:
sw_root_vecs = model.predict(X_start_word)
ew_root_vecs = model.predict(X_end_word)

In [62]:
ds_ant_syn_dic['sw_root'] = [decode_vec(x) for x in tqdm(sw_root_vecs)]
ds_ant_syn_dic['ew_root'] = [decode_vec(x) for x in tqdm(ew_root_vecs)]

In [63]:
ds_ant_syn_dic.head()

In [71]:
# Check for root permutation occurrences
def perm(w1, w2):
    """Return True when ``w2`` is a non-identical rearrangement of ``w1``.

    Compares the sorted character sequences, so letter multiplicity and
    length both count. The earlier set-based comparison also accepted
    pairs such as 'aab' / 'abb' or 'ab' / 'aab', which share a letter set
    but are not permutations of each other.

    Args:
        w1: First word (or root).
        w2: Second word (or root).

    Returns:
        False for identical strings; otherwise whether both strings consist
        of exactly the same characters with the same counts.
    """
    if w1 == w2:
        return False
    return sorted(w1) == sorted(w2)

perm('abs', 'baa')

In [72]:
ds_ant_syn_dic['root_perm'] = [perm(w1, w2) for w1, w2 in zip(tqdm(ds_ant_syn_dic.sw_root.values), ds_ant_syn_dic.ew_root.values)]

In [73]:
ds_ant_syn_dic['root_perm'].values.sum()

In [74]:
ds_ant_syn_dic[ds_ant_syn_dic['root_perm'] == True]

In [75]:
save_obj(ds_ant_syn_dic[ds_ant_syn_dic['root_perm'] == True], 'hlpl_root_perm_ant_syn')

In [80]:
save_obj(ds_ant_syn_dic, 'hlpl_root_perm_all')

In [76]:
ds_ant_syn_dic[ds_ant_syn_dic['root_perm'] == True].to_excel('hlpl_root_perm_ant_syn.xlsx')

In [79]:
Counter(ds_ant_syn_dic[ds_ant_syn_dic['root_perm'] == True].relation.values)

# Relations in root letter permutations

## Load data

In [111]:
df = pd.read_csv('Roots_permutations.csv')
df.head()

In [112]:
df = df[df['Valid_permutations'] != '[]']
df

In [113]:
# Parse the stringified lists in 'Valid_permutations' into lists of roots.
# Assumes values look like str(list), e.g. "['abc', 'def']" — TODO confirm.
# Each item is stripped: splitting on ',' alone leaves a leading space on
# every item after the first, and those padded strings would otherwise reach
# fastText and the pair de-duplication unchanged.
perms = [
    [item.strip() for item in rc.replace('[', '').replace(']', '').replace("'", '').split(',')]
    for rc in df['Valid_permutations'].values
]
perms

## Make pairs

In [114]:
# Pair every root with each of its listed permutations, echoing each pair.
word_pairs = []

for root, combs in zip(tqdm(df.ROOT_ar.values), perms):
    for candidate in combs:
        pair = [root, candidate]
        word_pairs.append(pair)
        print([root, candidate])
    

In [115]:
len(word_pairs)

In [116]:
word_pairs_clean = list(set([(a[0], a[1]) for a in word_pairs]))
word_pairs_clean

## Vectorize pairs

In [117]:
# Embed both words of every pair. X2 holds the concatenated vectors,
# vec_pairs the two separate vectors, vec_dists their cosine distance.
# `model` must be bound to the fastText model here (it provides get_word_vector).
vec_pairs, vec_dists, X2 = [], [], []
for first_word, second_word in word_pairs_clean:
    first_vec = model.get_word_vector(first_word)
    second_vec = model.get_word_vector(second_word)
    X2.append(np.concatenate((first_vec, second_vec)))
    vec_pairs.append([first_vec, second_vec])
    vec_dists.append(distance.cosine(first_vec, second_vec))


## Predict statistical

In [118]:
vec_dists[:10]

In [119]:
relations_def = [get_relations(d) for d in vec_dists]
relations_def

In [120]:
Counter(relations_def)

## Predict with ML

In [121]:
y2 = cb_clf.predict(X2)
y2

In [122]:
Counter([i[0] for i in y2])

In [123]:
res_ds = pd.DataFrame(word_pairs_clean, columns=['word1', 'word2'])
res_ds['relations_stat_appr'] = relations_def
res_ds['relations_ML_appr'] = [i[0] for i in y2]
res_ds

In [108]:
save_obj(res_ds, 'res_ds')

In [111]:
res_ds

In [112]:
translations = {word:trans for word, trans in zip(df.ROOT_ar.values, df.GLOSS.values)}

In [116]:
translations['زغر']

In [117]:
res_ds['w1_trans'] = [translations[w.strip()] for w in res_ds.word1.values]
res_ds['w2_trans'] = [translations[w.strip()] for w in res_ds.word2.values]
res_ds

In [118]:
res_ds.to_csv('res_ds.csv', index=False, encoding='utf8')

In [128]:
res_sample = res_ds.groupby('relations_ML_appr').apply(lambda s: s.sample(min(len(s), 10), random_state=2022))
res_sample

In [129]:
res_sample.to_excel("res_sample.xlsx") 

# Antonym and Synonym character distribution

In [120]:
shams = [0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
sum(shams)

## Ara Morph root permutations

In [121]:
# Semicolon-delimited file. The separator is passed by keyword: pandas >= 2.0
# accepts only the path positionally.
char_dist_ds = pd.read_csv('ant_syn_char_dist.csv', sep=';')
char_dist_ds

In [122]:
Counter(char_dist_ds.relations_ML_appr.values)

In [123]:
Counter(char_dist_ds.relations_stat_appr.values)

In [124]:
syns = list(char_dist_ds[char_dist_ds.relations_ML_appr == 'synonym'].word1.values) + list(char_dist_ds[char_dist_ds.relations_ML_appr == 'synonym'].word2.values)
syns = list(set([w.strip() for w in syns]))
syns

In [125]:
len(syns)

In [126]:
ants = list(char_dist_ds[char_dist_ds.relations_ML_appr == 'antonym'].word1.values) + list(char_dist_ds[char_dist_ds.relations_ML_appr == 'antonym'].word2.values)
ants = list(set([w.strip() for w in ants]))
ants

In [127]:
len(ants)

In [128]:
ar_abc = set([c for c in ''.join(ants+syns)])
ar_abc

In [129]:
len(ar_abc)

### Syns

In [130]:
vectorizer1 = CountVectorizer(analyzer='char', vocabulary=ar_abc)
X = vectorizer1.fit_transform(syns)
#len(vectorizer1.get_feature_names())


In [131]:
X.sum(axis=0).shape

In [132]:
X.sum(axis=0)

### Ants

In [133]:
vectorizer2 = CountVectorizer(analyzer='char', vocabulary=ar_abc)
X2 = vectorizer2.fit_transform(ants)
#len(vectorizer1.get_feature_names())


In [134]:
X2.sum(axis=0).shape

In [135]:
np.array(X2.sum(axis=0))[0]

In [136]:
# Letters in feature-column order. Read from vocabulary_ (term -> column
# index) because get_feature_names() was removed in scikit-learn 1.2.
' '.join(sorted(vectorizer1.vocabulary_, key=vectorizer1.vocabulary_.get))

In [137]:
# Letters in feature-column order. Read from vocabulary_ (term -> column
# index) because get_feature_names() was removed in scikit-learn 1.2.
ar_letters = sorted(vectorizer1.vocabulary_, key=vectorizer1.vocabulary_.get)
ar_letters

In [138]:
#plt.scatter(np.array(X.sum(axis=0))[0], np.array(X2.sum(axis=0))[0], c=shams)
x = np.array(X.sum(axis=0))[0]
x = x / sum(x) * 100
y = np.array(X2.sum(axis=0))[0]
y = y / sum(y) *100
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(x, y, c=shams, cmap='rainbow')

for i, txt in enumerate(ar_letters):
    ax.annotate(txt, (x[i]+0.2, y[i]), fontsize=20)
    
plt.xlabel('Synonym', fontsize=18)
plt.ylabel('Antonym', fontsize=16)

## HLPL

In [93]:
syns2 = list(set(ds_ant_syn_dic[(ds_ant_syn_dic.root_perm == True)&(ds_ant_syn_dic.relation == 'synonym')].sw_root.values)) + \
        list(set(ds_ant_syn_dic[(ds_ant_syn_dic.root_perm == True)&(ds_ant_syn_dic.relation == 'synonym')].ew_root.values))
#syns2 = list(set([w.strip() for w in syns]))
syns2 = list(set(syns2))
syns2

In [94]:
len(syns2)

In [95]:
ants2 = list(set(ds_ant_syn_dic[(ds_ant_syn_dic.root_perm == True)&(ds_ant_syn_dic.relation == 'antonym')].sw_root.values)) + \
        list(set(ds_ant_syn_dic[(ds_ant_syn_dic.root_perm == True)&(ds_ant_syn_dic.relation == 'antonym')].ew_root.values))
ants2 = list(set(ants2))
ants2

In [96]:
len(ants2)

In [97]:
ar_abc2 = set([c for c in ''.join(ants2+syns2)])
ar_abc2

In [98]:
len(ar_abc2)

In [115]:
# Per-letter 0/1 flags used to colour the HLPL scatter plot
# (presumably sun-letter markers — confirm).
shams2 = [0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0]
# Count the flags of this list; the original summed the unrelated `shams`.
sum(shams2)

### Syns

In [105]:
vectorizer3 = CountVectorizer(analyzer='char', vocabulary=ar_abc2)
X3 = vectorizer3.fit_transform(syns2)



In [106]:
X3.sum(axis=0).shape

In [107]:
X3.sum(axis=0)

### Ants

In [108]:
vectorizer4 = CountVectorizer(analyzer='char', vocabulary=ar_abc2)
X4 = vectorizer4.fit_transform(ants2)
#len(vectorizer1.get_feature_names())


In [109]:
X4.sum(axis=0).shape

In [110]:
np.array(X4.sum(axis=0))[0]

In [119]:
# Share (in %) of each letter among synonym roots (x) and antonym roots (y).
x = np.array(X3.sum(axis=0))[0]
x = x / sum(x) * 100
y = np.array(X4.sum(axis=0))[0]
y = y / sum(y) * 100
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(x, y, c=shams2, cmap='rainbow')

# Label points in the vectorizer's column order. Iterating the set ar_abc2
# directly yields an arbitrary order that need not match the columns of
# X3 / X4, so letters could be attached to the wrong points.
for i, txt in enumerate(sorted(vectorizer3.vocabulary_, key=vectorizer3.vocabulary_.get)):
    ax.annotate(txt, (x[i]+0.2, y[i]), fontsize=20)

plt.xlabel('Synonym', fontsize=18)
plt.ylabel('Antonym', fontsize=16)