In [2]:
import numpy as np
import pandas as pd
import pickle

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [4]:
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from gensim import corpora, models, similarities, matutils

In [5]:
import linguistica as lxa
from collections import Counter

In [6]:
ar_df = pickle.load(open("ar_df.pkl", "rb"))

In [27]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.dialectid import DialectIdentifier, label_to_region

In [45]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tagger.default import DefaultTagger

In [153]:
def clean_ar_text(text):
    mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
    tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, 
                                       scheme='bwtok',
                                      split=True)
    
    norm_text = normalize_unicode(text)
    clean_text = norm_text.strip()
    simple_tokens = simple_word_tokenize(clean_text)
    tokens = tokenizer.tokenize(simple_tokens)
    tokens_clean = []
    for token in tokens:
        if "+" not in token:
            tokens_clean.append(token)
    return tokens_clean

In [131]:
ar_df[['text', 'tweet']].iloc[0]

text      كورونا  كوفيد    صحة  الرياض  تثقيف  د ولاء ا...
tweet    #كورونا #كوفيد_19 #صحة #الرياض #تثقيف #د_ولاء_...
Name: 117750, dtype: object

In [114]:
len(ar_df['tweet'].iloc[0])

121

In [120]:
# text = ar_df['text'][0].apply(clean_ar_text)

In [129]:
type(ar_df['text'].iloc[0])

str

In [138]:
token = normalize_unicode(ar_df['text'].iloc[0])

In [139]:
token.strip()

'كورونا  كوفيد    صحة  الرياض  تثقيف  د ولاء الرفاس  السعوديون يد واحدة ضد كورونا  السعودية  خليكم في البيت  خليك بالبيت'

In [142]:
tokens = simple_word_tokenize(token)

In [141]:
mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, 
                                       scheme='bwtok',
                                      split=True)

In [143]:
tokens = tokenizer.tokenize(tokens)

In [144]:
tokens

['كور',
 '+و',
 '+نا',
 'كوفيد',
 'صح',
 '+ة',
 'ال+',
 'رياض',
 'تثقيف',
 'د',
 'ولاء',
 'ال+',
 'رفاس',
 'ال+',
 'سعودي',
 '+ون',
 'يد',
 'واحد',
 '+ة',
 'ضد',
 'كور',
 '+و',
 '+نا',
 'ال+',
 'سعودي',
 '+ة',
 'خلي',
 '+كم',
 'في',
 'ال+',
 'بيت',
 'خليك',
 'ب+',
 'ال+',
 'بيت']

In [154]:
clean_ar_text(ar_df['text'].iloc[0])

['كور',
 'كوفيد',
 'صح',
 'رياض',
 'تثقيف',
 'د',
 'ولاء',
 'رفاس',
 'سعودي',
 'يد',
 'واحد',
 'ضد',
 'كور',
 'سعودي',
 'خلي',
 'في',
 'بيت',
 'خليك',
 'بيت']

In [None]:
len(text)

In [None]:
print(text)

In [115]:
print(ar_df['text'].iloc[0])

 كورونا  كوفيد    صحة  الرياض  تثقيف  د ولاء الرفاس  السعوديون يد واحدة ضد كورونا  السعودية  خليكم في البيت  خليك بالبيت


In [156]:
ar_df['tokens'] = ar_df['text'].apply(clean_ar_text)

KeyboardInterrupt: 

In [None]:
# check tokens ^ this cell run recently

In [None]:
# ar_df[['tweet', 'text', 'tokens']].head()

In [None]:
ar_df.lang.value_counts()

In [34]:
# def id_dialect(text):
#     DID = DialectIdentifier.pretrained()
#     predictions = DID.predict(text)
# #     region_id = label_to_region(DID.predict(text))
#     top_dialect = [p.top for p in predictions]
#     return top_dialect
    

In [None]:
# text = ar_df['clean_text'][0:5].apply(id_dialect)

In [None]:
def p_o_s(text):
    mled = MLEDisambiguator.pretrained()
    tagger = DefaultTagger(mled, 'pos')
    pos_tagged = tagger.tag(text.split())
    return zip(text, pos_tagged)

In [None]:
ar_df['POS'] = ar_df['tokens'].apply(p_o_s)

In [None]:
def nouns(word, pos):
    is_noun = lambda pos: pos[:1] == 'noun'
    all_nouns = [word for (word, pos) in text if is_noun(text)]
    return all_nouns

In [None]:
ar_df['nouns'] = ar_df['POS'].apply(lambda (word, pos): nouns(word, pos))

In [None]:
ar_df[['tokens','nouns', 'POS']].head()

In [157]:
pickle.dump(ar_df, open('ar_df.pkl', 'wb'))

In [87]:
from sklearn.feature_extraction import text


In [62]:
stopword_url = "/Users/jess/workspace/Metis_Projects/Project_5/camel_tools/arabic-stop-words/list.txt"

In [82]:
def get_stopwords(file_location):
    lines = []
    file = open(file_location)
    lines = file.read().split('\n')
        
    file.close()
    return lines

In [83]:
stopwords = get_stopwords(stopword_url)

In [84]:
stopwords[0:5]

['،', 'ء', 'ءَ', 'آ', 'آب']

In [88]:
# my_stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)
my_stop_words = set(text.ENGLISH_STOP_WORDS + stopwords + 'http' + 'https')

In [90]:
len(my_stop_words)

1069

In [None]:
ar_df['filtered'] = en_df['tokens'].apply(lambda tweet: [word for word in tweet if word not in my_stop_words])

In [100]:
vectorizer = TfidfVectorizer(min_df=2, 
                             stop_words=my_stop_words)
tfidf1 = vectorizer.fit_transform(ar_df['filtered'])

In [102]:
vocab = vectorizer.vocabulary_

In [103]:
len(vocab)

360907

In [94]:
idf = vectorizer.idf_

In [95]:
type(idf)

numpy.ndarray

In [96]:
idf[0:5]

array([14.34159288, 14.34159288, 13.42530215, 10.9403955 , 14.34159288])

In [104]:
ar_tfidf_df = pd.DataFrame(tfidf1.toarray(), columns=vectorizer.get_feature_names())

In [105]:
ar_tfidf_df.head()

Unnamed: 0,2021,24,30,aa,aaa,aaaa,aaaaaabbbbk,aaaaaoxxxxx,aaaagroup,aaaalmarri,...,제이홉,조건없이,종대야,주헌,준면이,지민,지민아고마워사랑해,초이스,축하해,행복해
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
pickle.dump(tfidf1, open("tfidf1_sparse.pkl", 'wb'))
pickle.dump(vectorizer, open('ar_vectorizer.pkl', 'wb'))

In [None]:
def get_ngrams(df):
    data_dict = dict()

    tokens = []
    for row in en_df.index.tolist():
        tokens += en_df.loc[row, 'filtered'].split()

    lxa_object = lxa.from_corpus(tokens)

    unigrams = lxa_object.word_unigram_counter()
    bigrams = lxa_object.word_bigram_counter()
    trigrams = lxa_object.word_trigram_counter()

    df_unigrams = pd.DataFrame.from_dict(unigrams, orient='index') \
    .reset_index()


    df_bigrams = pd.DataFrame.from_dict(bigrams, orient='index') \
    .reset_index()


    df_trigrams = pd.DataFrame.from_dict(trigrams, orient='index') \
    .reset_index()

    df_unigrams.columns = ['N-Gram','Frequency']
    df_bigrams.columns = ['N-Gram','Frequency']
    df_trigrams.columns = ['N-Gram', 'Frequency']
    
    df_bigrams[['word1', 'word2']] = df_bigrams['N-Gram'].apply(pd.Series)
    df_trigrams[['word1', 'word2', 'word3']] = df_trigrams['N-Gram'].apply(pd.Series)

    data_dict['Unigrams'] = df_unigrams
    data_dict['Bigrams'] = df_bigrams
    data_dict['Trigrams'] = df_trigrams
    
    return data_dict

In [None]:
def viz(chart_data, num_grams=30, ascending=False, font_scale=1, 
        chart_scale='poster', chart_size=15, save_to_file=True, title=""):
   
    chart_data = chart_data.head(num_grams)\
    .reset_index().sort_values(by='Frequency', ascending=ascending).copy()
    
    sns.set_context(chart_scale, font_scale=font_scale)
    
    sns_plot = sns.catplot(x="Frequency", y='N-Gram', data=chart_data, kind="bar",\
                           size=chart_size, palette="flare");
    plt.title(title)
    plt.show();
    if save_to_file is not False:
        sns_plot.savefig(table_name)
    

In [None]:
titles = ['Unigrams', 'Bigrams', 'Trigrams']

for title in titles:
    viz(data_dict[title].sort_values(by='Frequency', ascending=False), num_grams=10, \
       chart_scale='poster', chart_size=9, title=title)