In [1]:
# Make IPython display the value of every top-level expression statement in a
# cell rather than only the last one. Later cells depend on this to show
# several results (e.g. two `.shape` values and two `.head()` previews) at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import importlib
import json
import os
import sys
import traceback

from pathlib import Path

import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [3]:
# Expose only GPU index 2 to CUDA-based libraries loaded by this kernel.
# NOTE(review): the device index is machine-specific; adjust for your host.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [4]:
# Directory holding the per-tweet *.txt files; the final CSV is written here too.
data_dir = Path("/data") / "arabic_sa_tweets"

In [5]:
# Read every *.txt file under data_dir into two parallel lists:
#   ar_texts[k] - stripped file contents
#   labels[k]   - 0 if the file name starts with "neg", otherwise 1
ar_texts, labels = [], []
for path in data_dir.glob("*.txt"):
    try:
        # Path.read_text opens and closes the file itself, so no handle is
        # leaked (the previous bare open(...).read() never closed the file).
        # NOTE(review): this uses the platform default encoding; the BOM
        # character ("\ufeff") visible in the tokenized output suggests some
        # files carry a UTF-8 BOM - consider an explicit encoding.
        text = path.read_text()
    except UnicodeDecodeError:
        # latin1 maps every byte to a character, so this fallback cannot fail
        # with a decode error (the result may be mojibake, though).
        text = path.read_text(encoding="latin1")
    ar_texts.append(text.strip())
    labels.append(0 if path.name.startswith("neg") else 1)

In [6]:
# Show the path of every file whose text is empty after stripping.
# The directory is listed once up front instead of once per empty text.
# NOTE: this assumes glob yields files in the same order as when ar_texts was
# built, so that position i in ar_texts corresponds to txt_paths[i].
txt_paths = list(data_dir.glob("*.txt"))
for i, t in enumerate(ar_texts):
    if not t.strip():
        # Bare expression on purpose: with ast_node_interactivity="all" the
        # notebook displays its value.
        str(txt_paths[i])

'/data/arabic_sa_tweets/negative234.txt'

'/data/arabic_sa_tweets/negative294.txt'

'/data/arabic_sa_tweets/negative352.txt'

'/data/arabic_sa_tweets/negative473.txt'

In [7]:
# gctranslate is a project-local module; reload it so edits made to the module
# are picked up without restarting the kernel.
import gctranslate as gct
importlib.reload(gct)

# Translation client used by the translation loop below.
# NOTE(review): cache_path presumably points at an on-disk SQLite cache of
# earlier translations - confirm against gctranslate.GCTranslate.
translator = gct.GCTranslate(cache_path='./gctcache.sqlite3')

<module 'gctranslate' from '/root/ELSA/dataset/sentiment140/gctranslate.py'>

In [8]:
# Translation direction: Arabic source -> English target.
src, tgt = "ar", "en"

# Accumulators filled by the translation loop: one translated text per input,
# plus the running count of source characters processed.
en_texts = list()
total_chars = 0

In [9]:
# Translate every Arabic text to English, keeping en_texts aligned
# one-to-one with ar_texts (None marks a failed translation).

# Start from a clean slate so re-running this cell does not append duplicate
# translations or double-count characters.
en_texts = []
total_chars = 0

pbar = tqdm(ar_texts)
for i, text in enumerate(pbar):
    try:
        en_texts.append(translator.translate(text, tgt, src))
    except Exception:
        # Record the failure but keep going. `except Exception` (rather than a
        # bare except) lets KeyboardInterrupt stop this long-running loop.
        en_texts.append(None)
        traceback.print_exc()
    total_chars += len(text)
    # Refresh the progress-bar postfix only every 100 items.
    if i % 100 == 0:
        pbar.set_postfix(total_chars=str(total_chars))

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [10]:
# Put ../../script/ (relative to the kernel's working directory) on the import
# path so the project-local word_generator module can be imported.
sys.path.append("../../script/")
import word_generator as wg
# Reload so edits to word_generator.py are picked up without a kernel restart.
importlib.reload(wg)

<module 'word_generator' from '../../script/word_generator.py'>

In [11]:
def tokenize(texts, tokenizer):
    """Tokenize each text and return the results as JSON strings.

    Args:
        texts: Iterable of texts to tokenize.
        tokenizer: Object exposing ``tokenize(text)``; its return value must
            be JSON-serializable.

    Returns:
        A list with one entry per processed text: the JSON encoding of a
        one-element list wrapping ``tokenizer.tokenize(text)``, or ``None``
        when tokenizing or serializing that text raised an exception.
        If the run is interrupted with Ctrl-C / kernel interrupt, the list is
        returned early and is therefore shorter than ``texts``.
    """
    tokenized_texts = []
    for text in tqdm(texts):
        try:
            tokenized_texts.append(json.dumps([tokenizer.tokenize(text)]))
        except KeyboardInterrupt:
            # Manual interrupt: stop and hand back what has been done so far.
            break
        except Exception:
            # Best-effort: mark this text as failed and continue. Catching
            # Exception (not a bare except) lets SystemExit and other
            # BaseExceptions propagate instead of being silently swallowed.
            tokenized_texts.append(None)
    return tokenized_texts

In [12]:
# Build the default tokenizer for each language, Arabic first, then English.
ar_tokenizer, en_tokenizer = [
    wg.get_default_tokenizer(lang) for lang in ("ar", "en")
]

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/data/stanfordnlp_resources/ar_padt_models/ar_padt_tokenizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/data/stanfordnlp_resources/ar_padt_models/ar_padt_mwt_expander.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
Done loading processors!
---


In [13]:
# Tokenize the Arabic originals and their English translations; each result is
# a list of JSON strings (or None for texts that failed).
ar_tokens = tokenize(texts=ar_texts, tokenizer=ar_tokenizer)
en_tokens = tokenize(texts=en_texts, tokenizer=en_tokenizer)

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [14]:
# One row per tweet: sentiment label plus the JSON-encoded Arabic and English tokens.
ar_en_df = pd.DataFrame(dict(label=labels, ar=ar_tokens, en=en_tokens))

In [15]:
# Inspect the rows that have a missing value in any column.
ar_en_df.loc[ar_en_df.isna().any(axis="columns")]

Unnamed: 0,label,ar,en
1151,0,,[[]]
1217,0,,[[]]
1282,0,,[[]]
1416,0,,[[]]


In [16]:
# Rebuild the frame from the source lists so this cell can be re-run safely,
# then drop rows with any missing value. The bare expressions are displayed by
# the notebook: shape before, shape after, and a preview of each label.
ar_en_df = pd.DataFrame(dict(label=labels, ar=ar_tokens, en=en_tokens))
ar_en_df.shape
ar_en_df = ar_en_df.dropna()
ar_en_df.shape
ar_en_df.loc[ar_en_df["label"] == 0].head()
ar_en_df.loc[ar_en_df["label"] == 1].head()

(2000, 3)

(1996, 3)

Unnamed: 0,label,ar,en
1000,0,"[[""\ufeff\u0648"", ""\u0627\u0644\u0644\u0647"", ...","[[""And"", ""Allah"", ""is"", ""haraam"", ""and"", ""Alla..."
1001,0,"[[""\u062f\u0627\u0644\u0628\u0646\u0632\u064a\...","[[""Gasoline"", ""comes"", ""from"", ""Saudi"", ""Arabi..."
1002,0,"[[""\u0627\u0644\u0644\u0647"", ""\u064a\u0648\u0...","[[""God"", ""takes"", ""you""]]"
1003,0,"[[""\u0634\u0648"", ""\u0647\u0627\u062f"", ""\u062...","[[""Shoo"", ""Had"", ""a"", ""reasonable"", ""program"",..."
1004,0,"[[""\ufeff\u0644\u0644\u0623\u0633\u0641"", ""\u0...","[[""Unfortunately"", "","", ""some"", ""believe"", ""th..."


Unnamed: 0,label,ar,en
0,1,"[[""\ufeff\u062d\u0642\u0627""]]","[[""Really""]]"
1,1,"[[""\u0635\u062d"", ""\u062c\u062f\u0627""]]","[[""Very"", ""true""]]"
2,1,"[[""\ufeff\u0627\u0647"", ""\u0648"", ""\u0627\u064...","[[""Oh"", ""and"", ""grace""]]"
3,1,"[[""\u0643\u0644\u0627\u0645\u0643"", ""\u062c\u0...","[[""Your"", ""words"", ""are"", ""beautiful"", ""and"", ..."
4,1,"[[""\ufeff\u0648"", ""\u062e\u064a\u0631"", ""\u062...","[[""And"", ""the"", ""best"", ""speech"", ""is"", ""what""..."


In [17]:
# Persist the cleaned dataset next to the source files, without the index column.
ar_en_df.to_csv(str(data_dir / "uci_arabic_sa_tweets.csv"), index=False)