In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every bare expression statement in a cell, not only the
# last one; several cells below rely on this to render two tables at once.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import importlib
import json
import pandas as pd
import re
import unicodedata
import traceback
import os
import sys

from pathlib import Path
from tqdm import tqdm_notebook as tqdm

pd.set_option("display.max_colwidth", 300)

In [None]:
# Sets CUDA_VISIBLE_DEVICES to "1" for this process.
# NOTE(review): presumably meant to pin the GPU-backed tokenizer loaded later to
# a single device, and presumably has to run before that library initialises
# CUDA — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [3]:
data_dir = Path('/data/sentiment140/')

In [4]:
train_data_path = data_dir / "training.1600000.processed.noemoticon.csv"

In [5]:
# Read only column 0 and column 5 of the header-less CSV and name them
# 'polarity' and 'text'.
df = pd.read_csv(
    str(train_data_path),
    header=None,
    usecols=[0, 5],
    encoding='latin1',
)
df.columns = ['polarity', 'text']
# Preview a few rows for each of the two polarity values (0 and 4).
df.loc[df["polarity"].eq(0)].head()
df.loc[df["polarity"].eq(4)].head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


Unnamed: 0,polarity,text
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! Cant wait!! - GIRL TALK!!
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart."
800003,4,"Being sick can be really cheap when it hurts too much to eat real food Plus, your friends make you soup"
800004,4,@LovesBrooklyn2 he has that effect on everyone


In [6]:
df.polarity.value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

In [7]:
# Runs of whitespace (plus the ideographic space U+3000) collapse to one space.
CRLF_RE = re.compile(r'[\s\u3000]+')
# A leading "RT" in any letter case marks a retweet.
RETWEET_RE = re.compile(r'^[rR][tT]')
# Whole hashtag tokens. Not used by clean(), which only strips the '#' sign so
# that the hashtag word itself stays in the text.
HASHING_RE = re.compile(r'#[^\s]+')
# @mentions, including an optional trailing colon ("@user:").
MENTION_RE = re.compile(r'@[a-zA-Z0-9_]+:?')
URL_RE = re.compile(r'(?:url\s*)?(?:https?://|\w*\.\w+\.\w+)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\)…,]|[\u4E00-\u9FD0]|[あ-ん]|[\u30A1-\u30F4]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# The dot is escaped and the match starts on a word boundary, so ordinary text
# such as "at co/op" or "visit.co/x" is no longer mistaken for a t.co link.
TWEETLINK_RE = re.compile(r'\bt\.co/[a-zA-Z0-9]+')
# Stops at the first whitespace so that words following the link survive
# (the previous `.+` deleted everything up to the end of the tweet).
PIC_TWITTER_RE = re.compile(r'pic\.twitter\.com/\S+')


def clean(text, lang=None, retweet=False):
    """Normalise a raw tweet and strip Twitter-specific noise.

    Args:
        text: Raw tweet text.
        lang: Unused; kept so existing callers keep working.
        retweet: When true, a leading "RT" marker is removed and the tweet is
            kept. When false (the default), a tweet starting with "RT" is
            rejected.

    Returns:
        The cleaned text, or None when the input is not a string, is a
        rejected retweet, or is empty after cleaning.
    """
    # Explicit guard instead of a blanket try/except: a non-string cell (None,
    # NaN, bytes) is the only input the steps below cannot process, and real
    # programming errors should no longer be silently turned into None.
    if not isinstance(text, str):
        return None

    if retweet:
        text = RETWEET_RE.sub('', text)
    elif RETWEET_RE.match(text):
        return None

    text = unicodedata.normalize('NFKC', text)
    text = CRLF_RE.sub(' ', text)
    # Keep the hashtag word, drop only the marker.
    text = text.replace('#', '')
    # Order matters: URL_RE runs before the two narrower link patterns.
    for noise_re in (MENTION_RE, URL_RE, TWEETLINK_RE, PIC_TWITTER_RE):
        text = noise_re.sub('', text)
    text = text.strip()
    return text or None

In [8]:
# Series.map calls clean() once per tweet without materialising a row object
# for every record, which the previous row-wise DataFrame.apply(axis=1) did.
df["clean_text"] = df["text"].map(clean)

In [9]:
# Discard tweets whose cleaned text is missing, then drop rows whose cleaned
# text duplicates another row.
df = (
    df
    .dropna(subset=["clean_text"])
    .drop_duplicates(subset=["clean_text"])
)

In [10]:
# Total number of characters over all cleaned tweets.
n_chars = sum(map(len, df["clean_text"]))
n_chars

103821369

In [11]:
# NOTE(review): presumably 200000 is a monthly budget in JPY, 110 a JPY-per-USD
# rate and 20 the translation price in USD per 1,000,000 characters — confirm.
# Despite the name, the value is a ratio (1.0 would mean the whole corpus fits
# the budget), not a percentage; it is not displayed or reused in the visible cells.
n_percentage_in_monthly_budget = 200000/110/(n_chars*20/1000000)

In [12]:
# Estimated cost of translating half of the cleaned characters:
# (n_chars / 2) * 20 / 1,000,000 * 110.
# NOTE(review): "bugget" is presumably a typo for "budget"; the units of 20 and
# 110 are not stated anywhere in this notebook — confirm.
bugget_for_half_of_budget = (n_chars/2)*20/1000000*110
bugget_for_half_of_budget

114203.5059

In [13]:
limit = (n_chars/2)
limit

51910684.5

In [14]:
df.polarity.value_counts()

0    779677
4    769731
Name: polarity, dtype: int64

In [15]:
# Keep the first 400000 cleaned tweets of each polarity (0 and 4).
neg_df = df.loc[df["polarity"] == 0].iloc[:400000]
pos_df = df.loc[df["polarity"] == 4].iloc[:400000]
neg_df.head()
pos_df.head()

Unnamed: 0,polarity,text,clean_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D","- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.","no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


Unnamed: 0,polarity,text,clean_text
800000,4,I LOVE @Health4UandPets u guys r the best!!,I LOVE u guys r the best!!
800001,4,im meeting up with one of my besties tonight! Cant wait!! - GIRL TALK!!,im meeting up with one of my besties tonight! Cant wait!! - GIRL TALK!!
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart.","Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart."
800003,4,"Being sick can be really cheap when it hurts too much to eat real food Plus, your friends make you soup","Being sick can be really cheap when it hurts too much to eat real food Plus, your friends make you soup"
800004,4,@LovesBrooklyn2 he has that effect on everyone,he has that effect on everyone


In [16]:
# Estimated cost per subset: character count * 20 / 1,000,000 * 110.
neg_chars = sum(map(len, neg_df["clean_text"]))
pos_chars = sum(map(len, pos_df["clean_text"]))
neg_budget = neg_chars*20/1000000*110
pos_budget = pos_chars*20/1000000*110
budget = neg_budget + pos_budget
budget

117721.2432

In [17]:
# gctranslate is a project-local module (its path is shown by the reload output).
import gctranslate as gct
# Reload so that edits to gctranslate.py are picked up without restarting the kernel.
importlib.reload(gct)

# NOTE(review): cache_path presumably names an on-disk SQLite cache of earlier
# translations — confirm in gctranslate.py.
translator = gct.GCTranslate(cache_path='./gctcache.sqlite3')

<module 'gctranslate' from '/root/ELSA/dataset/sentiment140/gctranslate.py'>

In [18]:
src = "en"
tgt = "ar"

translated_neg_text = []
translated_pos_text = []

total_chars = 0

In [19]:
# Translate every negative tweet from `src` to `tgt`. A failed request is
# logged and stored as None so translated_neg_text stays index-aligned with
# neg_df.
# The list is rebuilt from scratch so that re-running this cell cannot append a
# second copy of the translations.
translated_neg_text = []
pbar = tqdm(neg_df.clean_text.values)
for i, text in enumerate(pbar):
    try:
        translated_neg_text.append(translator.translate(text, tgt, src))
    except Exception:
        # `except Exception` instead of a bare `except:` lets KeyboardInterrupt
        # through, so interrupting the kernel really stops this long job.
        translated_neg_text.append(None)
        traceback.print_exc()
    total_chars += len(text)
    # Refresh the running character count in the progress bar every 100 items.
    if i % 100 == 0:
        pbar.set_postfix(total_chars=str(total_chars))

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [20]:
len(translated_pos_text)

0

In [21]:
# Translate every positive tweet from `src` to `tgt`. A failed request is
# logged and stored as None so translated_pos_text stays index-aligned with
# pos_df.
# The list is rebuilt from scratch so that re-running this cell cannot append a
# second copy of the translations.
translated_pos_text = []
pbar = tqdm(pos_df.clean_text.values)
for i, text in enumerate(pbar):
    try:
        translated_pos_text.append(translator.translate(text, tgt, src))
    except Exception:
        # `except Exception` instead of a bare `except:` lets KeyboardInterrupt
        # through, so interrupting the kernel really stops this long job.
        translated_pos_text.append(None)
        traceback.print_exc()
    total_chars += len(text)
    # Refresh the running character count in the progress bar every 100 items.
    if i % 100 == 0:
        pbar.set_postfix(total_chars=str(total_chars))

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [22]:
len(translated_neg_text)

400000

In [23]:
# Make the project's script directory importable; word_generator is loaded from
# there (see the module path shown by the reload output).
sys.path.append("../../script/")
import word_generator as wg
# Reload so that edits to word_generator.py are picked up without restarting the kernel.
importlib.reload(wg)

<module 'word_generator' from '../../script/word_generator.py'>

Tokenize the source (English) and target (Arabic) tweets.

In [24]:
def tokenize(texts, tokenizer):
    """Tokenize each text, keeping the output index-aligned with the input.

    Args:
        texts: Iterable of strings; an entry may be None (for example a
            translation that failed upstream).
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        A list with one entry per processed text: ``[tokens]`` (the tokenizer
        result wrapped in a one-element list) on success, or None when the
        text is None or the tokenizer raised an Exception. If the run is
        interrupted with KeyboardInterrupt the list built so far is returned,
        so it can be shorter than ``texts``.
    """
    tokenized_texts = []
    for text in tqdm(texts):
        if text is None:
            # Nothing to tokenize; keep the slot so indices still line up.
            tokenized_texts.append(None)
            continue
        try:
            tokenized_texts.append([tokenizer.tokenize(text)])
        except KeyboardInterrupt:
            # Deliberate: a manual interrupt returns the partial result.
            break
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed SystemExit.
            tokenized_texts.append(None)
    return tokenized_texts

In [25]:
en_tokenizer = wg.get_default_tokenizer("en")
ar_tokenizer = wg.get_default_tokenizer("ar")

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/data/stanfordnlp_resources/ar_padt_models/ar_padt_tokenizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/data/stanfordnlp_resources/ar_padt_models/ar_padt_mwt_expander.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
Done loading processors!
---


In [26]:
tokenized_neg_text_en = tokenize(neg_df.clean_text.values, en_tokenizer)
tokenized_neg_text_ar = tokenize(translated_neg_text, ar_tokenizer)
tokenized_pos_text_en = tokenize(pos_df.clean_text.values, en_tokenizer)
tokenized_pos_text_ar = tokenize(translated_pos_text, ar_tokenizer)

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [27]:
len(tokenized_neg_text_en), len(tokenized_neg_text_ar), len(tokenized_pos_text_en), len(tokenized_pos_text_ar)

(400000, 400000, 400000, 400000)

In [28]:
# The English and Arabic lists must line up element by element before they are
# paired; fail loudly if a tokenize() run was cut short.
assert len(tokenized_neg_text_en) == len(tokenized_neg_text_ar)
assert len(tokenized_pos_text_en) == len(tokenized_pos_text_ar)

# 0 = negative, 1 = positive. The number of positive labels now comes from the
# positive list; it was previously taken from the negative Arabic list, which
# only worked because both subsets hold the same number of rows.
label = [0] * len(tokenized_neg_text_en) + [1] * len(tokenized_pos_text_en)
en = tokenized_neg_text_en + tokenized_pos_text_en
ar = tokenized_neg_text_ar + tokenized_pos_text_ar

en_ar_df = pd.DataFrame({"label": label, "en": en, "ar": ar})
en_ar_df.shape
# Drop pairs where either side is missing (tokenize() stores failures as None).
en_ar_df = en_ar_df.dropna()
en_ar_df.shape
en_ar_df[en_ar_df.label == 0].head()
en_ar_df[en_ar_df.label == 1].head()

(800000, 3)

(800000, 3)

Unnamed: 0,label,en,ar
0,0,"[[-, Awww, ,, that's, a, bummer, ., You, shoulda, got, David, Carr, of, Third, Day, to, do, it, ., ;D]]","[[-, awww, ،, هذا, ألمشكل, ه, ., يجب, أن, تحصل, على, ديفيد, كار, من, اليوم, الثالث, ل, القيام, ب, ذٰلك, ., ؛د]]"
1,0,"[[is, upset, that, he, can't, update, his, Facebook, by, texting, it, ..., and, might, cry, as, a, result, School, today, also, ., Blah, !]]","[[منزعج, لأن, ه, لا, يستطيع, تحديث, موقع, facebook, الخاص, ب, ه, عن, طريق, الرسائل, النصية, ., ., ., و, قد, يبكي, كنتيجة, ل, المدرسة, اليوم, أيضًا, ., بلا, ه, !]]"
2,0,"[[I, dived, many, times, for, the, ball, ., Managed, to, save, 50, %, The, rest, go, out, of, bounds]]","[[لقد, غطيت, عدة, مرات, ل, الكرة, ., تمكنت, من, توفير, 50, ٪, والباقي, والباقي, يخرج, عن, الحدود]]"
3,0,"[[my, whole, body, feels, itchy, and, like, its, on, fire]]","[[جسدي, كل, ه, يشعر, ب, الحكة, و, مثل, النار]]"
4,0,"[[no, ,, it's, not, behaving, at, all, ., i'm, mad, ., why, am, i, here, ?, because, I, can't, see, you, all, over, there, .]]","[[لا, ،, هذا, لا, يتصرف, على, الإطلاق, ., أن, نا, مجنون, ., ل, ماذا, أن, نا, هنا, ؟, لأنني, لأنني, لأنني, لا, أستطيع, رؤية, ك, هناك, .]]"


Unnamed: 0,label,en,ar
400000,1,"[[I, LOVE, u, guys, r, the, best, !, !]]","[[أحبك, يا, شباب, !]]"
400001,1,"[[im, meeting, up, with, one, of, my, besties, tonight, !, Cant, wait, !, !, -, GIRL, TALK, !, !]]","[[اجتمع, مع, أحد, أفضل, الأصدقاء, الليلة, !, لا, استطيع, الانتظار, !, !, -, كلام, فتاة, !, !]]"
400002,1,"[[Thanks, for, the, Twitter, add, ,, Sunisa, !, I, got, to, meet, you, once, at, a, HIN, show, here, in, the, DC, area, and, you, were, a, sweetheart, .]]","[[شكرًا, على, twitter, ،, sunisa, !, حصلت, على, مقابلة, ك, مرة, واحدة, في, معرض, hin, هنا, في, منطقة, العاصمة, وكنت, وكنت, حبيبة, .]]"
400003,1,"[[Being, sick, can, be, really, cheap, when, it, hurts, too, much, to, eat, real, food, Plus, ,, your, friends, make, you, soup]]","[[قد, يكون, مرضك, رخيصًا, جدًا, عندما, تؤلم, ك, أكثر, من, الطعام, الحقيقي, ،, ف, إن, أصدقاء, ك, يجعلك, حساءًا]]"
400004,1,"[[he, has, that, effect, on, everyone]]","[[لدي, ه, هذا, التأثير, على, الجميع]]"


In [29]:
# Write the paired English/Arabic frame into the dataset directory, without the index column.
output_path = data_dir / "sentiment140_en_ar.csv"
en_ar_df.to_csv(str(output_path), index=False)