In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (this is why cells below with several bare expressions show several outputs).
InteractiveShell.ast_node_interactivity = "all"

In [35]:
import importlib
import json
import pandas as pd
import re
import unicodedata
import traceback
import os
import sys

from pathlib import Path
from tqdm import tqdm_notebook as tqdm

# Show up to 300 characters per cell so full tweets are readable in rendered frames.
pd.options.display.max_colwidth = 300

In [3]:
# Root directory of the Sentiment140 files. Defaults to the original location and
# can be redirected with the SENTIMENT140_DIR environment variable, so the notebook
# is not tied to a single machine's filesystem layout.
data_dir = Path(os.environ.get("SENTIMENT140_DIR", "/data/sentiment140/"))

In [4]:
# Location of the training CSV inside the data directory.
train_data_path = data_dir.joinpath("training.1600000.processed.noemoticon.csv")

In [5]:
# Read only column 0 (the label) and column 5 (the tweet text); the file has no
# header row. pandas accepts a pathlib.Path directly, so no string conversion is needed.
df = pd.read_csv(train_data_path, header=None, usecols=[0, 5], encoding='latin1')
df.columns = ['polarity', 'text']
# Preview a few rows for each of the two polarity values.
df[df["polarity"] == 0].head()
df[df["polarity"] == 4].head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


Unnamed: 0,polarity,text
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! Cant wait!! - GIRL TALK!!
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart."
800003,4,"Being sick can be really cheap when it hurts too much to eat real food Plus, your friends make you soup"
800004,4,@LovesBrooklyn2 he has that effect on everyone


In [6]:
# Number of rows per polarity value in the raw data.
df["polarity"].value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

In [7]:
# Runs of whitespace (including the ideographic space U+3000) collapse to one space.
CRLF_RE = re.compile(r'[\s\u3000]+')
# Retweet marker: "RT"/"rt" as a whole token at the start of the tweet. The word
# boundary keeps ordinary words that merely begin with "rt" from being treated
# as retweets.
RETWEET_RE = re.compile(r'^[rR][tT]\b')
# Whole hashtag (currently unused by clean(), which only strips the '#' sign).
HASHING_RE = re.compile(r'#[^\s]+')
# @mention with an optional trailing colon.
MENTION_RE = re.compile(r'@[a-zA-Z0-9_]+:?')
URL_RE = re.compile(r'(?:url\s*)?(?:https?://|\w*\.\w+\.\w+)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\)…,]|[\u4E00-\u9FD0]|[あ-ん]|[\u30A1-\u30F4]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Bare t.co short links. Dots are escaped so only a literal "t.co/" matches, and
# the word boundary avoids biting into longer host names ending in "t".
TWEETLINK_RE = re.compile(r'\bt\.co/[a-zA-Z0-9]+')
# pic.twitter.com links. Dots are escaped and the match stops at whitespace so
# the text following the link is preserved.
PIC_TWITTER_RE = re.compile(r'pic\.twitter\.com/\S+')


def clean(text, lang=None, retweet=False):
    """Normalize a tweet and strip mentions, links and hashtag signs.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN float) yields ``None``.
        lang: Unused; kept so existing callers that pass it keep working.
        retweet: If true, a leading retweet marker is removed and the tweet is
            kept. If false (default), tweets starting with the marker are
            rejected.

    Returns:
        The cleaned, stripped text, or ``None`` when the input is not a string,
        is a rejected retweet, or is empty after cleaning.
    """
    if not isinstance(text, str):
        return None

    if retweet:
        text = RETWEET_RE.sub('', text)
    elif RETWEET_RE.match(text):
        return None

    # NFKC folds compatibility forms (e.g. full-width letters) to their plain equivalents.
    text = unicodedata.normalize('NFKC', text)
    text = CRLF_RE.sub(' ', text)
    # Keep the hashtag word itself; only drop the '#' sign.
    text = text.replace('#', '')
    for pattern in (MENTION_RE, URL_RE, TWEETLINK_RE, PIC_TWITTER_RE):
        text = pattern.sub('', text)

    text = text.strip()
    return text or None

In [8]:
# Clean every tweet. Mapping over the single text column gives the same values as
# a row-wise DataFrame.apply(axis=1) without building a Series object per row.
df["clean_text"] = df["text"].map(clean)

In [9]:
# Discard rows for which clean() returned None, then keep only the first row
# for each distinct cleaned text.
df = df.dropna(subset=["clean_text"]).drop_duplicates(subset=["clean_text"])

In [10]:
# Total number of characters across all cleaned tweets.
n_chars = sum(map(len, df["clean_text"].values))
n_chars

103821369

In [11]:
# Ratio of (200000 / 110) to the cost term (n_chars * 20 / 1000000).
# NOTE(review): presumably a 200,000 monthly budget converted at 110 per USD, against a
# translation price of 20 USD per million characters -- confirm these figures.
# NOTE(review): despite the name, this is a fraction, not a percentage (no * 100).
n_percentage_in_monthly_budget = 200000/110/(n_chars*20/1000000)

In [12]:
# Cost term for half of the cleaned characters: (n_chars / 2) * 20 / 1000000 * 110.
# NOTE(review): presumably 20 USD per million characters converted at 110 per USD -- confirm.
# NOTE(review): "bugget" looks like a typo for "budget"; the name is kept as-is here.
bugget_for_half_of_budget = (n_chars/2)*20/1000000*110
bugget_for_half_of_budget

114203.5059

In [13]:
# Half of the total cleaned character count (a float, because of true division).
limit = (n_chars/2)
limit

51910684.5

In [14]:
# Number of rows per polarity value after cleaning and de-duplication.
df["polarity"].value_counts()

0    779677
4    769731
Name: polarity, dtype: int64

In [15]:
# Take the first 400000 rows of each polarity value (0 -> neg_df, 4 -> pos_df).
neg_df = df.loc[df["polarity"] == 0].iloc[:400000]
pos_df = df.loc[df["polarity"] == 4].iloc[:400000]
neg_df.head()
pos_df.head()

Unnamed: 0,polarity,text,clean_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D","- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.","no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


Unnamed: 0,polarity,text,clean_text
800000,4,I LOVE @Health4UandPets u guys r the best!!,I LOVE u guys r the best!!
800001,4,im meeting up with one of my besties tonight! Cant wait!! - GIRL TALK!!,im meeting up with one of my besties tonight! Cant wait!! - GIRL TALK!!
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart.","Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart."
800003,4,"Being sick can be really cheap when it hurts too much to eat real food Plus, your friends make you soup","Being sick can be really cheap when it hurts too much to eat real food Plus, your friends make you soup"
800004,4,@LovesBrooklyn2 he has that effect on everyone,he has that effect on everyone


In [16]:
# Cost term for each subset: character count * 20 / 1000000 * 110, then their sum.
neg_budget = sum(map(len, neg_df["clean_text"].values)) * 20 / 1000000 * 110
pos_budget = sum(map(len, pos_df["clean_text"].values)) * 20 / 1000000 * 110
budget = neg_budget + pos_budget
budget

117721.2432

In [17]:
# Import the gctranslate module and reload it so that edits made to its source
# file are picked up without restarting the kernel.
import gctranslate as gct
importlib.reload(gct)

# Translator client; cache_path points at a SQLite file in the working directory.
# NOTE(review): presumably used to cache translations between runs -- confirm in gctranslate.py.
translator = gct.GCTranslate(cache_path='./gctcache.sqlite3')

<module 'gctranslate' from '/root/ELSA/dataset/sentiment140/gctranslate.py'>

In [18]:
# Translation direction: source language code and target language code.
src, tgt = "en", "ar"

# Filled by the translation loops: one entry per tweet (the translation, or None on failure).
translated_neg_text, translated_pos_text = [], []

# Running count of source characters sent through the translation loops.
total_chars = 0

In [19]:
# Translate every negative tweet, keeping one output slot per input so the
# result stays aligned with neg_df.
pbar = tqdm(neg_df.clean_text.values)
for i, text in enumerate(pbar):
    try:
        translated = translator.translate(text, tgt, src)
        translated_neg_text.append(translated)
    except Exception:
        # Record the failure as None and keep going. Catching Exception (rather
        # than a bare except) lets KeyboardInterrupt stop the loop.
        translated_neg_text.append(None)
        traceback.print_exc()
    total_chars += len(text)
    # Refresh the progress-bar postfix only every 100 items.
    if i % 100 == 0:
        pbar.set_postfix(total_chars=str(total_chars))

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [21]:
# Number of entries collected for the positive tweets.
# NOTE(review): this cell sits above the loop that fills translated_pos_text, so on a
# top-to-bottom run it reports the list's size before that loop has executed.
len(translated_pos_text)

400000

In [20]:
# Translate every positive tweet, keeping one output slot per input so the
# result stays aligned with pos_df.
pbar = tqdm(pos_df.clean_text.values)
for i, text in enumerate(pbar):
    try:
        translated = translator.translate(text, tgt, src)
        translated_pos_text.append(translated)
    except Exception:
        # Record the failure as None and keep going. Catching Exception (rather
        # than a bare except) lets KeyboardInterrupt stop the loop.
        translated_pos_text.append(None)
        traceback.print_exc()
    total_chars += len(text)
    # Refresh the progress-bar postfix only every 100 items.
    if i % 100 == 0:
        pbar.set_postfix(total_chars=str(total_chars))

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 379, in _make_request
    httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-20-49dbc3cda2c0>", line 5, in <module>
    translated = translator.translate(text, tgt, src)
  File "/root/ELSA/dataset/sentiment140/gctranslate.py", line 56, in translate
    translatedText = self.google_cloud_translate(inp, tgt, src, model)
  File "/root/ELSA/dataset/sentiment140/gctranslate.py", line 80, in google_cloud_translate
    inp, target_language=tgt, source_language=src, model=model)
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/translate_v2/client.py", line 252, in translate
    response = self._connection.api_request(method="POST", path="", data=data)
  File "/usr/local/li




In [23]:
# Number of entries collected for the negative tweets (translations and None placeholders).
len(translated_neg_text)

400000

In [37]:
# Add the script directory to the import path before importing from word_generator.
# NOTE(review): presumably word_generator lives in ../../script/; the relative path
# depends on the notebook's working directory -- confirm.
sys.path.append("../../script/")
from word_generator import get_default_tokenizer

Tokenize the source (English) and target (Arabic) tweets

In [62]:
def tokenize(texts, tokenizer):
    """Tokenize each text, keeping one output slot per input.

    Args:
        texts: Iterable of texts to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method.

    Returns:
        A list in which each element is either ``[tokenizer.tokenize(text)]``
        (the tokenizer result wrapped in a one-element list) or ``None`` when
        tokenization raised. If the run is interrupted from the keyboard, the
        list built so far is returned and is therefore shorter than ``texts``.
    """
    tokenized_texts = []
    for text in tqdm(texts):
        try:
            tokenized_texts.append([tokenizer.tokenize(text)])
        except KeyboardInterrupt:
            # Allow a manual stop while still returning the partial result.
            break
        except Exception:
            # Keep alignment with the input. Only ordinary errors are swallowed,
            # so SystemExit and similar still propagate.
            tokenized_texts.append(None)
    return tokenized_texts

In [59]:
# Build one tokenizer per language, English first and then Arabic.
en_tokenizer, ar_tokenizer = (get_default_tokenizer(code) for code in ("en", "ar"))

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/data/stanfordnlp_resources/ar_padt_models/ar_padt_tokenizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/data/stanfordnlp_resources/ar_padt_models/ar_padt_mwt_expander.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
Done loading processors!
---


In [64]:
# Tokenize the negative tweets (English source, then translated text) ...
tokenized_neg_text_en, tokenized_neg_text_ar = (
    tokenize(neg_df.clean_text.values, en_tokenizer),
    tokenize(translated_neg_text, ar_tokenizer),
)
# ... and then the positive tweets, in the same order.
tokenized_pos_text_en, tokenized_pos_text_ar = (
    tokenize(pos_df.clean_text.values, en_tokenizer),
    tokenize(translated_pos_text, ar_tokenizer),
)

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [52]:
# Pair each tokenized English negative tweet with its tokenized translation.
# Uses the lists produced by the tokenize() calls; the previously referenced names
# (tokenized_neg / tokenized_neg_ar) are not defined anywhere in this notebook and
# would raise NameError on a fresh kernel.
en_ar_df = pd.DataFrame({"en": tokenized_neg_text_en, "ar": tokenized_neg_text_ar})

In [53]:
# Remove pairs where either side is missing, then report the remaining size.
en_ar_df = en_ar_df.dropna()
en_ar_df.shape

(400000, 2)

In [54]:
# Preview the first rows only, rather than rendering the whole frame.
en_ar_df.head(10)

Unnamed: 0,en,ar
0,"[[-, Awww, ,, that's, a, bummer, ., You, shoulda, got, David, Carr, of, Third, Day, to, do, it, ., ;D]]","[[-, awww, ,, that's, a, bumme, r, ., you, shoulda, got, david, carr, of, third, day, to, do, it, ., ;d]]"
1,"[[is, upset, that, he, can't, update, his, Facebook, by, texting, it, ..., and, might, cry, as, a, result, School, today, also, ., Blah, !]]","[[is, u, pset, that, he, can't, update, his, facebook, by, texting, it, ., ., ., and, might, cry, as, a, r, esult, school, today, also, ., blah, !]]"
2,"[[I, dived, many, times, for, the, ball, ., Managed, to, save, 50, %, The, rest, go, out, of, bounds]]","[[i, dived, many, times, for, the, ball, ., ددددد, دد, to, save, 50, %, the, r, est, go, out, of, boun, d, s]]"
3,"[[my, whole, body, feels, itchy, and, like, its, on, fire]]","[[my, whole, body, f, eels, itchy, and, like, its, on, fi, r, e]]"
4,"[[no, ,, it's, not, behaving, at, all, ., i'm, mad, ., why, am, i, here, ?, because, I, can't, see, you, all, over, there, .]]","[[no, ,, it's, not, behaving, at, all, ., دد, د, mad, ., why, am, i, h, er, e?, b, ecause, i, can't, s, e, e, you, all, over, ther, e, .]]"
5,"[[not, the, whole, crew]]","[[n, ot, the, whole, cr, e, w]]"
6,"[[Need, a, hug]]","[[need, a, hug]]"
7,"[[hey, long, time, no, see, !, Yes, .., Rains, a, bit, ,, only, a, bit, LOL, ,, I'm, fine, thanks, ,, how's, you, ?]]","[[hey, long, time, no, se, e, !, y, e, s, ., ., rains, a, bit, ,only, a, bit, lol, ,, i'm, fine, thanks, ,, how's, you, ?]]"
8,"[[nope, they, didn't, have, it]]","[[nope, they, didn't, have, i, t]]"
9,"[[que, me, muera, ?]]","[[que, me, muera, ?]]"
