# T-DNA
https://github.com/shizhediao/T-DNA

## This notebook generates ngrams for training using the T-DNA architecture


In [1]:
import torch
torch.__version__

'1.10.2+cu102'

In [17]:
import pandas as pd
import numpy as np
from collections import defaultdict
import fasttext
import spacy
import json
import csv
import matplotlib.pyplot as plt
import vocabulary_utils as vu

import importlib
importlib.reload(vu)

<module 'vocabulary_utils' from '/Users/kylehamilton/MyDocuments/ML-Labs/kinzen/projects/TAPT-n/vocabulary_utils.py'>

## Import vocabulary from xlm-roberta
We use this to remove ngrams which are already in the vocabulary.

If you want to use this technique with a different model, just make sure to fetch that model's vocabulary

In [18]:
# The vocabulary holds non-ASCII pieces (e.g. the '▁' marker), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('../models/xlm-roberta-large/vocab.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
# Report the size once the file handle has been released.
print("Vocabulary size:",len(data))

Vocabulary size: 250000


In [19]:
# xlm-roberta uses sentencepiece BPE, which writes spaces as the '▁' character
# so that one vocabulary plays nicely with multiple languages.
# To match plain words against the xlm-roberta vocab, strip that marker from every entry.
xlm_roberta_vocab = [piece.replace('▁', '') for piece in data]

In [20]:
xlm_roberta_vocab[:10]

['<unk>', '<s>', '</s>', ',', '.', '', 's', 'de', '-', 'a']

In [21]:
# a couple of checks for good measure...
print('vaccine' in xlm_roberta_vocab)
print('vac' in xlm_roberta_vocab)
print('cine' in xlm_roberta_vocab)

False
True
True


In [23]:
print('France' in xlm_roberta_vocab)
print('france' in xlm_roberta_vocab)

True
False


In [26]:
transcripts_metadata = pd.read_csv('../data/all-transcripts/transcripts.tsv',sep="\t")
transcripts_metadata

Unnamed: 0,count,path,model,use_stopwords,language,stopwords,additionalInfo
0,1171,audio/audio_ar-SA.txt,camel_tools,1,arabic,ar-stop-words.txt,https://towardsdatascience.com/arabic-nlp-uniq...
1,8926,audio/audio_de-DE.txt,de_core_news_sm,0,german,stopwords-de.txt,
2,34073,audio/audio_en-US.txt,en_core_web_sm,0,english,stopwords-en.txt,
3,129,audio/audio_es-ES.txt,es_core_news_sm,0,spanish,stopwords-es.txt,
4,2690,audio/audio_es-MX.txt,es_core_news_sm,0,spanish,stopwords-es.txt,
5,672,audio/audio_fr-FR.txt,fr_core_news_sm,0,french,stopwords-fr.txt,
6,1050,audio/audio_hi-IN.txt,en_core_web_sm,1,hindi,hi-stop-words.txt,
7,1240,audio/audio_pt-BR.txt,pt_core_news_sm,0,portuguese,stopwords-pt.txt,
8,1250,audio/audio_ru-RU.txt,ru_core_news_sm,0,russian,stopwords-ru.txt,
9,1585,audio/audio_sv-SE.txt,xx_sent_ud_sm,1,swedish,sv-stop-words.txt,


In [27]:
languages = list(set(transcripts_metadata['language']))
languages

['german',
 'hindi',
 'russian',
 'chinese',
 'english',
 'spanish',
 'french',
 'arabic',
 'swedish',
 'portuguese',
 'turkish']

In [28]:
models = list(set(transcripts_metadata['model']))
models

['fr_core_news_sm',
 'xx_sent_ud_sm',
 'zh_core_web_sm',
 'pt_core_news_sm',
 'de_core_news_sm',
 'ru_core_news_sm',
 'camel_tools',
 'es_core_news_sm',
 'en_core_web_sm']

# Arabic ngrams
Arabic is special and has its own tokenization libraries. See this blogpost: https://towardsdatascience.com/arabic-nlp-unique-challenges-and-their-solutions-d99e8a87893d

In [99]:
!pip install camel_tools

Collecting camel_tools
  Downloading camel_tools-1.2.0.tar.gz (58 kB)
     |████████████████████████████████| 58 kB 1.8 MB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting cachetools
  Downloading cachetools-4.2.4-py3-none-any.whl (10 kB)
Collecting torch>=1.3
  Downloading torch-1.10.2-cp36-cp36m-manylinux1_x86_64.whl (881.9 MB)
     |████████████▋                   | 347.1 MB 156.2 MB/s eta 0:00:04

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



     |██████████████████████████▎     | 725.1 MB 167.0 MB/s eta 0:00:01

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



     |███████████████████████████████ | 854.7 MB 167.3 MB/s eta 0:00:01

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



     |████████████████████████████████| 881.9 MB 7.9 kB/s              
[?25hCollecting transformers>=3.0.2
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
     |████████████████████████████████| 3.8 MB 82.4 MB/s            
[?25hCollecting editdistance
  Downloading editdistance-0.6.0-cp36-cp36m-manylinux2010_x86_64.whl (284 kB)
     |████████████████████████████████| 284 kB 144.8 MB/s            
Collecting camel-kenlm
  Downloading camel-kenlm-2021.12.27.tar.gz (418 kB)
     |████████████████████████████████| 418 kB 144.0 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
     |████████████████████████████████| 67 kB 11.6 MB/s             
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
     |████████████████████████████████| 895 kB 25.3 MB/s            
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers

In [96]:
path_to_data = "../data/transcripts/audio_ar-SA.txt"
ar_df = pd.read_csv(path_to_data,header=None,names=['text'])
ar_df['text'][0][:100]

'"عناوين اليوم أول زفاف في ألم تبرز طيران الإمارات تتوقع زيادة عدد مسافريها إلى واحد فصل 01,000,000. '

In [100]:
# import the dediacritization tool
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.tokenizers.word import simple_word_tokenize

# apply to your text column
ar_df['text'] = ar_df['text'].apply(dediac_ar)

def ortho_normalize(text):
    """Run the camel_tools orthographic normalizers over `text` and return the result.

    The normalizers are applied in this order: alef maksura, alef, teh marbuta.
    """
    for normalizer in (normalize_alef_maksura_ar,
                       normalize_alef_ar,
                       normalize_teh_marbuta_ar):
        text = normalizer(text)
    return text
  
ar_df['text'] = ar_df['text'].apply(ortho_normalize)
ar_df['text'] = ar_df['text'].apply(simple_word_tokenize)



ar_df.to_csv('../data/transcripts/transcripts-ar-pretokenized.csv',header=None,index=None)

In [101]:
for d in ar_df['text'][0][:10]:
    print(d)

"
عناوين
اليوم
اول
زفاف
في
الم
تبرز
طيران
الامارات


In [203]:
!camel light

In [203]:
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

analyses = analyzer.analyze('فصل')

print(analyses[0])

{'diac': 'فَصَلَ', 'lex': 'فَصَل', 'bw': 'فَصَل/PV+َ/PVSUFF_SUBJ:3MS', 'gloss': 'separate;detach;set_apart+he;it_<verb>', 'pos': 'verb', 'prc3': '0', 'prc2': '0', 'prc1': '0', 'prc0': '0', 'per': '3', 'asp': 'p', 'vox': 'a', 'mod': 'i', 'stt': 'na', 'cas': 'na', 'enc0': '0', 'rat': 'n', 'source': 'lex', 'form_gen': 'm', 'form_num': 's', 'd3seg': 'فَصَلَ', 'caphi': 'f_a_s._a_l_a', 'd1tok': 'فَصَلَ', 'd2tok': 'فَصَلَ', 'pos_logprob': -1.023208, 'd3tok': 'فَصَلَ', 'd2seg': 'فَصَلَ', 'pos_lex_logprob': -4.497461, 'num': 's', 'ud': 'VERB', 'gen': 'm', 'catib6': 'VRB', 'root': 'ف.ص.ل', 'bwtok': 'فَصَل_+َ', 'pattern': '1َ2َ3َ', 'lex_logprob': -4.497461, 'atbtok': 'فَصَلَ', 'atbseg': 'فَصَلَ', 'd1seg': 'فَصَلَ', 'stem': 'فَصَل', 'stemgloss': 'separate;detach;set_apart', 'stemcat': 'PV'}


## Chinese
Chinese is also special since it causes some funny runtime errors. We are omitting it in phase 1.

In [None]:
# pretokenize chinese
_ZH_NLP = None  # spaCy pipeline, loaded on first use and shared by every pretokenize() call


def pretokenize(text):
    """Tokenize `text` with the 'zh_core_web_sm' spaCy pipeline.

    Parameters
    ----------
    text : any
        Value to tokenize; it is converted with ``str(text)`` before processing.

    Returns
    -------
    list of str
        The text of every token that is neither punctuation nor a stop word.

    Notes
    -----
    The pipeline is loaded once and cached at module level. Loading it inside
    the function body on every call made ``Series.apply(pretokenize)`` reload
    the model for each row.
    """
    global _ZH_NLP
    if _ZH_NLP is None:
        _ZH_NLP = spacy.load('zh_core_web_sm')
    doc = _ZH_NLP(str(text))
    return [token.text for token in doc
            if not token.is_punct and not token.is_stop]

chinese['text'] = chinese['text'].apply(pretokenize)

In [None]:
chinese.to_csv('../data/transcripts/transcripts-zh-pretokenized.csv',header=None,index=None)

# Multilingual ngrams

In [34]:
importlib.reload(vu)

# get ngrams for all languages except Arabic and Chinese
all_transcripts = transcripts_metadata[(transcripts_metadata['language']!='arabic') & (transcripts_metadata['language']!='chinese')]

In [35]:
all_transcripts = all_transcripts.sort_values(by="language")

In [36]:
all_transcripts.fillna('', inplace=True)

In [37]:
all_transcripts

Unnamed: 0,count,path,model,use_stopwords,language,stopwords,additionalInfo
19,10,video/video_us.txt,en_core_web_sm,0,english,stopwords-en.txt,
2,34073,audio/audio_en-US.txt,en_core_web_sm,0,english,stopwords-en.txt,
14,1,video/video_kz.txt,en_core_web_sm,0,english,stopwords-en.txt,
12,999,video/video_en-US.txt,en_core_web_sm,0,english,stopwords-en.txt,
18,5,video/video_ua.txt,en_core_web_sm,0,english,stopwords-en.txt,
13,396,video/video_fr-FR.txt,fr_core_news_sm,0,french,stopwords-fr.txt,
5,672,audio/audio_fr-FR.txt,fr_core_news_sm,0,french,stopwords-fr.txt,
11,34244,video/video_de-DE.txt,de_core_news_sm,0,german,stopwords-de.txt,
1,8926,audio/audio_de-DE.txt,de_core_news_sm,0,german,stopwords-de.txt,
6,1050,audio/audio_hi-IN.txt,en_core_web_sm,1,hindi,hi-stop-words.txt,


## Concatenate the audio and video transcripts for each language

In [42]:
def _read_text_column(csv_path):
    """Read a header-less CSV into a DataFrame whose columns are named ['text']."""
    return pd.read_csv(csv_path, names=['text'])


# One DataFrame per language, each read from its own transcripts-all-<code>.csv file.
french = _read_text_column('../data/all-transcripts/transcripts-all-fr.csv')
german = _read_text_column('../data/all-transcripts/transcripts-all-de.csv')
hindi = _read_text_column('../data/all-transcripts/transcripts-all-hi.csv')
portuguese = _read_text_column('../data/all-transcripts/transcripts-all-pt.csv')
russian = _read_text_column('../data/all-transcripts/transcripts-all-ru.csv')
spanish = _read_text_column('../data/all-transcripts/transcripts-all-es.csv')
swedish = _read_text_column('../data/all-transcripts/transcripts-all-sv.csv')
turkish = _read_text_column('../data/all-transcripts/transcripts-all-tr.csv')
# chinese = pd.read_csv('../data/all-transcripts/transcripts-zh-pretokenized.csv',names=['text'])
# arabic = pd.read_csv('../data/all-transcripts/transcripts-ar-pretokenized.csv',names=['text'])

## Create an iterable for generating the ngrams

In [None]:
# Work list for ngram generation. Each entry is a 4-tuple that the ngram loop
# consumes by position:
#   (transcripts DataFrame, language code, spaCy model name, stopwords file name or None)
# The language code is also used in the name of the output .tsv file.
# NOTE(review): Hindi and Turkish are paired with the English spaCy model plus a
# language-specific stopwords file -- presumably because no dedicated model was
# available; confirm.
langs = [(french,'fr','fr_core_news_sm',None),
         (german,'de','de_core_news_sm',None),
         (hindi,'hi','en_core_web_sm','hi-stop-words.txt'),
         (portuguese,'pt','pt_core_news_sm',None),
         (russian,'ru','ru_core_news_sm',None),
         (spanish,'es','es_core_news_sm',None),
         (swedish,'sv','xx_sent_ud_sm','sv-stop-words.txt'),
         (turkish,'tr','en_core_web_sm','tr-stop-words.txt')
         ]
# Entries left out for now (Chinese and Arabic are not part of this list):
# (chinese,'zh','zh_core_web_sm','stopwords-zh.txt'),
# (arabic,'ar','None','ar-stop-words.txt')

In [58]:
!head -n 10 ../data/transcripts/transcripts-all-hi.csv > ../data/transcripts/sample.csv

In [45]:
# sanity check
import vocabulary_utils as vu
importlib.reload(vu)
ua = pd.read_csv('../data/all-transcripts/sample-fr.csv',header=None,names=['text'])
combined_list,_ = vu.getNgramsSpacy(ua['text'],spacy_model='fr_core_news_sm',stopwords=None,max_n=15,LLMvocab=xlm_roberta_vocab,pmi=True)
combined_list

Counting ngrams: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]
Calculating TFIDF: 100%|██████████| 3433/3433 [00:04<00:00, 849.00it/s]
Calculating PMI: 100%|██████████| 8612/8612 [00:10<00:00, 840.57it/s] 


[('Ok Google', 9.154510487015513),
 ('facilite épreuve', 9.154510487015513),
 ('habiller conséquence', 9.154510487015513),
 ('conséquence tendances', 9.154510487015513),
 ('attaquer compotes', 9.154510487015513),
 ('COS pi', 9.154510487015513),
 ('pi rationalisme', 9.154510487015513),
 ('rationalisme corollaire', 9.154510487015513),
 ('commission chargée', 9.154510487015513),
 ('pape organise', 9.154510487015513),
 ('bravo Bravo', 9.154510487015513),
 ('Bravo multiplicité', 9.154510487015513),
 ('horizon confondu', 9.154510487015513),
 ('confondu témoignant', 9.154510487015513),
 ('Douleur aveux', 9.154510487015513)]

Counting ngrams: 100%|██████████| 1/1 [00:06<00:00,  6.28s/it]
Calculating TFIDF: 100%|██████████| 3433/3433 [00:04<00:00, 839.64it/s] 
Calculating PMI: 100%|██████████| 8612/8612 [00:09<00:00, 893.44it/s] 


[('Ok Google', 9.154510487015513),
 ('facilite épreuve', 9.154510487015513),
 ('habiller conséquence', 9.154510487015513),
 ('conséquence tendances', 9.154510487015513),
 ('attaquer compotes', 9.154510487015513),
 ('COS pi', 9.154510487015513),
 ('pi rationalisme', 9.154510487015513),
 ('rationalisme corollaire', 9.154510487015513),
 ('commission chargée', 9.154510487015513),
 ('pape organise', 9.154510487015513),
 ('bravo Bravo', 9.154510487015513),
 ('Bravo multiplicité', 9.154510487015513),
 ('horizon confondu', 9.154510487015513),
 ('confondu témoignant', 9.154510487015513),
 ('Douleur aveux', 9.154510487015513),
 ('descend singe', 9.154510487015513),
 ('Yapi relation', 9.154510487015513),
 ('relation Écritures', 9.154510487015513),
 ('sache déclarer', 9.154510487015513),
 ('embrigadement Toulouse', 9.154510487015513),
 ('valeur reperdre', 9.154510487015513),
 ('équivalent congé', 9.154510487015513),
 ('menteur mytho', 9.154510487015513),
 ('enjeu lampes', 9.154510487015513),
 ('ri

## Pointwise Mutual Information (PMI)
The TDNA paper uses PMI to determine which ngrams to include. 

$$
PMI(a,b) = \log\frac{p(a,b)}{p(a)p(b)} = \log\frac{p(a|b)}{p(a)} = \log\frac{p(b|a)}{p(b)}
$$

For each sentence $X$ with tokens $x_1, x_2, \ldots, x_t$:   
Find ngrams with a $PMI$ score above some threshold.   
Generate lexicon $L$ where each ngram appears with a frequency of at least $f$.   
For each token in $X$, check if a substring exists in $L$. If so, extract the ngram from $L$ to form $S$ with ngrams $s_1, s_2, \ldots, s_k$.   

### To use PMI, set pmi=True. Otherwise, the script will use simple frequencies

In [120]:
from tqdm import tqdm  # the progress bar below needs this; the cell previously used tqdm without importing it

importlib.reload(vu)

# Passed to getNgramsSpacy as max_n and embedded in every output file name.
max_n = 32768

t = tqdm(langs)
for lang in t:
    # Each entry of `langs` is (DataFrame, language code, spaCy model, stopwords file or None).
    # The DataFrame is bound to `transcripts_df` rather than `data`, so the
    # vocabulary dict loaded into `data` earlier in the notebook is not overwritten.
    transcripts_df, language, model, stopwords = lang
    t.set_description("processing language: "+language, refresh=True)

    result = vu.getNgramsSpacy(transcripts_df['text'],
                               stopwords=stopwords,
                               spacy_model=model,
                               LLMvocab=xlm_roberta_vocab,
                               language=language,
                               max_n=max_n,
                               pmi=True)
    # NOTE(review): the sanity-check cell unpacks two values from getNgramsSpacy.
    # If a tuple comes back, the first element is presumed to be the list of
    # (ngram, score) pairs -- confirm against vocabulary_utils.
    ngrams_list = result[0] if isinstance(result, tuple) else result

    # Write one "ngram<TAB>score" line per entry. UTF-8 is set explicitly because
    # the ngrams are multilingual and the platform default may not be UTF-8.
    with open('../data/ngrams/'+language+'_ngrams_'+str(max_n)+'.tsv', 'w', encoding='utf-8') as f:
        for item in ngrams_list:
            f.write("%s\t%s\n" % (item[0], item[1]))

## Make embeddings
Before proceeding with the next steps, make sure to train fasttext models and generate the embeddings for the ngrams

TAPT-n/fasttext-train-multilingual.ipynb

In [14]:
languages = ['fr','de','es','hi','pt','ru','sv','tr','zh','ar']
languages[:-2]

['fr', 'de', 'es', 'hi', 'pt', 'ru', 'sv', 'tr']

In [16]:
# Reduce the number of ngrams by keeping only the ones whose 'count' column is > 100,
# and combine all languages for training (skip chinese and arabic for now).

languages = ['fr','de','es','hi','pt','ru','sv','tr','zh','ar']

# Collect the per-language pieces and concatenate once after the loop, instead
# of re-copying the growing DataFrame/array on every iteration. The empty seed
# containers are kept as first elements so the resulting columns and dtypes are
# the same as with incremental concatenation.
ngram_frames = [pd.DataFrame(columns=['ngram','count'])]
embedding_blocks = [np.empty([0, 768])]

# The last two codes ('zh', 'ar') are excluded by the slice.
for lang in languages[:-2]:
    df = pd.read_csv('../data/ngrams/'+lang+'_ngrams_32768.tsv',sep="\t",header=None,names=['ngram','count'])
    embeddings = np.load('../data/ngrams/'+lang+'_ngrams_32768.npy')

    # NOTE(review): taking the first `size` rows only selects the "count > 100"
    # rows if the file is sorted by count in descending order, and it presumes
    # row i of the .npy file is the embedding of row i of the .tsv -- confirm.
    size = len(df[df['count']>100])
    print(lang, size)
    short_ngrams = df[:size]
    short_embeddings = embeddings[:size]

    ngram_frames.append(short_ngrams)
    embedding_blocks.append(short_embeddings)

    # save the per-language subset to file
    short_ngrams.to_csv('../data/ngrams/short/'+lang+'_ngrams_'+str(size)+'.tsv',sep="\t",header=None,index=None)
    np.save('../data/ngrams/short/'+lang+'_ngrams_'+str(size)+'.npy',short_embeddings)

all_embeddings = np.concatenate(embedding_blocks)
all_ngrams = pd.concat(ngram_frames)

fr 2318
de 32768
es 6114
hi 2392
pt 11930
ru 3097
sv 3977
tr 8830


In [17]:
# add the english embeddings
df = pd.read_csv('../data/english_snippet_graph_matches_100k_ngrams_32768.tsv',sep="\t",header=None,names=['ngram','count'])
embeddings = np.load('../models/english_snippet_graph_matches_100k_fasttext_3.7.22_768.npy')

In [18]:
size = len(df[df['count']>5])

In [19]:
size

8772

In [20]:
len(all_embeddings)

71426

In [21]:
short_ngrams = df[:size]
short_embeddings = embeddings[:size]

In [22]:
all_embeddings = np.concatenate((all_embeddings, short_embeddings))
all_ngrams = pd.concat([all_ngrams,short_ngrams])

In [55]:
print(len(all_embeddings))
print(len(all_ngrams))

80198
80198


In [24]:
all_ngrams.to_csv('../data/ngrams/short/xlm_ngrams_768.tsv',sep="\t",header=None, index=None)
np.save('../data/ngrams/short/xlm_ngrams_768.npy',all_embeddings)

In [87]:
# concatenate and shuffle all the data (except chinese and arabic)
!cat ../data/all-transcripts/transcripts-all-de.csv \
../data/all-transcripts/transcripts-all-es.csv \
../data/all-transcripts/transcripts-all-fr.csv \
../data/all-transcripts/transcripts-all-hi.csv \
../data/all-transcripts/transcripts-all-pt.csv \
../data/all-transcripts/transcripts-all-ru.csv \
../data/all-transcripts/transcripts-all-sv.csv \
../data/all-transcripts/transcripts-all-tr.csv \
../data/english_snippet_graph_matches_100k.csv \
| shuf > ../data/all-transcripts/transcripts-all.csv

In [27]:
# take a sample of the transcripts data
!head -n 200 ../data/all-transcripts/transcripts-all.csv > ../data/all-transcripts/sample.csv 

In [19]:
# If the line count (wc -l) of the ngrams file doesn't equal the length of the embeddings, it means some ngrams contain the newline character. Then we can't use the simple pandas csv write.
# Instead we remove the '\n' character and write to file the python way
### all_ngrams.to_csv('../data/ngrams/short/xlm_ngrams_768.tsv',sep="\t",header=None,index=None)

In [54]:
!wc -l ../data/ngrams/short/xlm_ngrams_768.tsv

80198 ../data/ngrams/short/xlm_ngrams_768.tsv


# preprocessing the data
* Remove the quotation marks at the start and end of each sentence
* Split into chunks so we can use linebyline transformers function

## tests on sample data

In [35]:
# first remove the first 3 characters of each line (the leading quotation marks)
!cut -c 4- ../data/transcripts/sample.csv > ../data/transcripts/sample2.csv

In [39]:
# then remove the last quotation mark by splitting on the quotation mark and keeping the first field - kind of a work-around
!cut -d '"' -f 1 ../data/transcripts/sample2.csv > ../data/transcripts/sample.csv

In [40]:
# let's see if it worked:
!head -n 2 ../data/transcripts/sample.csv

spielt die Wahrheit eine Rolle nicht zu Gruppen wie Black lies Mörder das ist aus vielen Gründen tragisch nicht zuletzt weil dadurch schwarz sie ihr Leben verlieren wenn es um das Thema amerikanische Polizei schwarze und die tödliche Anwendung von Gewalt geht wissen wir folgendes eine kürzlich von Louis James Forscherin an der Washington State University durchgeführte Studie über tödliche Gewalt ergab dass Polizeibeamte in simulierten Bedrohungsszenarien mit geringerer Wahrscheinlichkeit auf unbewaffnete schwarze Verdächtige schießen als auf unbewaffnete weiße oder Hispano Amerikaner der Harvard Wirtschaftsprofessor Ronald freier analysierte mehr als 1000 Erschießungen durch Polizeibeamte im ganzen Land er kam zu dem Schluss dass es bei Erschießungen durch die Polizei keinerlei Anzeichen für eine rassistische Voreingenommenheit gibt in Houston stellte er fest das schwarze mit 24% geringerer Wahrscheinlichkeit von Polizisten erschossen wurden als weiße obwohl die Verdächtigen bewaffnet 

In [52]:
# split into chunks:
# insert a line break at most every n bytes (-b counts bytes, -s breaks at the last blank)
# https://man.openbsd.org/fold
!fold -bs -w 3000 ../data/transcripts/sample.csv > sample_chunked.csv

In [53]:
!head -n 2 sample_chunked.csv

spielt die Wahrheit eine Rolle nicht zu Gruppen wie Black lies Mörder das ist aus vielen Gründen tragisch nicht zuletzt weil dadurch schwarz sie ihr Leben verlieren wenn es um das Thema amerikanische Polizei schwarze und die tödliche Anwendung von Gewalt geht wissen wir folgendes eine kürzlich von Louis James Forscherin an der Washington State University durchgeführte Studie über tödliche Gewalt ergab dass Polizeibeamte in simulierten Bedrohungsszenarien mit geringerer Wahrscheinlichkeit auf unbewaffnete schwarze Verdächtige schießen als auf unbewaffnete weiße oder Hispano Amerikaner der Harvard Wirtschaftsprofessor Ronald freier analysierte mehr als 1000 Erschießungen durch Polizeibeamte im ganzen Land er kam zu dem Schluss dass es bei Erschießungen durch die Polizei keinerlei Anzeichen für eine rassistische Voreingenommenheit gibt in Houston stellte er fest das schwarze mit 24% geringerer Wahrscheinlichkeit von Polizisten erschossen wurden als weiße obwohl die Verdächtigen bewaffnet 

## preprocess all the transcripts
Transformers Datasets actually has python code that will chunk the data, but it reads the entire dataset into memory and then uses slices to chunk it. The advantage of using transformers is that it will chunk on tokens. But that's a pretty memory-intensive task, so I'm betting on the Linux tools instead, even though the chunks will sometimes need to be padded and sometimes will be truncated.

The dataset to use for mlm will be: `../data/transcripts/transcripts-all-chunked.csv`

In [56]:
!cut -c 4- ../data/transcripts/transcripts-all.csv > ../data/transcripts/transcripts-all-4.csv

In [57]:
!cut -d '"' -f 1 ../data/transcripts/transcripts-all-4.csv > ../data/transcripts/transcripts-all.csv

In [11]:
!fold -bs -w 800 ../data/transcripts/transcripts-all.csv > ../data/transcripts/transcripts-all-chunked.csv

In [12]:
!head -n 5 ../data/transcripts/transcripts-all-chunked.csv

spielt die Wahrheit eine Rolle nicht zu Gruppen wie Black lies Mörder das ist aus vielen Gründen tragisch nicht zuletzt weil dadurch schwarz sie ihr Leben verlieren wenn es um das Thema amerikanische Polizei schwarze und die tödliche Anwendung von Gewalt geht wissen wir folgendes eine kürzlich von Louis James Forscherin an der Washington State University durchgeführte Studie über tödliche Gewalt ergab dass Polizeibeamte in simulierten Bedrohungsszenarien mit geringerer Wahrscheinlichkeit auf unbewaffnete schwarze Verdächtige schießen als auf unbewaffnete weiße oder Hispano Amerikaner der Harvard Wirtschaftsprofessor Ronald freier analysierte mehr als 1000 Erschießungen durch Polizeibeamte im ganzen Land er kam zu dem Schluss dass es bei Erschießungen durch die Polizei 
keinerlei Anzeichen für eine rassistische Voreingenommenheit gibt in Houston stellte er fest das schwarze mit 24% geringerer Wahrscheinlichkeit von Polizisten erschossen wurden als weiße obwohl die Verdächtigen bewaffnet

# To train use:
`bash train-mlm-xlm.sh &>> train_log.txt`

in a separate terminal:
`tail -f train_log.txt`

This way if you get logged out, or lose the session, you can always pick up the stdout

# English ngrams
The first experiments were done using ngrams generated from english_snippet_graph_matches_100k (and not the above transcripts files)

In [8]:
path_to_data = "data/english_snippet_graph_matches_100k.csv"
df100k = pd.read_csv(path_to_data)
# df100k.columns
print(len(df100k))
with pd.option_context('display.max_colwidth', None):
    display(df100k[:5]
            .style.set_properties(**{'text-align': 'left'})
            .set_table_styles([ dict(selector='th', props=[('text-align', 'left')])]))

19904


Unnamed: 0,snippet
0,"shots. The people are getting Now. Cover that you're okay. You're not going to, you're not going to get covid. You have these vaccinations, you're not going to get covid-19 have these vaccinations. Guess what? People vaccinated fully vaccinated. People fully vaccinated with boosters people fully vaccinated with boosters and even natural immunity. You're all getting arm across one of the Attorneys General. And there are a number of them around the country have been leading the effort to fight"
1,"It's it's insane. But you know, she wants to close my brother. She wants to close. My sister is so, finally this week. We shoot her. But this is the Deep state that my father's been talking about. For years. They weren't successful in taking down my father in Washington, d.c., Despite the fact that they tried over and over and over again. That's what did they do? They send it to their cronies in New York to try and take him down and it's disgusting. And honestly, I used to have a lot of faith in the legal system in this country. I have no faith in it anymore because of a prosecutor in the United States of America, shouldn't"
2,"going to be honest. When I saw the tape put together and I put one together previously now that I saw some of the other new available video that's come out. I was I was shocked to be honest with you, anything anymore. I'm speaking about politics for what 67 years of this point. And you know, we live to the Russia hoax right? Where the FBI illegally spied on my father's campaign and made up collusion story. Nissan D21. You saw what they did to"
3,"Works through all phases of illness, because it inhibits both viral replication and modulates. The immune response. Of note chloroquine phosphate or hydroxychloroquine identified in April 2020, could actually be a treatment is identified in The Proposal as a SARS Covey to inhibitor. What does all that mean? In the internal documents, the government was passing around. It was showing that"
4,"Free People, which was the freedom to choose Milton Friedman idea that that democracy is voting on the color of your tie, which is a quote, and it was the tenants of this project for privatisation of the public sphere deregulation of the financial biron and everything that would release Capital to be as free as possible and austerity in the public sphere. And, of course, that that was accompanied by Matt criminalization and a divestment, from all of the time. It's an estate that actually help people. And"


# Diversion - peek at KG entities

In [40]:
kg_ents = pd.read_csv("data/kg-entities.csv")
kg_ents

Unnamed: 0,label,risk_level,classification,narratives
0,ᛋᛋ,3,Dog Whistle,"{""White Supremacy""}"
1,✡👃,4,Dog Whistle,{Antisemitism}
2,0b@ma,3,Dog Whistle,"{Anti-Black,QAnon}"
3,€0ViD,3,Dog Whistle,{COVID-Denialism}
4,10 days of darkness,3,Dog Whistle,"{QAnon,""US Election Integrity""}"
...,...,...,...,...
3144,黃媒黑記,3,Dog Whistle,"{""HK-Anti-Democracy Protesters""}"
3145,黃獨黑暴,3,Dog Whistle,"{""HK-Anti-Democracy Protesters""}"
3146,黃絲,3,Dog Whistle,"{""HK-Anti-Democracy Protesters""}"
3147,黑學生害死香港,3,Phrase or slogan,"{""HK-Anti-Democracy Protesters""}"


In [41]:
kg_ents['narratives'].value_counts()

{Anti-Vaccine}                                                                               542
{"COVID-Pandemic Policies"}                                                                  239
{COVID-Denialism}                                                                            200
{Antisemitism}                                                                               164
{"White Supremacy"}                                                                          128
                                                                                            ... 
{"German-Far Right",Misogyny}                                                                  1
{"Global Control Conspiracies",QAnon,US-Militia}                                               1
{Anti-Black,"German-Far Right","German-Migrants & Refugees","Global-Migrants & Refugees"}      1
{"COVID-Pandemic Policies",Islamophobia}                                                       1
{"HK-Anti-Democracy Protesters

In [88]:
# kg_ents['narratives'].unique()

In [14]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting torchvision
  Downloading torchvision-0.12.0-cp37-cp37m-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 KB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tok

In [18]:
# # load model and tokenize a sentence
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("../models/xlm-roberta-base")

In [17]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=615.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




In [37]:
text = "Vaccines provide immunity"
tokenizer.tokenize(text)

['▁Vac', 'cine', 's', '▁provide', '▁immun', 'ity']