In [185]:
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
import os
import re
import nltk
import requests


In [18]:
import numpy as np
import nltk
import re


In [97]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/haarrublar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
## loading the data without validation set (maybe in a second example)

# training and testing data
with open('X_train.txt') as train, open('X_test.txt') as test:
    X_train = train.read()
    X_test = test.read()

# training output
y_train = np.load('y_train.npy')

In [129]:
X_train



In [132]:
## cleaning the data

stop = nltk.corpus.stopwords.words('english')

# inputing the stopwords in the english vocab using regular expressions for avoiding for loops 
pattern = r'\b(?:' + '\s*|'.join(map(re.escape, stop)) + r')\b' + r'|[^\w\s]'

clean_data = [None]
for _ in range(1):
    clean_data[0] = X_train.lower().split('\n')
    clean_data[0] = list(map(lambda x: re.sub(pattern, '', x), clean_data[0]))

In [133]:
clean_data

[['came witches stayed gwent',
  'great horror game get little repetitive new fresh set maps ideallevel system trash never confused 2 addons offerings perk systemmajor downfall matchmaking system thrown level 39 killer 3 4 survivors level 1',
  'good',
  'amazing',
  'looking fun game play game game  constant laughs funny references combined action packed game play open world full nooks crannies explore makes game one favouritesgame created many memorable moments worth extremely low priceverdict  910',
  'best 3rd person survival game opinion especially dlcs',
  'good hungry jack game',
  'amazing game',
  'great game play like everyday recomend everyone',
  'full action packed fun',
  'sticks true original using hd textures one thing would actually like see rewing ability found xbla version',
  'nice racing gamesadly mp somewhat dead recomend play friends',
  'freakin crazy epic awesome adventureperfect puzzle game keep coming back ',
  'ive played dlc good 2 hours made memory fond go

BERT tokenization does involve a dictionary, but it's not a traditional word-to-meaning dictionary. Here's the breakdown:

- No Simple Split: BERT uses a special technique called WordPiece tokenization. Instead of directly splitting sentences into words, it breaks them down into smaller units – subwords. These subwords can be whole words or parts of words.
- Dictionary for Subwords: The dictionary in BERT's tokenizer is called a vocabulary. It maps each subword to a unique ID. This allows the model to efficiently represent words and even unknown words that can be constructed from known subwords.
- Benefits of Subword Tokenization: This approach offers advantages:
Handling Unknown Words: By breaking down words into subwords, BERT can handle words not present in its vocabulary. The model can represent them using combinations of known subwords (e.g., "running" can be represented as "run" and "##ing").
- Efficiency: Subword units are often smaller than full words, leading to a more compact vocabulary and potentially faster training.

Here's an analogy: Imagine a dictionary with syllables instead of whole words. You can use syllables to represent most words, even if the exact word isn't there. BERT's vocabulary works similarly with subwords.

In [232]:
# creating the vocab from the github tensorflow vocabulary

with open('vocab.txt', 'r') as f:
    lines = f.readlines()

with open('BERTvocab.txt', 'w') as f:
    for line in lines:
        new_line = 'b"' + line.strip() + '",\n'
        f.write(new_line)



## data

In [2]:
_VOCAB = [
    b"[UNK]", b"[MASK]", b"[RANDOM]", b"[CLS]", b"[SEP]", b"[PAD]", b"[START]", b"[END]", b"!", b"#", b"$", b"%", b"&", b"'", b"(", b")", b"+", b",", b"-", b".", b"/", b"0", b"1", b"2", b"3", b"4", b"5", b"6", b"7", b"8", b"9", b":", b";", b"=", b"?", b"@", b"[", b"]", b"^", b"_", b"`", b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i", b"j", b"k", b"l", b"m", b"n", b"o", b"p", b"q", b"r", b"s", b"t", b"u", b"v", b"w", b"x", b"y", b"z", b"the", b"and", b"to", b"of", b"that", b"it", b"in", b"we", b"you", b"is", b"this", b"so", b"they", b"was", b"for", b"are", b"but", b"##s", b"have", b"what", b"do", b"on", b"with", b"can", b"about", b"there", b"be", b"not", b"my", b"as", b"all", b"at", b"one", b"people", b"re", b"like", b"if", b"our", b"from", b"now", b"an", b"just", b"or", b"me", b"he", b"these", b"when", b"by", b"how", b"because", b"more", b"out", b"very", b"them", b"see", b"had", b"would", b"their", b"were", b"up", b"going", b"know", b"think", b"us", b"your", b"who", b"here", b"##ing", b"time", b"really", b"get", b"ve", b"world", b"has", b"could", b"then", b"some", b"which", b"did", b"actually", b"where", b"way", b"will", b"no", b"other", b"into", b"well", b"want", b"##ed", b"years", b"make", b"been", b"those", b"go", b"two", b"also", b"things", b"first", b"right", b"much", b"than", b"even", b"something", b"new", b"she", b"look", b"laughter", b"##d", b"only", b"many", b"need", b"little", b"life", b"take", b"his", b"let", b"over", b"##ly", b"applause", b"most", b"got", b"why", b"back", b"thing", b"work", b"does", b"said", b"every", b"lot", b"different", b"around", b"thank", b"say", b"day", b"good", b"her", b"through", b"today", b"same", b"down", b"come", b"use", b"year", b"percent", b"kind", b"ll", b"ca", b"three", b"called", b"made", b"after", b"being", b"change", b"tell", b"any", b"human", b"##er", b"find", b"talk", b"##e", b"own", b"started", b"doing", b"should", b"still", b"idea", b"fact", b"together", b"put", b"##y", b"better", b"might", b"never", b"before", b"another", b"each", b"its", b"great", b"problem", b"last", b"example", b"went", b"system", b"course", b"big", b"##al", b"part", b"##r", b"give", b"start", b"next", b"too", b"him", b"able", b"few", b"off", b"brain", b"story", b"##n", b"000", b"##t", b"important", b"again", b"long", b"thought", b"used", b"school", b"place", b"women", b"found", b"understand", b"##es", b"show", b"between", b"maybe", b"mean", b"data", b"ago", b"came", b"technology", b"point", b"question", b"bit", b"old", b"children", b"information", b"real", b"always", b"everything", b"help", b"live", b"love", b"##a", b"end", b"means", b"call", b"wanted", b"10", b"feel", b"ever", b"away", b"country", b"number", b"person", b"home", b"space", b"done", b"water", b"power", b"looking", b"believe", b"social", b"future", b"may", b"million", b"times", b"imagine", b"using", b"create", b"four", b"small", b"five", b"without", b"become", b"second", b"best", b"less", b"money", b"left", b"am", b"working", b"science", b"##ers", b"comes", b"lives", b"whole", b"city", b"learn", b"thinking", b"trying", b"talking", b"ask", b"energy", b"kids", b"making", b"took", b"across", b"high", b"days", b"food", b"getting", b"such", b"told", b"hard", b"try", b"body", b"family", b"moment", b"health", b"man", b"music", b"okay", b"happen", b"interesting", b"light", b"##o", b"global", b"happened", b"young", b"makes", b"enough", b"pretty", b"video", b"almost", b"case", b"hand", b"side", b"##i", b"sort", b"building", b"ways", b"##l", b"simple", b"half", b"countries", b"yet", b"build", b"often", b"project", b"saw", b"sense", b"myself", b"care", b"inside", b"quite", b"later", b"matter", b"##able", b"asked", b"room", b"once", b"friends", b"probably", b"else", b"open", b"while", b"experience", b"already", b"happens", b"remember", b"both", b"far", b"self", b"public", b"computer", b"having", b"move", b"wrong", b"##ness", b"answer", b"goes", b"internet", b"living", b"words", b"car", b"education", b"coming", b"reason", b"until", b"anything", b"stop", b"community", b"dollars", b"level", b"looked", b"##ation", b"20", b"age", b"design", b"set", b"states", b"amazing", b"men", b"billion", b"face", b"nothing", b"business", b"picture", b"##h", b"read", b"someone", b"earth", b"exactly", b"seen", b"possible", b"under", b"society", b"keep", b"learned", b"works", b"bad", b"process", b"art", b"students", b"became", b"everybody", b"whether", b"history", b"true", b"yes", b"stuff", b"within", b"##ic", b"##k", b"##on", b"mind", b"questions", b"sometimes", b"language", b"single", b"stories", b"control", b"sure", b"black", b"job", b"knew", b"though", b"africa", b"completely", b"cells", b"instead", b"others", b"since", b"##m", b"built", b"everyone", b"learning", b"months", b"must", b"child", b"says", b"six", b"united", b"changed", b"heard", b"war", b"hope", b"group", b"mother", b"run", b"form", b"large", b"share", b"state", b"oh", b"book", b"past", b"play", b"universe", b"##in", b"places", b"sound", b"##ment", b"basically", b"decided", b"free", b"government", b"order", b"research", b"##ity", b"looks", b"night", b"news", b"taking", b"reality", b"turns", b"bring", b"company", b"30", b"turn", b"100", b"against", b"saying", b"top", b"##ive", b"itself", b"easy", b"gets", b"middle", b"natural", b"couple", b"lots", b"takes", b"##rs", b"companies", b"model", b"nature", b"ourselves", b"50", b"early", b"name", b"problems", b"woman", b"study", b"yeah", b"cars", b"century", b"cities", b"kinds", b"planet", b"species", b"word", b"disease", b"cancer", b"dna", b"hear", b"##th", b"based", b"head", b"huge", b"powerful", b"size", b"themselves", b"thousands", b"beautiful", b"created", b"difficult", b"felt", b"line", b"piece", b"##p", b"air", b"hours", b"##le", b"finally", b"image", b"trust", b"##an", b"ideas", b"##ry", b"##ting", b"happening", b"voice", b"worked", b"ones", b"leave", b"lost", b"eight", b"fear", b"somebody", b"american", b"cost", b"perhaps", b"behind", b"environment", b"guy", b"##ted", b"gave", b"needs", b"particular", b"##us", b"america", b"outside", b"value", b"##ion", b"full", b"rather", b"economic", b"per", b"scale", b"third", b"longer", b"low", b"needed", b"parents", b"violence", b"beginning", b"eyes", b"heart", b"physical", b"##g", b"##ized", b"difference", b"father", b"complex", b"entire", b"week", b"least", b"machine", b"friend", b"challenge", b"economy", b"numbers", b"test", b"blood", b"china", b"given", b"known", b"met", b"along", b"during", b"happy", b"pay", b"simply", b"york", b"15", b"##nt", b"front", b"key", b"seven", b"step", b"technologies", b"yourself", b"##b", b"##ies", b"population", b"##or", b"cell", b"media", b"##ate", b"culture", b"field", b"green", b"realized", b"anyone", b"humans", b"minutes", b"seeing", b"knowledge", b"network", b"systems", b"alone", b"taken", b"##ally", b"##less", b"animals", b"began", b"figure", b"god", b"speak", b"tools", b"write", b"behavior", b"born", b"clear", b"close", b"house", b"incredible", b"phone", b"teachers", b"##ian", b"images", b"individual", b"amount", b"deal", b"normal", b"absolutely", b"books", b"hundreds", b"political", b"turned", b"watch", b"ted", b"walk", b"local", b"grow", b"opportunity", b"road", b"tried", b"white", b"feeling", b"street", b"##et", b"access", b"either", b"poor", b"wall", b"##ism", b"##ist", b"carbon", b"hands", b"lived", b"nuclear", b"online", b"team", b"##ter", b"##tion", b"industry", b"law", b"death", b"ground", b"growth", b"oil", b"personal", b"red", b"rest", b"certain", b"dark", b"europe", b"millions", b"rate", b"stay", b"##st", b"common", b"digital", b"eat", b"girl", b"type", b"view", b"whatever", b"risk", b"scientists", b"terms", b"weeks", b"realize", b"support", b"##c", b"act", b"climate", b"growing", b"neurons", b"patients", b"seems", b"teach", b"##u", b"audience", b"india", b"sounds", b"structure", b"telling", b"tiny", b"wonderful", b"##ts", b"changes", b"developed", b"present", b"recently", b"solution", b"stand", b"allow", b"fast", b"gone", b"guys", b"parts", b"similar", b"starting", b"dream", b"short", b"changing", b"class", b"interested", b"kid", b"map", b"quickly", b"understanding", b"##ling", b"brought", b"drive", b"english", b"google", b"hold", b"modern", b"moving", b"spend", b"sun", b"##ble", b"indeed", b"message", b"paper", b"##en", b"##est", b"##ize", b"##ous", b"40", b"animal", b"code", b"color", b"everywhere", b"lab", b"national", b"explain", b"girls", b"ok", b"schools", b"solve", b"university", b"##ful", b"##ia", b"ability", b"area", b"land", b"mental", b"result", b"save", b"shows", b"theory", b"truth", b"web", b"##ure", b"average", b"biggest", b"buy", b"journey", b"areas", b"cool", b"crazy", b"eventually", b"forward", b"giving", b"hit", b"impact", b"incredibly", b"literally", b"material", b"response", b"sex", b"asking", b"available", b"computers", b"development", b"hundred", b"security", b"##it", b"issue", b"revolution", b"societies", b"stage", b"wrote", b"##ar", b"##ary", b"center", b"meet", b"miles", b"obviously", b"south", b"term", b"writing", b"##ial", b"choice", b"creating", b"discovered", b"sitting", b"##ine", b"cause", b"deep", b"groups", b"nobody", b"ready", b"several", b"situation", b"starts", b"beyond", b"buildings", b"cut", b"democracy", b"especially", b"plant", b"running", b"worth", b"action", b"baby", b"higher", b"hour", b"likely", b"major", b"nine", b"perfect", b"role", b"sea", b"add", b"begin", b"continue", b"die", b"finding", b"patient", b"reasons", b"send", b"source", b"spent", b"##man", b"attention", b"bottom", b"developing", b"device", b"seem", b"showed", b"bigger", b"box", b"dead", b"experiment", b"market", b"nice", b"object", b"pain", b"product", b"##re", b"##se", b"##ty", b"chinese", b"connected", b"loved", b"mom", b"networks", b"produce", b"solar", b"surface", b"teacher", b"wo", b"25", b"eye", b"institutions", b"special", b"##ant", b"60", b"becomes", b"east", b"effect", b"further", b"genes", b"guess", b"movement", b"suddenly", b"blue", b"fun", b"gives", b"knows", b"older", b"rules", b"son", b"choose", b"consider", b"extraordinary", b"focus", b"generation", b"killed", b"patterns", b"putting", b"scientific", b"wait", b"##0", b"##ated", b"##ce", b"##te", b"families", b"freedom", b"innovation", b"math", b"medical", b"soon", b"program", b"safe", b"shared", b"traditional", b"wife", b"##age", b"##ge", b"##um", b"crisis", b"gas", b"medicine", b"notice", b"pictures", b"showing", b"college", b"died", b"force", b"largest", b"leaders", b"models", b"perspective", b"pick", b"training", b"truly", b"##4", b"among", b"approach", b"blind", b"electricity", b"faster", b"grew", b"meant", b"park", b"potential", b"resources", b"results", b"sit", b"wish", b"##is", b"##ping", b"##ve", b"##z", b"anybody", b"anymore", b"communities", b"decision", b"exciting", b"favorite", b"fly", b"led", b"mass", b"measure", b"moved", b"smart", b"student", b"teaching", b"totally", b"towards", b"written", b"chance", b"creative", b"designed", b"evidence", b"fight", b"however", b"larger", b"late", b"minute", b"plan", b"politics", b"quality", b"speed", b"spread", b"thousand", b"##ent", b"##land", b"break", b"examples", b"follow", b"lead", b"listen", b"materials", b"please", b"relationships", b"somewhere", b"strong", b"success", b"##as", b"brains", b"certainly", b"doctor", b"effective", b"game", b"general", b"hospital", b"individuals", b"month", b"morning", b"reach", b"talked", b"thanks", b"unique", b"##7", b"##king", b"african", b"conversation", b"governments", b"issues", b"malaria", b"ocean", b"particularly", b"poverty", b"rights", b"successful", b"taught", b"visual", b"wonder", b"biology", b"forest", b"physics", b"provide", b"relationship", b"##ped", b"##red", b"allowed", b"architecture", b"drug", b"film", b"mobile", b"near", b"shape", b"speaking", b"worse", b"11", b"12", b"driving", b"fire", b"humanity", b"military", b"museum", b"page", b"pieces", b"wants", b"##6", b"##8", b"##ns", b"basic", b"exist", b"fall", b"office", b"police", b"subject", b"travel", b"##ch", b"##el", b"##ors", b"200", b"90", b"actual", b"boy", b"clearly", b"cold", b"costs", b"expect", b"facebook", b"feet", b"main", b"non", b"organization", b"reading", b"##ating", b"##ding", b"##f", b"##ging", b"##ll", b"becoming", b"capital", b"dance", b"diseases", b"increase", b"policy", b"song", b"standing", b"supposed", b"walking", b"##1", b"##ists", b"18", b"allows", b"attack", b"camera", b"compassion", b"complicated", b"develop", b"direction", b"expensive", b"lines", b"listening", b"onto", b"path", b"photos", b"range", b"##2", b"##3", b"##at", b"##na", b"##nce", b"##ual", b"alive", b"cases", b"connection", b"decades", b"door", b"drugs", b"lower", b"peace", b"practice", b"recognize", b"usually", b"##ke", b"##ne", b"##ring", b"80", b"above", b"double", b"essentially", b"experiences", b"mine", b"remarkable", b"sent", b"smaller", b"sorts", b"tool", b"understood", b"west", b"17", b"bodies", b"degrees", b"devices", b"improve", b"involved", b"list", b"points", b"protect", b"site", b"##led", b"##line", b"artist", b"brand", b"fish", b"following", b"fundamental", b"helped", b"international", b"kept", b"matters", b"record", b"shown", b"tells", b"trees", b"village", b"vision", b"##ance", b"##ck", b"##il", b"##ization", b"ancient", b"anyway", b"clean", b"colleagues", b"gps", b"himself", b"hole", b"including", b"insects", b"interest", b"jobs", b"lose", b"screen", b"skin", b"surgery", b"tend", b"treatment", b"wind", b"##ged", b"##ra", b"central", b"challenges", b"closer", b"critical", b"dangerous", b"familiar", b"female", b"genome", b"glass", b"highly", b"nearly", b"north", b"pattern", b"position", b"private", b"products", b"robot", b"zero", b"##ious", b"##w", b"500", b"bank", b"complete", b"concept", b"doctors", b"evolution", b"financial", b"impossible", b"income", b"mr", b"somehow", b"specific", b"watching", b"workers", b"bed", b"bees", b"block", b"context", b"function", b"greatest", b"invisible", b"magic", b"massive", b"passwords", b"price", b"production", b"religion", b"silk", b"speech", b"square", b"table", b"text", b"trade", b"victims", b"##ks", b"##ma", b"##tor", b"coal", b"fuel", b"goal", b"hey", b"knowing", b"memory", b"paid", b"post", b"saving", b"sharing", b"unfortunately", b"##9", b"##ving", b"campaign", b"citizens", b"daughter", b"emotional", b"era", b"fix", b"held", b"passion", b"photo", b"raise", b"recent", b"serious", b"slow", b"software", b"streets", b"terrible", b"waiting", b"##izing", b"##x", b"13", b"70", b"board", b"bringing", b"dad", b"engage", b"excited", b"fine", b"hot", b"immediately", b"letter", b"molecules", b"objects", b"projects", b"ran", b"secret", b"useful", b"weight", b"##ct", b"##ible", b"##ile", b"##ton", b"brings", b"communicate", b"farmers", b"genetic", b"hair", b"meaning", b"members", b"moon", b"obvious", b"painting", b"prime", b"quantum", b"return", b"service", b"services", b"shift", b"tv", b"##nd", b"##ration", b"afghanistan", b"ahead", b"beings", b"conditions", b"earlier", b"epidemic", b"floor", b"forms", b"gene", b"greater", b"investment", b"king", b"noise", b"president", b"pressure", b"region", b"star", b"stars", b"stopped", b"apply", b"bacteria", b"chris", b"classroom", b"communication", b"cultural", b"decade", b"easier", b"failure", b"focused", b"forget", b"ice", b"kill", b"markets", b"otherwise", b"rain", b"rise", b"solutions", b"straight", b"strange", b"survive", b"traffic", b"urban", b"##ie", b"##les", b"##per", b"300", b"degree", b"experiments", b"extremely", b"mechanical", b"noticed", b"paint", b"playing", b"rich", b"sat", b"search", b"seemed", b"sky", b"therefore", b"worry", b"##id", b"answers", b"bill", b"connect", b"effects", b"fashion", b"healthy", b"series", b"sorry", b"surprising", b"sustainable", b"throughout", b"win", b"##os", b"address", b"cheap", b"check", b"civil", b"complexity", b"conflict", b"engine", b"equal", b"event", b"flow", b"infrastructure", b"laws", b"lesson", b"mathematics", b"particles", b"tissue", b"touch", b"although", b"broken", b"creativity", b"dimensional", b"directly", b"european", b"except", b"extreme", b"fully", b"husband", b"lie", b"machines", b"positive", b"risks", b"scientist", b"tree", b"visit", b"##cy", b"##ds", b"##ities", b"##ning", b"##ro", b"##ue", b"24", b"anywhere", b"blocks", b"brother", b"bunch", b"designers", b"despite", b"easily", b"email", b"feed", b"feels", b"fit", b"higgs", b"legs", b"loss", b"multiple", b"pass", b"period", b"pull", b"push", b"stress", b"##ence", b"##me", b"##ments", b"##ps", b"##up", b"##way", b"apart", b"aware", b"beauty", b"begins", b"bread", b"bridge", b"career", b"decide", b"differently", b"draw", b"driver", b"edge", b"famous", b"heat", b"holding", b"iran", b"protection", b"sign", b"skills", b"sleep", b"slightly", b"spider", b"studies", b"target", b"total", b"town", b"train", b"trial", b"western", b"##hip", b"##ls", b"##ny", b"##ters", b"afford", b"afraid", b"artists", b"believed", b"below", b"birds", b"birth", b"capacity", b"content", b"elements", b"feelings", b"forever", b"helping", b"holes", b"illness", b"importantly", b"industrial", b"john", b"justice", b"kilometers", b"meters", b"mostly", b"named", b"original", b"plants", b"popular", b"responsibility", b"river", b"shot", b"spaces", b"types", b"upon", b"waste", b"##ans", b"##de", b"affect", b"affected", b"army", b"ball", b"bang", b"click", b"depression", b"ended", b"everyday", b"invented", b"necessary", b"nervous", b"possibly", b"press", b"prison", b"related", b"seconds", b"sight", b"store", b"surprise", b"turning", b"uses", b"won", b"##cing", b"##om", b"##ur", b"##zing", b"basis", b"current", b"decisions", b"dots", b"engineering", b"evolved", b"feedback", b"forth", b"instance", b"israel", b"languages", b"levels", b"method", b"microbes", b"none", b"opposite", b"performance", b"robots", b"sand", b"studied", b"temperature", b"twitter", b"values", b"version", b"virtual", b"walked", b"whose", b"##bs", b"##sing", b"##tic", b"california", b"cards", b"chemistry", b"choices", b"consciousness", b"core", b"count", b"creates", b"drop", b"eating", b"electric", b"final", b"happiness", b"intelligence", b"joy", b"leaves", b"letters", b"random", b"walls", b"willing", b"##ative", b"##ets", b"##ey", b"##ned", b"##ries", b"##ular", b"16", b"accept", b"agree", b"arab", b"benefits", b"british", b"cross", b"daily", b"desire", b"discover", b"dog", b"dollar", b"efficient", b"enormous", b"gender", b"hearing", b"inspired", b"lack", b"leading", b"london", b"majority", b"messages", b"moral", b"nation", b"oxytocin", b"professor", b"raised", b"respect", b"san", b"sentence", b"sister", b"structures", b"track", b"transform", b"ultimately", b"vote", b"wide", b"worst", b"##ish", b"##istic", b"##sion", b"##uring", b"brazil", b"caught", b"collective", b"connections", b"curious", b"differences", b"distance", b"due", b"economics", b"essential", b"extra", b"fantastic", b"harder", b"imagination", b"lying", b"minds", b"organizations", b"password", b"progress", b"prove", b"received", b"religious", b"saved", b"sell", b"slowly", b"soil", b"stuck", b"summer", b"twice", b"whom", b"##5", b"##ins", b"##ium", b"##rate", b"##ut", b"americans", b"applications", b"avoid", b"carry", b"chain", b"cognitive", b"describe", b"described", b"effectively", b"einstein", b"em", b"error", b"etc", b"facts", b"fail", b"fat", b"forces", b"gap", b"graph", b"landscape", b"manage", b"managed", b"motion", b"passed", b"purpose", b"quick", b"radio", b"report", b"respond", b"sequence", b"spending", b"standard", b"super", b"tough", b"treat", b"trillion", b"various", b"voices", b"watched", b"website", b"window", b"##ens", b"##ier", b"##sh", b"##tes", b"14", b"3d", b"active", b"arm", b"babies", b"cheaper", b"compared", b"details", b"dry", b"dung", b"experienced", b"explore", b"fascinating", b"foundation", b"ignorance", b"join", b"moments", b"operating", b"played", b"predict", b"privacy", b"secure", b"silence", b"suffering", b"task", b"tons", b"trained", b"valuable", b"vast", b"##ction", b"##lin", b"20th", b"accident", b"according", b"aging", b"apple", b"chemical", b"citizen", b"corner", b"cure", b"damage", b"deliver", b"documents", b"effort", b"failed", b"fairly", b"foreign", b"identity", b"influence", b"lights", b"neighborhood", b"normally", b"race", b"release", b"rock", b"rule", b"safety", b"sick", b"sides", b"signal", b"spirit", b"underneath", b"vehicle", b"##ants", b"##ations", b"##ite", b"##ler", b"##ner", b"##tle", b"2008", b"awesome", b"behaviors", b"billions", b"biological", b"bits", b"bubble", b"calling", b"carefully", b"causes", b"charge", b"compare", b"copyright", b"definition", b"discovery", b"economies", b"entropy", b"faces", b"factors", b"faith", b"fake", b"gotten", b"grade", b"gut", b"heads", b"immune", b"iraq", b"male", b"marketing", b"meat", b"mountain", b"narrative", b"opportunities", b"peer", b"phenomenon", b"phones", b"plus", b"produced", b"rural", b"savings", b"sexual", b"stick", b"suicide", b"terrorism", b"vaccines", b"virus", b"wealth", b"##ad", b"##nts", b"##out", b"##rating", b"##ulate", b"##ves", b"activity", b"arms", b"atmosphere", b"belief", b"canada", b"civilization", b"cloud", b"condition", b"currently", b"factory", b"flu", b"followed", b"france", b"galaxies", b"giant", b"gold", b"hate", b"institution", b"japan", b"kenya", b"launched", b"lucky", b"mars", b"mathematical", b"molecule", b"mosquitos", b"photograph", b"poetry", b"principles", b"published", b"roll", b"struggle", b"teeth", b"##ably", b"##ked", b"##ot", b"##pe", b"##rt", b"##son", b"aid", b"balance", b"bought", b"brown", b"calls", b"cat", b"characters", b"childhood", b"chip", b"colors", b"date", b"deeply", b"enjoy", b"false", b"french", b"hiv", b"limited", b"match", b"missing", b"mission", b"moves", b"net", b"novel", b"opened", b"pages", b"papers", b"party", b"patent", b"politicians", b"programs", b"religions", b"reputation", b"scratch", b"sites", b"strangers", b"surprised", b"weather", b"##ified", b"##med", b"##tive", b"2010", b"asia", b"bear", b"blah", b"britain", b"caused", b"character", b"circle", b"collect", b"depends", b"destruction", b"direct", b"dr", b"egypt", b"election", b"entirely", b"events", b"fair", b"fewer", b"forced", b"gdp", b"germany", b"gift", b"homes", b"layer", b"leaving", b"mark", b"meeting", b"mouse", b"movie", b"paris", b"possibilities", b"possibility", b"prize", b"radiation", b"roads", b"scared", b"september", b"shame", b"signals", b"slide", b"threat", b"toilet", b"tumor", b"unit", b"videos", b"waves", b"##da", b"##ical", b"##ology", b"account", b"artificial", b"background", b"beat", b"brilliant", b"closed", b"conference", b"contact", b"cooking", b"creation", b"cyrus", b"dimensions", b"disability", b"domestic", b"dot", b"dying", b"elephant", b"employees", b"empty", b"hospitals", b"illegal", b"invest", b"lies", b"nations", b"passionate", b"platform", b"pop", b"protein", b"recorded", b"separate", b"songs", b"studying", b"survey", b"testing", b"trip", b"universal", b"violent", b"worldwide", b"##by", b"##dy", b"##ics", b"##ms", b"##one", b"##que", b"##ric", b"##tions", b"##to", b"##ver", b"advanced", b"amazon", b"applied", b"axis", b"boys", b"broke", b"centers", b"considered", b"conversations", b"correct", b"de", b"dealing", b"dinner", b"dreams", b"driven", b"elections", b"facing", b"fighting", b"funny", b"grandfather", b"grandmother", b"helps", b"hidden", b"honest", b"hum", b"identical", b"keeps", b"labor", b"mexico", b"names", b"offer", b"ordinary", b"outcomes", b"planets", b"professional", b"profound", b"proteins", b"reaction", b"remain", b"significant", b"smell", b"sold", b"stood", b"suffer", b"supply", b"switch", b"typical", b"unless", b"versus", b"younger", b"##am", b"##ft", b"##ram", b"##sed", b"##tors", b"##v", b"actions", b"argue", b"australia", b"capable", b"capture", b"constantly", b"cover", b"efficiency", b"emotions", b"ends", b"engineers", b"environmental", b"fell", b"filled", b"fundamentally", b"generations", b"hopefully", b"increasingly", b"insight", b"interact", b"notion", b"partner", b"previous", b"proud", b"shoes", b"soft", b"survival", b"talks", b"tomorrow", b"transportation", b"viruses", b"wearing", b"youtube", b"##ak", b"##ay", b"##des", b"##place", b"##sts", b"##time", b"##unk", b"150", b"400", b"acting", b"advice", b"awareness", b"bright", b"card", b"concrete", b"convinced", b"crispr", b"debate", b"deeper", b"doubt", b"ears", b"expected", b"explained", b"expression", b"farm", b"foot", b"grown", b"hi", b"hotel", b"leader", b"mentioned", b"mouth", b"mystery", b"rare", b"repeat", b"requires", b"responsible", b"rooms", b"scene", b"seriously", b"sets", b"ship", b"signs", b"silent", b"statistics", b"therapy", b"thinks", b"tracking", b"treated", b"variation", b"wow", b"yellow", b"##ber", b"##cent", b"##cted", b"##ians", b"##ions", b"##ose", b"##pt", b"##ta", b"##tain", b"##ud", b"##ulated", b"##ward", b"##ze", b"2009", b"21st", b"accurate", b"added", b"adults", b"advantage", b"anger", b"application", b"argument", b"battle", b"benefit", b"catch", b"comfortable", b"constant", b"court", b"curve", b"danger", b"demand", b"designing", b"disorders", b"diversity", b"experts", b"exploration", b"fellow", b"fossil", b"fourth", b"garden", b"goods", b"haiti", b"honor", b"laboratory", b"leads", b"legal", b"loud", b"manufacturing", b"maps", b"member", b"mile", b"organized", b"perfectly", b"personally", b"practical", b"promise", b"relevant", b"repair", b"represent", b"require", b"serve", b"steps", b"strength", b"struck", b"sudden", b"suggest", b"sum", b"ten", b"tens", b"trouble", b"vulnerable", b"windows", b"##ability", b"##ir", b"##its", b"##low", b"##ri", b"##ten", b"##ures", b"2011", b"75", b"abuse", b"achieve", b"agriculture", b"atoms", b"bomb", b"bone", b"businesses", b"closely", b"consequences", b"deaths", b"democratic", b"disappear", b"disaster", b"educational", b"emissions", b"fibers", b"gain", b"generally", b"gray", b"hell", b"interests", b"jump", b"killing", b"march", b"mistake", b"mistakes", b"naturally", b"necessarily", b"oceans", b"pig", b"pleasure", b"policies", b"properties", b"pulled", b"quiet", b"ride", b"sanitation", b"sending", b"shapes", b"soul", b"statement", b"station", b"volume", b"wave", b"welcome", b"wondering", b"##ard", b"##bility", b"##car", b"##ery", b"##go", b"##hed", b"##ip", b"##ished", b"##lo", b"##ons", b"##rable", b"##rd", b"##ses", b"##sive", b"##sm", b"##ute", b"coffee", b"connectome", b"cortex", b"desert", b"ear", b"element", b"empathy", b"equality", b"equivalent", b"features", b"fields", b"figured", b"forests", b"frame", b"hall", b"hide", b"highest", b"increased", b"inner", b"jealousy", b"latin", b"lay", b"losing", b"magazine", b"management", b"marriage", b"married", b"meaningful", b"medium", b"mice", b"options", b"organ", b"principle", b"property", b"providing", b"remind", b"remote", b"sample", b"section", b"sees", b"sentences", b"solving", b"southern", b"specifically", b"stayed", b"stem", b"stronger", b"technical", b"techniques", b"transformation", b"transition", b"unlike", b"washington", b"wear", b"worried", b"##ah", b"##ama", b"##bly", b"##board", b"##ify", b"##ings", b"##ka", b"##light", b"##my", b"##ural", b"adding", b"aids", b"alzheimer", b"associated", b"base", b"beach", b"bet", b"bg", b"bitcoin", b"church", b"conscious", b"construction", b"consumers", b"contrast", b"curiosity", b"detect", b"distributed", b"drawing", b"ecosystem", b"embrace", b"equally", b"exact", b"existence", b"faced", b"generate", b"inspiration", b"introduce", b"memories", b"molecular", b"mortality", b"needle", b"neighbors", b"obsessed", b"opening", b"operate", b"particle", b"physically", b"potentially", b"processing", b"reward", b"richard", b"roughly", b"searching", b"shadow", b"shipping", b"solved", b"stone", b"strategy", b"thoughts", b"toward", b"traveling", b"trend", b"user", b"visible", b"voting", b"weapon", b"##als", b"##bed", b"##bia", b"##bit", b"##bling", b"##der", b"##ess", b"##her", b"##inal", b"##ncy", b"##ol", b"##ron", b"##val", b"affordable", b"amongst", b"app", b"challenging", b"channel", b"chose", b"classrooms", b"collection", b"consumer", b"consumption", b"copy", b"council", b"covered", b"department", b"drove", b"emerging", b"engaged", b"entrepreneurs", b"exchange", b"explanation", b"fiction", b"finished", b"formed", b"fought", b"grateful", b"gravity", b"humanitarian", b"hypothesis", b"indian", b"interactive", b"invited", b"lessons", b"los", b"microscope", b"organs", b"ours", b"perception", b"processes", b"puts", b"puzzle", b"reduce", b"resource", b"row", b"sad", b"score", b"shaped", b"spring", b"television", b"topic", b"tradition", b"ultimate", b"vaccine", b"variety", b"wanting", b"weapons", b"wild", b"wk", b"##ee", b"##ky", b"##la", b"##ming", b"##unt", b"addition", b"admit", b"ages", b"algorithm", b"alternative", b"analysis", b"anderson", b"anti", b"attacks", b"babbage", b"beetle", b"border", b"boxes", b"cable", b"chart", b"circumstances", b"claim", b"clinical", b"combine", b"commercial", b"cook", b"credit", b"crying", b"cutting", b"detail", b"dialogue", b"distribution", b"earthquake", b"elderly", b"electronics", b"exercise", b"finish", b"fund", b"galaxy", b"games", b"handle", b"heroes", b"innovative", b"interaction", b"interview", b"january", b"joined", b"link", b"mit", b"negative", b"opinion", b"pakistan", b"picked", b"planning", b"pounds", b"print", b"prostate", b"rates", b"replace", b"responses", b"round", b"samples", b"seat", b"skeleton", b"sources", b"staff", b"standards", b"status", b"surely", b"surgeon", b"technique", b"tied", b"tom", b"tonight", b"traveled", b"users", b"wake", b"wheelchair", b"zone", b"##ade", b"##ded", b"##ering", b"##ient", b"##ily", b"##im", b"##ise", b"##ivity", b"##ld", b"##men", b"##uck", b"##ulation", b"##va", b"19th", b"accessible", b"afternoon", b"angeles", b"anonymous", b"arrived", b"bird", b"breakfast", b"commitment", b"conflicts", b"crime", b"crucial", b"customers", b"deception", b"define", b"dense", b"disorder", b"dynamic", b"expand", b"flying", b"fuels", b"functions", b"gates", b"graduate", b"grows", b"gun", b"harvard", b"hits", b"implications", b"importance", b"infinite", b"input", b"interactions", b"internal", b"island", b"jail", b"james", b"lady", b"launch", b"lecture", b"lifetime", b"milk", b"nobel", b"overall", b"partners", b"personality", b"phrase", b"poorest", b"prosperity", b"radical", b"regular", b"released", b"required", b"returned", b"runs", b"scan", b"selling", b"sir", b"spiders", b"spreading", b"taste", b"technological", b"tissues", b"truck", b"worker", b"##ars", b"##ast", b"##fe", b"##gs", b"##ire", b"##ney", b"##rated", b"##rative", b"##sis", b"2000", b"absolute", b"aim", b"badly", b"barely", b"bee", b"birthday", b"bones", b"borders", b"busy", b"camp", b"coast", b"collected", b"confident", b"continent", b"contract", b"convince", b"counter", b"creatures", b"cylinder", b"destroyed", b"detection", b"discussion", b"document", b"dropped", b"essence", b"exists", b"francisco", b"george", b"globe", b"host", b"hurt", b"ii", b"include", b"industries", b"iphone", b"judge", b"leather", b"library", b"maintain", b"master", b"mechanism", b"nerves", b"nor", b"note", b"nowhere", b"offered", b"optimistic", b"option", b"oxygen", b"painful", b"paintings", b"pair", b"participate", b"peers", b"perform", b"plans", b"pm", b"practices", b"presented", b"printing", b"quote", b"remains", b"researchers", b"review", b"rid", b"saudi", b"scary", b"sector", b"signed", b"spectrum", b"ss", b"succeed", b"suggests", b"tested", b"transparency", b"tremendous", b"trick", b"uncle", b"units", b"usual", b"victim", b"wars", b"zoom", b"##ator", b"##ca", b"##day", b"##ell", b"##ger", b"##hood", b"##ition", b"##ners", b"##oid", b"##ow", b"##pers", b"##sa", b"##ulating", b"21", b"22", b"27", b"academy", b"accepted", b"agreed", b"ai", b"al", b"algorithms", b"aspects", b"bag", b"careful", b"colleague", b"committed", b"competition", b"concerned", b"courage", b"cultures", b"drinking", b"drives", b"ecosystems", b"emergency", b"engaging", b"england", b"enter", b"equipment", b"evening", b"expansion", b"fabric", b"figures", b"friday", b"gather", b"german", b"ghana", b"historical", b"hoping", b"houses", b"illusion", b"improvement", b"intelligent", b"june", b"keeping", b"leadership", b"location", b"loop", b"measured", b"missed", b"mothers", b"neuron", b"observe", b"occur", b"operation", b"paying", b"placed", b"pocket", b"pre", b"rapidly", b"recover", b"relatively", b"rely", b"represents", b"rice", b"setting", b"severe", b"soup", b"stands", b"stored", b"sub", b"sweet", b"symbol", b"symptoms", b"throw", b"transport", b"union", b"vehicles", b"views", b"weird", b"wondered", b"##ality", b"##aries", b"##ators", b"##ched", b"##end", b"##ibility", b"##ically", b"##ign", b"##io", b"##nation", b"##ological", b"##ologist", b"##ored", b"##ory", b"##ray", b"##ria", b"##rn", b"##tation", b"##tter", b"##uate", b"##ung", b"##ured", b"##ved", b"##zy", b"2012", b"45", b"additional", b"adult", b"aspect", b"battery", b"bicycle", b"bind", b"blockchain", b"capitalism", b"carried", b"clarity", b"classic", b"clever", b"combination", b"comet", b"computing", b"confidence", b"contribute", b"crops", b"cycle", b"dancing", b"determine", b"diagnosed", b"dignity", b"dozen", b"drawn", b"environments", b"expertise", b"express", b"fixed", b"fortunately", b"grand", b"heavy", b"holds", b"horse", b"hungry", b"identify", b"ignore", b"inevitable", b"infection", b"injustice", b"inspire", b"intellectual", b"interacting", b"introduced", b"largely", b"layers", b"limits", b"literature", b"mention", b"metaphor", b"micro", b"movies", b"neighborhoods", b"occasionally", b"odd", b"offices", b"organic", b"percentage", b"phase", b"poem", b"populations", b"posted", b"precisely", b"primary", b"productivity", b"provided", b"quarter", b"rats", b"sophisticated", b"speakers", b"steel", b"style", b"taliban", b"teenagers", b"thomas", b"treatments", b"uncomfortable", b"wisdom", b"yours", b"##ages", b"##atory", b"##fi", b"##ges", b"##let", b"##ography", b"##ome", b"##rine", b"##rts", b"##ss", b"##ture", b"##ug", b"##ule", b"##use", b"2007", b"35", b"allowing", b"angry", b"apartment", b"assume", b"awful", b"boat", b"burning", b"button", b"centuries", b"clock", b"co2", b"coach", b"coke", b"collapse", b"colony", b"contrary", b"crowd", b"david", b"delivered", b"deserve", b"designer", b"director", b"eastern", b"electronic", b"exposed", b"falling", b"falls", b"figuring", b"flexible", b"flowers", b"fold", b"fresh", b"globalization", b"globally", b"goals", b"granted", b"increasing", b"initial", b"instant", b"insurance", b"interface", b"islands", b"latest", b"leg", b"leonardo", b"linked", b"literacy", b"mad", b"methods", b"miss", b"mustache", b"nodes", b"organize", b"plenty", b"polio", b"precision", b"prevent", b"prisoners", b"raw", b"remove", b"removed", b"reverse", b"reviews", b"ring", b"rocket", b"roof", b"secondly", b"seeds", b"served", b"spinal", b"staying", b"swimming", b"telescope", b"universes", b"warming", b"wherever", b"writer", b"##ain", b"##atic", b"##etic", b"##ff", b"##ha", b"##ice", b"##iest", b"##ig", b"##iles", b"##lands", b"##load", b"##logy", b"##nia", b"##no", b"##ond", b"##oo", b"##point", b"##posing", b"##py", b"##rging", b"##room", b"##shed", b"##side", b"##stic", b"##ties", b"##uation", b"##uing", b"##uous", b"##ways", b"##wn", b"19", b"2001", b"2050", b"appreciate", b"aside", b"audio", b"cameras", b"capita", b"category", b"celebrate", b"cents", b"clip", b"clothes", b"collaborative", b"controlled", b"convey", b"courses", b"cup", b"cybercriminals", b"debt", b"definitely", b"drill", b"duck", b"edi", b"enable", b"encourage", b"enemy", b"existing", b"explaining", b"exploring", b"factor", b"farming", b"fascinated", b"flower", b"frankly", b"frozen", b"grab", b"grave", b"harm", b"height", b"instructions", b"investors", b"involves", b"joke", b"journalist", b"leap", b"length", b"liked", b"liquid", b"logic", b"manhattan", b"measuring", b"mid", b"movember", b"musical", b"native", b"neither", b"newspaper", b"notes", b"output", b"pairs", b"plastic", b"prepare", b"prepared", b"raising", b"rarely", b"reconstruct", b"restaurant", b"retina", b"retirement", b"rising", b"seek", b"shelter", b"spot", b"string", b"subjects", b"tap", b"theater", b"trials", b"tricks", b"trucks", b"uncertainty", b"van", b"whenever", b"whereas", b"##aging", b"##aking", b"##ale", b"##ana", b"##ase", b"##bi", b"##co", b"##een", b"##els", b"##ency", b"##fully", b"##fy", b"##house", b"##ide", b"##imize", b"##ina", b"##ined", b"##ites", b"##ives", b"##ley", b"##od", b"##ously", b"##rant", b"##try", b"##ution", b"##vation", b"##zed", b"##zi", b"250", b"800", b"amounts", b"ancestors", b"apparently", b"attempt", b"autism", b"bell", b"bow", b"breath", b"burden", b"cambridge", b"chicago", b"club", b"concepts", b"conclusion", b"congestion", b"crash", b"currency", b"damaged", b"deadly", b"delivery", b"diverse", b"dramatic", b"drivers", b"ecology", b"efforts", b"embedded", b"engineer", b"equation", b"existed", b"expanding", b"expert", b"fifth", b"fill", b"flight", b"floating", b"focusing", b"folks", b"frank", b"functional", b"funding", b"generated", b"genius", b"healthcare", b"horrible", b"increases", b"instantly", b"institute", b"jeopardy", b"ladies", b"lens", b"loves", b"metal", b"meter", b"mixed", b"motor", b"mt", b"nanopatch", b"neural", b"oyster", b"parties", b"penis", b"permission", b"physician", b"pink", b"plays", b"poet", b"recognition", b"refugees", b"relate", b"reminded", b"renewable", b"reveal", b"roots", b"sales", b"season", b"seeking", b"singing", b"stanford", b"static", b"struggling", b"stupid", b"subway", b"suggesting", b"sustainability", b"swim", b"ta", b"tape", b"tech", b"tesla", b"theme", b"transfer", b"translate", b"trends", b"underwater", b"universities", b"voters", b"walks", b"warm", b"youth", b"##acy", b"##are", b"##ates", b"##ave", b"##bing", b"##cus", b"##dle", b"##ered", b"##ility", b"##ively", b"##nate", b"##ntial", b"##ox", b"##pel", b"##rey", b"##ris", b"##tte", b"##uit", b"##work", b"2005", b"23", b"600", b"achieved", b"activist", b"activities", b"agenda", b"airplane", b"anatomy", b"angle", b"arts", b"association", b"bar", b"baseball", b"bats", b"beetles", b"behave", b"believing", b"blow", b"breaking", b"broad", b"bus", b"bush", b"calculating", b"carrying", b"cash", b"chaos", b"chips", b"climb", b"co", b"com", b"compete", b"concern", b"coral", b"corruption", b"cry", b"cute", b"defense", b"destroy", b"divided", b"doodling", b"drink", b"emotion", b"empowering", b"epidemics", b"errors", b"expressions", b"extract", b"fallen", b"farmer", b"fiber", b"flat", b"flip", b"forgotten", b"former", b"framework", b"generous", b"genomes", b"graduated", b"grid", b"guide", b"harbor", b"helpful", b"immediate", b"immigration", b"independent", b"influenced", b"invested", b"journalists", b"license", b"likes", b"lord", b"marry", b"mess", b"minister", b"mirror", b"monitor", b"multiverse", b"obesity", b"objective", b"origin", b"outcome", b"pace", b"parking", b"paul", b"pause", b"personalized", b"picking", b"pigs", b"pool", b"prototype", b"quit", b"rat", b"rebuild", b"recognized", b"recovery", b"reduced", b"represented", b"richer", b"schizophrenia", b"sciences", b"scores", b"sculpture", b"shit", b"shooting", b"similarly", b"split", b"spoke", b"stability", b"stations", b"surrounded", b"surrounding", b"targets", b"teenage", b"tony", b"tragedy", b"typically", b"ugly", b"ultrasound", b"unexpected", b"vital", b"wider", b"winter", b"wired", b"witness", b"worlds", b"yesterday", b"##ach", b"##ads", b"##amine", b"##bo", b"##ching", b"##cies", b"##eer", b"##ened", b"##fit", b"##hand", b"##ike", b"##iled", b"##itive", b"##lation", b"##mer", b"##mon", b"##olve", b"##otic", b"##race", b"##rian", b"##row", b"##rted", b"##ually", b"##ult", b"##un", b"##vable", b"##via", b"##we", b"2004", b"2015", b"90s", b"abstract", b"acts", b"adoption", b"affects", b"ah", b"analyze", b"announced", b"artwork", b"assembly", b"bathroom", b"beautifully", b"branches", b"breathe", b"brothers", b"buses", b"candidate", b"causing", b"ceo", b"collagen", b"collecting", b"comedy", b"comments", b"component", b"conclude", b"consent", b"convert", b"cooperate", b"description", b"diet", b"discuss", b"distant", b"divide", b"donor", b"doors", b"elected", b"ending", b"entrepreneur", b"established", b"facial", b"filter", b"fusion", b"grain", b"guilty", b"headed", b"hearts", b"herself", b"imaging", b"infected", b"ink", b"instruments", b"integrated", b"intervention", b"israeli", b"japanese", b"knock", b"la", b"lake", b"lets", b"lighting", b"luxury", b"mainly", b"makers", b"mathematician", b"methane", b"michael", b"microbial", b"morality", b"movements", b"naked", b"nasa", b"navigate", b"node", b"northern", b"nose", b"nutrition", b"observation", b"official", b"overcome", b"paradox", b"parent", b"participation", b"passive", b"peak", b"photographs", b"plane", b"prices", b"professionals", b"profit", b"properly", b"punch", b"reached", b"resistance", b"resolution", b"robotics", b"russia", b"sadly", b"safer", b"satellite", b"sensor", b"separation", b"shake", b"shock", b"shop", b"shopping", b"shoulder", b"sisters", b"sits", b"solid", b"speaker", b"stock", b"stream", b"stroke", b"studio", b"surgeons", b"surveillance", b"sweden", b"syria", b"tackle", b"tension", b"tests", b"thick", b"thin", b"threats", b"thrown", b"translation", b"unknown", b"unusual", b"versions", b"via", b"ward", b"wheel", b"##ail", b"##ari", b"##ational", b"##atts", b"##ba", b"##berg", b"##bert", b"##bits", b"##book", b"##cence", b"##dies", b"##eers", b"##ef", b"##ew", b"##form", b"##gen", b"##ica", b"##ification", b"##iling", b"##ils", b"##lan", b"##lap", b"##lic", b"##lities", b"##lock", b"##ncing", b"##nes", b"##off", b"##orn", b"##rises", b"##rp", b"##ti", b"##ucked", b"##ul", b"##ump", b"##unting", b"26", b"65", b"achievement", b"actor", b"advance", b"agencies", b"altogether", b"anxiety", b"approaches", b"archimedes", b"balls", b"band", b"begun", b"besides", b"bias", b"blindness", b"breathing", b"budget", b"cage", b"calculate", b"captured", b"christmas", b"classes", b"clue", b"components", b"concerns", b"contain", b"contains", b"corals", b"depending", b"diarrhea", b"dogs", b"dozens", b"dramatically", b"earn", b"economists", b"educated", b"egyptian", b"eh", b"electrical", b"engagement", b"ethiopia", b"experiencing", b"extension", b"extinct", b"fabulous", b"factories", b"flag", b"footprint", b"foxo", b"glue", b"golden", b"ha", b"hanging", b"homework", b"incentives", b"inequality", b"instrument", b"intact", b"jewish", b"jews", b"kingdom", b"laid", b"landing", b"laugh", b"letting", b"lifespan", b"located", b"lovely", b"mail", b"manipulate", b"measures", b"medication", b"mexican", b"mix", b"mode", b"multi", b"muslim", b"muslims", b"neighbor", b"nelson", b"nerve", b"occurred", b"oldest", b"organism", b"organisms", b"ought", b"outer", b"owned", b"owner", b"panel", b"parkinson", b"parks", b"performing", b"pet", b"physicist", b"pole", b"positions", b"printed", b"produces", b"proof", b"propose", b"pure", b"react", b"receiving", b"reference", b"relative", b"rwanda", b"safely", b"saharan", b"screens", b"sebastian", b"selection", b"sequences", b"shifting", b"shocked", b"shy", b"silicon", b"situations", b"smile", b"smoke", b"spiritual", b"spoken", b"sports", b"stays", b"stranger", b"suggested", b"suit", b"tasks", b"telephone", b"terrorist", b"terrorists", b"texts", b"title", b"toy", b"traditions", b"ubiquitous", b"veterans", b"viral", b"warning", b"websites", b"yield", b"##af", b"##ash", b"##ax", b"##box", b"##cle", b"##gan", b"##gate", b"##gation", b"##head", b"##ill", b"##inate", b"##inated", b"##inating", b"##ior", b"##ips", b"##ired", b"##lize", b"##lls", b"##mi", b"##nse", b"##ole", b"##omy", b"##ony", b"##ope", b"##ore", b"##ota", b"##plied", b"##posed", b"##pression", b"##ral", b"##rap", b"##rates", b"##ridge", b"##right", b"##sions", b"##tal", b"##tric", b"##ua", b"##ught", b"##uity", b"##unted", b"##ura", b"##wing", b"##ying", b"1970s", b"99", b"academic", b"adapt", b"africans", b"arabic", b"article", b"asset", b"assets", b"astronomers", b"award", b"barrier", b"behavioral", b"biases", b"boom", b"bottle", b"buying", b"chair", b"chances", b"chapter", b"chemicals", b"collaboration", b"combined", b"comment", b"compelling", b"condoms", b"confused", b"congress", b"contemporary", b"continued", b"crack", b"darkness", b"december", b"defined", b"designs", b"dig", b"dioxide", b"directions", b"dirty", b"disabilities", b"disappeared", b"dominant", b"drew", b"dust", b"earned", b"edges", b"educate", b"empire", b"entering", b"escape", b"evil", b"evolutionary", b"excuse", b"extent", b"failing", b"fears", b"fingers", b"flew", b"forgot", b"frequency", b"gathering", b"gentlemen", b"gradually", b"grandchildren", b"grasp", b"grass", b"guns", b"happier", b"hardly", b"hello", b"highway", b"horizon", b"hormone", b"ideal", b"imagined", b"inclusive", b"interestingly", b"invention", b"lawyer", b"legacy", b"legislation", b"leverage", b"linear", b"mammals", b"mandela", b"mate", b"mathematicians", b"minority", b"monkey", b"motivated", b"neanderthals", b"ngos", b"obese", b"opens", b"originally", b"owners", b"ownership", b"painted", b"philosophy", b"plot", b"predicted", b"pretend", b"primarily", b"privilege", b"procedure", b"producing", b"programming", b"proportion", b"ptsd", b"pushing", b"quarters", b"quest", b"reaching", b"reactions", b"recording", b"reflection", b"remarkably", b"rent", b"representation", b"resilience", b"revolutions", b"rolling", b"satellites", b"scenario", b"secrets", b"selves", b"senses", b"sensors", b"slowing", b"smarter", b"smoking", b"someday", b"spanish", b"stepping", b"strategies", b"subtle", b"suitcase", b"suppose", b"survived", b"survivors", b"symbols", b"talent", b"tank", b"tax", b"tie", b"ties", b"transformed", b"trauma", b"trustworthy", b"tuberculosis", b"villages", b"volunteers", b"whatsoever", b"writers", b"##30", b"##ap", b"##cks", b"##cular", b"##den", b"##em", b"##hs", b"##ick", b"##ight", b"##ila", b"##ining", b"##kes", b"##ki", b"##late", b"##lies", b"##logist", b"##matic", b"##nating", b"##nded", b"##nding", b"##oked", b"##oration", b"##plication", b"##rous", b"##sia", b"##sist", b"##uated", b"##ub", b"##ubs", b"##uts", b"##ux", b"##vating", b"##vity", b"##zation", b"700", b"70s", b"72", b"80s", b"administration", b"adventure", b"advocate", b"airbnb", b"album", b"alien", b"andrew", b"applying", b"approved", b"archie", b"arctic", b"armed", b"array", b"artistic", b"asks", b"assignment", b"assumptions", b"attitude", b"author", b"boson", b"browser", b"buried", b"challenged", b"charges", b"charles", b"circles", b"clicks", b"coding", b"connects", b"construct", b"continuing", b"conventional", b"costa", b"darwin", b"decrease", b"demands", b"depend", b"depressed", b"detailed", b"diagnosis", b"downtown", b"dutch", b"economist", b"emerges", b"enables", b"eric", b"estimate", b"estimates", b"evolve", b"excellent", b"exhibition", b"expectations", b"facility", b"favorites", b"feature", b"fed", b"football", b"freespeech", b"fruit", b"gabby", b"gallery", b"generosity", b"greek", b"hang", b"hero", b"hormones", b"huh", b"hydrogen", b"innovations", b"insights", b"invent", b"isolated", b"joel", b"lego", b"liberal", b"limit", b"luck", b"lung", b"mary", b"meal", b"miracle", b"muscle", b"musicians", b"nets", b"neutral", b"nights", b"officer", b"operations", b"pasted", b"patience", b"petition", b"philosopher", b"photographer", b"photography", b"plate", b"player", b"pointed", b"pollen", b"pollution", b"poop", b"poster", b"pregnant", b"primate", b"primates", b"priority", b"profits", b"programmable", b"promised", b"proposed", b"protected", b"pushed", b"receive", b"reflect", b"reliable", b"request", b"resist", b"saturn", b"script", b"shouting", b"sing", b"solly", b"somewhat", b"speaks", b"spectacular", b"stack", b"steve", b"storm", b"storytelling", b"surprisingly", b"tall", b"tangible", b"tears", b"texas", b"thirds", b"threatened", b"throwing", b"tip", b"tired", b"transactions", b"transparent", b"trustworthiness", b"uber", b"underlying", b"vessels", b"visualize", b"wedding", b"wires", b"wolves", b"wood", b"##70", b"##ades", b"##ancy", b"##ang", b"##ats", b"##be", b"##che", b"##cho", b"##code", b"##ctive", b"##ection", b"##eries", b"##esis", b"##fuse", b"##han", b"##iac", b"##ilia", b"##ken", b"##key", b"##lated", b"##len", b"##lessly", b"##lessness", b"##mal", b"##mark", b"##mise", b"##mo", b"##mons", b"##more", b"##nated", b"##ncies", b"##olation", b"##ong", b"##ories", b"##pa", b"##pathy", b"##pic", b"##places", b"##ple", b"##pped", b"##rade", b"##ream", b"##rence", b"##ressive", b"##ried", b"##rily", b"##rove", b"##set", b"##sity", b"##tate", b"##ude", b"##une", b"##vasive", b"##vision", b"##water", b"2006", b"32", b"95", b"accent", b"agency", b"agricultural", b"airport", b"ambitious", b"animation", b"anniversary", b"antarctica", b"appeared", b"appropriate", b"architects", b"assistance", b"attacked", b"august", b"bat", b"beer", b"beliefs", b"belong", b"blame", b"bold", b"borrow", b"boston", b"bother", b"branch", b"bricks", b"briefly", b"calculations", b"cares", b"cast", b"cave", b"chest", b"cleaning", b"clinic", b"closing", b"clouds", b"cluster", b"colored", b"competitive", b"congo", b"connecting", b"conservation", b"constraints", b"constructed", b"consume", b"cope", b"corporate", b"corporations", b"counting", b"cow", b"creature", b"crop", b"crossing", b"crushed", b"curriculum", b"cuts", b"daf", b"dependent", b"differ", b"disabled", b"display", b"dose", b"ebola", b"ecological", b"electron", b"embracing", b"emerged", b"empowerment", b"ensure", b"equations", b"estimated", b"exception", b"explains", b"facilities", b"fate", b"fibonacci", b"finance", b"finds", b"flies", b"fluid", b"fraction", b"freely", b"frustrated", b"garage", b"garbage", b"gathered", b"gay", b"gg", b"glacier", b"grades", b"halls", b"hardware", b"healing", b"honestly", b"hurricane", b"impression", b"ingredients", b"insect", b"inspiring", b"installed", b"interior", b"invite", b"islamic", b"jesus", b"kansas", b"kick", b"kilograms", b"labs", b"laughing", b"learns", b"lied", b"lit", b"locked", b"lunch", b"males", b"managing", b"manuscript", b"mapped", b"mapping", b"mechanics", b"mechanisms", b"memorial", b"mk", b"mobility", b"mosquito", b"mountains", b"ms", b"narrow", b"nearby", b"neck", b"nest", b"netherlands", b"nigeria", b"nowadays", b"nurses", b"observations", b"occupy", b"officers", b"openness", b"opinions", b"paradigm", b"paralyzed", b"pen", b"perceptions", b"perspectives", b"pete", b"pilot", b"posters", b"powers", b"preparing", b"productive", b"profile", b"proved", b"pump", b"randomly", b"ray", b"reasoning", b"reduction", b"reflected", b"refused", b"relations", b"remained", b"replaced", b"retired", b"robotic", b"rocks", b"route", b"secular", b"selfish", b"significantly", b"singapore", b"sons", b"sounded", b"sperm", b"spots", b"stages", b"sticks", b"stolen", b"stones", b"stops", b"strongly", b"successfully", b"syringe", b"tattoos", b"tedtalk", b"thirty", b"tragic", b"translator", b"treating", b"understands", b"uniform", b"upper", b"utility", b"valley", b"venice", b"virtue", b"weak", b"whoa", b"widely", b"widespread", b"##ab", b"##ane", b"##arily", b"##berry", b"##cal", b"##ception", b"##cination", b"##city", b"##clusion", b"##ctic", b"##ders", b"##eat", b"##ect", b"##elling", b"##ement", b"##ening", b"##ep", b"##ern", b"##ert", b"##ever", b"##fted", b"##fying", b"##gain", b"##herent", b"##ibly", b"##ied", b"##ited", b"##j", b"##lish", b"##lity", b"##lt", b"##nal", b"##nder", b"##ni", b"##nic", b"##nter", b"##ones", b"##orous", b"##over", b"##ploy", b"##ployed", b"##pose", b"##position", b"##rain", b"##rawl", b"##raying", b"##ress", b"##rial", b"##rm", b"##runk", b"##rus", b"##rush", b"##tend", b"##tended", b"##tling", b"##tory", b"##ttered", b"##tude", b"##ubling", b"##ups", b"##urse", b"##ury", b"##vated", b"##ven", b"##verse", b"##ville", b"##vis", b"##vised", b"##vy", b"120", b"160", b"1998", b"2014", b"28", b"36", b"404", b"accountable", b"accuracy", b"activists", b"adopted", b"alternatives", b"appeal", b"appear", b"arrested", b"atomic", b"attractive", b"audiences", b"australian", b"authorities", b"backwards", b"balloon", b"ban", b"banks", b"barrels", b"barriers", b"bhutan", b"biodiversity", b"bite", b"bla", b"blank", b"blast", b"bob", b"bombs", b"boss", b"breaks", b"brian", b"brooklyn", b"cables", b"calculation", b"categories", b"cats", b"chosen", b"civic", b"clues", b"coca", b"cola", b"consequence", b"controls", b"county", b"couples", b"creator", b"crew", b"dancers", b"dates", b"dating", b"decline", b"dedicated", b"defend", b"deforestation", b"demonstration", b"density", b"difficulty", b"dimension", b"disgust", b"division", b"drops", b"eaten", b"egyptians", b"enabling", b"establish", b"exploitation", b"exponential", b"extend", b"fan", b"fancy", b"february", b"feeding", b"females", b"films", b"firing", b"flash", b"flowing", b"flows", b"foie", b"fortune", b"founded", b"friendly", b"fueled", b"gaining", b"gore", b"gras", b"guinea", b"habitat", b"handed", b"highlight", b"homeless", b"housing", b"hunger", b"impacts", b"improving", b"included", b"independence", b"infections", b"infectious", b"informed", b"injury", b"innovate", b"insecure", b"instruction", b"interventions", b"intimate", b"islam", b"killer", b"kitchen", b"korea", b"lawyers", b"lifted", b"limestone", b"listened", b"mainstream", b"marine", b"marks", b"martin", b"meetings", b"merely", b"millimeter", b"ministry", b"narrator", b"novelty", b"november", b"nutrients", b"obama", b"olds", b"origins", b"owns", b"permanent", b"pervasive", b"plain", b"polar", b"powered", b"presidential", b"prey", b"profession", b"projection", b"projections", b"proper", b"prosthetic", b"psychology", b"rainforests", b"rape", b"rapid", b"recognizing", b"register", b"regularly", b"relation", b"remembered", b"renewables", b"replicate", b"reporting", b"reports", b"revenue", b"ringing", b"rivers", b"rna", b"root", b"rose", b"routine", b"rush", b"sale", b"sensitive", b"shadows", b"shall", b"shine", b"shops", b"signature", b"simulation", b"slides", b"slums", b"soldier", b"soldiers", b"soviet", b"specialized", b"stomach", b"stretch", b"substance", b"suffered", b"sunday", b"supermarket", b"suspect", b"tea", b"terrified", b"terror", b"texting", b"theoretical", b"theories", b"titan", b"traditionally", b"traits", b"transit", b"transmit", b"traumatic", b"tunneling", b"twenty", b"unbelievable", b"unprecedented", b"vertical", b"visiting", b"visitors", b"visualization", b"vocal", b"watson", b"wealthy", b"wikipedia", b"william", b"winner", b"wire", b"wireless", b"workforce", b"worm", b"##12", b"##ack", b"##ag", b"##alin", b"##ason", b"##au", b"##away", b"##back", b"##beration", b"##bic", b"##bra", b"##ci", b"##cope", b"##dication", b"##duced", b"##ents", b"##ere", b"##ex", b"##gh", b"##gram", b"##gy", b"##hi", b"##hing", b"##icides", b"##iction", b"##ifying", b"##ild", b"##ines", b"##iring", b"##irs", b"##ison", b"##ith", b"##izes", b"##jo", b"##las", b"##lating", b"##lets", b"##lie", b"##lm", b"##mit", b"##mmer", b"##mond", b"##nald", b"##nces", b"##oe", b"##olate", b"##ool", b"##op", b"##pad", b"##power", b"##ptic", b"##races", b"##rams", b"##rect", b"##res", b"##rge", b"##ribute", b"##rious", b"##rise", b"##sisted", b"##sty", b"##tail", b"##take", b"##ths", b"##trating", b"##tt", b"##tting", b"##ued", b"##unction", b"##und", b"##unding", b"##urn", b"##usive", b"##uting", b"##vise", b"##wed", b"##za", b"abandoned", b"abilities", b"accomplished", b"accounts", b"advertising", b"alarm", b"alongside", b"als", b"amazed", b"analogy", b"arabia", b"architect", b"articles", b"astonishing", b"atlantic", b"attached", b"augmented", b"babylon", b"barbershop", b"basement", b"basics", b"bears", b"bike", b"biologists", b"blog", b"blowing", b"boring", b"breakthrough", b"breast", b"broader", b"cairo", b"calculus", b"canvas", b"capabilities", b"ceiling", b"chairs", b"charged", b"cheese", b"chef", b"clay", b"comfort", b"communicating", b"communications", b"constitution", b"controversial", b"cooked", b"cooperation", b"cord", b"countless", b"criminal", b"crystal", b"damn", b"deck", b"delight", b"depth", b"desk", b"despair", b"diagnostics", b"diagram", b"diffusion", b"dirt", b"distinction", b"dragline", b"elsewhere", b"emotionally", b"employment", b"engines", b"evaluation", b"evan", b"exploit", b"explosion", b"extended", b"external", b"farms", b"farther", b"fence", b"fits", b"flourish", b"fool", b"forming", b"forum", b"gained", b"givers", b"graffiti", b"gravitational", b"guests", b"historically", b"hitting", b"horror", b"hurts", b"immigrants", b"indigenous", b"insane", b"integration", b"iranian", b"iron", b"jean", b"jh", b"jihad", b"kit", b"korean", b"labeled", b"landed", b"laptop", b"lasted", b"lectures", b"libya", b"lift", b"losses", b"magical", b"magnificent", b"magnitude", b"mario", b"mastery", b"matrix", b"meanwhile", b"medications", b"metronome", b"monkeys", b"motivation", b"murder", b"na", b"obstacles", b"odysseus", b"orders", b"organizing", b"package", b"packed", b"palestine", b"panels", b"panic", b"parliament", b"partnership", b"passenger", b"paste", b"paths", b"peaceful", b"perceive", b"ph", b"phenomena", b"philosophers", b"physicists", b"pockets", b"pointing", b"practically", b"predictions", b"presence", b"previously", b"pride", b"printer", b"proposition", b"proust", b"provides", b"psychological", b"pyramid", b"queen", b"rating", b"realization", b"reef", b"regulation", b"replacing", b"reported", b"restore", b"revealed", b"ridiculous", b"risky", b"robust", b"saves", b"separated", b"shell", b"ships", b"significance", b"silks", b"silver", b"sink", b"skill", b"socially", b"spends", b"sport", b"staggering", b"sticky", b"stores", b"sunlight", b"supported", b"synapses", b"syndrome", b"tbp", b"teenager", b"terrifying", b"theirs", b"thrive", b"tide", b"tiger", b"till", b"toll", b"tomato", b"tongue", b"tour", b"transducer", b"trapped", b"trash", b"trips", b"truths", b"tube", b"tumors", b"twin", b"twins", b"unlock", b"upside", b"utterly", b"variables", b"youngest", b"##aa", b"##aded", b"##aks", b"##ards", b"##art", b"##ature", b"##aze", b"##bin", b"##bration", b"##break", b"##cast", b"##ches", b"##do", b"##door", b"##dus", b"##east", b"##eds", b"##eeks", b"##eering", b"##eral", b"##eration", b"##ession", b"##ette", b"##fied", b"##fire", b"##fish", b"##front", b"##ga", b"##hips", b"##iary", b"##iate", b"##iated", b"##icious", b"##icity", b"##ination", b"##inning", b"##ishes", b"##iss", b"##itions", b"##ius", b"##ival", b"##kers", b"##lay", b"##lia", b"##lines", b"##mes", b"##mission", b"##nap", b"##net", b"##ng", b"##nge", b"##nny", b"##og", b"##olis", b"##onia", b"##pes", b"##pet", b"##pies", b"##port", b"##pressed", b"##quity", b"##ren", b"##rged", b"##rians", b"##rified", b"##rify", b"##rim", b"##rk", b"##rming", b"##thy", b"##uating", b"##ucks", b"##ucky", b"##uctor", b"##ugh", b"##ules", b"##using", b"##vin", b"##zer", b"1945", b"1999", b"38", b"86", b"abortion", b"absence", b"abundance", b"accelerating", b"accidents", b"adam", b"adapted", b"affairs", b"affecting", b"altruism", b"ambulance", b"appears", b"approaching", b"arabs", b"arrow", b"assemble", b"associate", b"assumption", b"astronauts", b"atom", b"attempts", b"authentic", b"authority", b"automated", b"awake", b"bacterial", b"bamboo", b"beam", b"beating", b"behalf", b"bend", b"brands", b"bridges", b"budgets", b"calculated", b"canadian", b"candidates", b"cartoon", b"cathedral", b"charity", b"chile", b"chimpanzee", b"chronic", b"circuit", b"circuits", b"cochrane", b"collaborate", b"combat", b"configuration", b"confined", b"continues", b"controlling", b"correctly", b"covering", b"cube", b"curves", b"customer", b"daisy", b"dan", b"dare", b"dear", b"decent", b"decides", b"deepest", b"default", b"delicious", b"dental", b"describing", b"desperately", b"devastating", b"disagree", b"discoveries", b"discussions", b"dominated", b"dress", b"ed", b"eggs", b"electrons", b"emma", b"empowered", b"enabled", b"encouraging", b"endless", b"engineered", b"enhance", b"exercises", b"exhibit", b"exponentially", b"extroverts", b"fastest", b"feminine", b"fifty", b"file", b"findings", b"fires", b"fishing", b"fitness", b"formula", b"foster", b"geographic", b"germ", b"girlfriend", b"grandparents", b"greece", b"guarantee", b"handful", b"happily", b"harmony", b"healthier", b"heforshe", b"hemisphere", b"henry", b"herd", b"heritage", b"hierarchy", b"hire", b"histories", b"hometown", b"honey", b"hopes", b"hotels", b"household", b"hp", b"identified", b"implement", b"improved", b"inch", b"incident", b"informal", b"injured", b"investigate", b"jane", b"jaw", b"jennifer", b"jerusalem", b"judgment", b"july", b"jumped", b"jungle", b"kicked", b"kidding", b"knees", b"lifelong", b"limitations", b"locations", b"loyalty", b"luckily", b"lymph", b"malarious", b"mall", b"marked", b"masters", b"matches", b"mere", b"mic", b"microphone", b"migration", b"minus", b"missouri", b"momentum", b"mouths", b"muhammad", b"mutations", b"myth", b"neuroscience", b"nike", b"norway", b"observed", b"obsolete", b"october", b"offers", b"offs", b"onstage", b"optimism", b"oxford", b"pack", b"palestinian", b"passing", b"passions", b"pasting", b"payment", b"peoples", b"pharmaceutical", b"philadelphia", b"phrases", b"pity", b"planted", b"politician", b"poo", b"port", b"portraits", b"precise", b"prefer", b"prince", b"prior", b"programmer", b"programmers", b"propaganda", b"protecting", b"proven", b"pulling", b"pursue", b"rainforest", b"rational", b"rb", b"reader", b"reasonable", b"receptor", b"redemption", b"regardless", b"regime", b"rescue", b"responded", b"revenues", b"riding", b"russian", b"sacred", b"sandy", b"sarah", b"saturday", b"scales", b"scholars", b"seats", b"seemingly", b"sexting", b"sexually", b"sexy", b"shelf", b"shirt", b"shoot", b"shorter", b"shortly", b"silly", b"simultaneously", b"skeletons", b"smells", b"smith", b"spain", b"spheres", b"stable", b"staring", b"stealing", b"straightforward", b"strip", b"structural", b"sugar", b"supports", b"sustain", b"synthetic", b"tables", b"tale", b"tendency", b"tends", b"terribly", b"textbooks", b"timing", b"toilets", b"ton", b"transforming", b"transplant", b"trap", b"tuition", b"tunisia", b"tunnel", b"twist", b"uganda", b"upset", b"urge", b"ushahidi", b"variable", b"variations", b"velocity", b"vibrating", b"virtually", b"vivid", b"waited", b"weigh", b"whale", b"winning", b"woods", b"##20", b"##ague", b"##ams", b"##aring", b"##arity", b"##asis", b"##boards", b"##cation", b"##cting", b"##ctions", b"##dam", b"##dled", b"##ections", b"##eologists", b"##erate", b"##even", b"##fies", b"##fold", b"##fuge", b"##fulness", b"##gar", b"##ginable", b"##hade", b"##ham", b"##have", b"##he", b"##ho", b"##hop", b"##hore", b"##ially", b"##iance", b"##idal", b"##igating", b"##ima", b"##inable", b"##inary", b"##inted", b"##isms", b"##itch", b"##ivating", b"##iving", b"##laws", b"##lers", b"##li", b"##licate", b"##list", b"##mic", b"##min", b"##mm", b"##nds", b"##nied", b"##night", b"##non", b"##nster", b"##ntities", b"##ntity", b"##oids", b"##ois", b"##ok", b"##ologists", b"##oms", b"##ord", b"##oring", b"##ort", b"##pack", b"##piece", b"##play", b"##plify", b"##ptive", b"##rch", b"##rds", b"##reed", b"##ribed", b"##rifying", b"##rill", b"##robable", b"##roid", b"##ruce", b"##scopic", b"##sign", b"##sters", b"##table", b"##tee", b"##tel", b"##tly", b"##tone", b"##tra", b"##uba", b"##uct", b"##uilt", b"##uins", b"##ull", b"##unce", b"##uns", b"##urge", b"##urs", b"##ursor", b"##usion", b"##ust", b"##uted", b"##vel", b"##view", b"##ware", b"##wood", b"1984", b"2003", b"2016", b"54", b"absorb", b"absurd", b"accelerate", b"actively", b"advances", b"advent", b"afterwards", b"agent", b"agents", b"aggregate", b"alcohol", b"amazingly", b"apartments", b"applies", b"approximately", b"april", b"arranged", b"ashamed", b"assignments", b"attending", b"authenticity", b"automobile", b"autos", b"awkward", b"basketball", b"beloved", b"belt", b"bicycles", b"boundaries", b"boyfriend", b"bruno", b"brush", b"bubbles", b"bump", b"burn", b"burned", b"burst", b"calories", b"capability", b"carnegie", b"catastrophic", b"celebrity", b"channels", b"charter", b"chemist", b"chimpanzees", b"chocolate", b"citizenship", b"cleaner", b"columns", b"communal", b"comparison", b"composition", b"conor", b"consistent", b"consists", b"consuming", b"contribution", b"convincing", b"copying", b"cosmic", b"cosmos", b"counts", b"cracks", b"critically", b"crossed", b"daniel", b"deaf", b"demographic", b"demonstrate", b"demonstrated", b"determined", b"determines", b"devil", b"dh", b"diameter", b"dictionary", b"diego", b"diplomacy", b"discipline", b"disconnect", b"disconnected", b"discrimination", b"distinct", b"district", b"disturbing", b"domain", b"doodle", b"dopamine", b"download", b"drawings", b"earliest", b"emerge", b"encoder", b"encountered", b"encouraged", b"enormously", b"episode", b"faint", b"faiza", b"fans", b"fathers", b"federal", b"feminist", b"finger", b"foods", b"footage", b"friction", b"frightening", b"fuzzy", b"gallon", b"gear", b"generic", b"glad", b"gods", b"gon", b"goodness", b"grammar", b"grant", b"graphic", b"guitar", b"habitats", b"hamburger", b"hannah", b"hat", b"headlines", b"hebrew", b"heroic", b"hired", b"historic", b"hollywood", b"holy", b"honesty", b"honeybees", b"ill", b"illnesses", b"imagining", b"immense", b"immigrant", b"implemented", b"impressive", b"improvements", b"includes", b"incomes", b"index", b"indonesia", b"initially", b"intangible", b"intense", b"intriguing", b"intuition", b"ironic", b"italy", b"jam", b"johnny", b"joshua", b"journalism", b"jupiter", b"kidney", b"kilos", b"knife", b"label", b"laboratories", b"lander", b"lane", b"lanes", b"liberia", b"liberty", b"links", b"littlebits", b"lowest", b"lungs", b"magnetic", b"maker", b"manuscripts", b"microsoft", b"min", b"mining", b"mohammed", b"mood", b"motors", b"mum", b"museums", b"naive", b"neanderthal", b"negotiate", b"newly", b"nl", b"nurse", b"oecd", b"offering", b"ongoing", b"opposed", b"orbit", b"overnight", b"pacific", b"parasite", b"participants", b"passes", b"pathway", b"pathways", b"payments", b"peculiar", b"periods", b"persuade", b"philosophical", b"pigeon", b"placing", b"planting", b"plug", b"pollination", b"popcorn", b"portion", b"positioning", b"pound", b"practicing", b"precious", b"predictable", b"pregnancy", b"presentation", b"prevented", b"principal", b"prisons", b"privileged", b"probability", b"promote", b"protests", b"psychiatric", b"psychologist", b"pursuit", b"qualities", b"ratings", b"ratio", b"realities", b"recall", b"recommend", b"reconciliation", b"refuse", b"reinvent", b"relatives", b"relief", b"reporter", b"reporters", b"retire", b"rezero", b"romantic", b"rubble", b"salary", b"sarajevo", b"secretary", b"seduction", b"sends", b"shaking", b"shut", b"sighted", b"singers", b"skull", b"skype", b"slavery", b"slice", b"smooth", b"socioeconomic", b"somalia", b"sooner", b"sp", b"spacecraft", b"squares", b"st", b"statistical", b"steal", b"steam", b"stimulation", b"stopping", b"storycorps", b"striking", b"strings", b"succeeded", b"sufficient", b"supreme", b"switched", b"sydney", b"tablet", b"tag", b"teams", b"teen", b"temple", b"tenth", b"territory", b"textbook", b"threshold", b"threw", b"thrilled", b"thumb", b"thus", b"tim", b"tone", b"trains", b"translated", b"tries", b"trigger", b"tsunami", b"tweet", b"tweets", b"ultra", b"underground", b"unhappy", b"unintended", b"urbanization", b"urgent", b"useless", b"victorian", b"vietnam", b"virgin", b"vivian", b"weighs", b"whales", b"wheels", b"whoever", b"width", b"wings", b"wins", b"wise", b"witnessed", b"wives", b"worms", b"wrist", b"yard", b"##60", b"##79", b"##ac", b"##ading", b"##ains", b"##aked", b"##akers", b"##amination", b"##arl", b"##asant", b"##bc", b"##berate", b"##brate", b"##care", b"##cha", b"##cial", b"##cid", b"##cipe", b"##cities", b"##cles", b"##clusive", b"##dance", b"##date", b"##dden", b"##dder", b"##dor", b"##duce", b"##ductive", b"##ec", b"##elor", b"##ends", b"##eology", b"##eric", b"##fusion", b"##gers", b"##graded", b"##gue", b"##hew", b"##hy", b"##iation", b"##icial", b"##igorous", b"##imate", b"##imeter", b"##istrust", b"##istry", b"##itimate", b"##iting", b"##itors", b"##ker", b"##legic", b"##lete", b"##listic", b"##lix", b"##lon", b"##los", b"##lum", b"##makers", b"##marks", b"##mates", b"##mented", b"##metric", b"##metry", b"##minate", b"##mine", b"##nial", b"##obe", b"##ods", b"##oke", b"##ola", b"##omical", b"##onal", b"##oning", b"##ots", b"##oyal", b"##pass", b"##ph", b"##pia", b"##print", b"##ption", b"##pus", b"##q", b"##rass", b"##rassed", b"##ribe", b"##rip", b"##rist", b"##rit", b"##rize", b"##rosion", b"##round", b"##scent", b"##sta", b"##state", b"##sy", b"##test", b"##tification", b"##tles", b"##uard", b"##uded", b"##uel", b"##ues", b"##ui", b"##uistic", b"##ulous", b"##urable", b"##uration", b"##urfing", b"##urity", b"##uum", b"##yan", b"##yl", b"##zar", b"18th", b"1960s", b"1980s", b"1989", b"1990", b"55", b"85", b"abundant", b"accenture", b"acceptable", b"accountability", b"accounting", b"acids", b"activism", b"actors", b"adopt", b"ag", b"aged", b"alex", b"alexander", b"allies", b"amen", b"analog", b"angles", b"antibiotic", b"antibiotics", b"arguments", b"arrive", b"asian", b"asphalt", b"assigned", b"ate", b"atlanta", b"attacker", b"attracted", b"aunt", b"backed", b"bacterium", b"bake", b"barrel", b"bars", b"batteries", b"bay", b"bc", b"beaten", b"believes", b"belongs", b"beneath", b"bites", b"blown", b"boil", b"boltzmann", b"bound", b"brave", b"brief", b"bronx", b"bug", b"builds", b"campaigns", b"campus", b"cared", b"carol", b"catholic", b"celebrated", b"cerebral", b"certificate", b"chains", b"characteristics", b"chasing", b"checked", b"circuitry", b"claimed", b"classmates", b"cleaned", b"clients", b"closest", b"cloth", b"coined", b"collaborator", b"commit", b"compassionate", b"conditioning", b"connectomes", b"conservative", b"conspiracy", b"contracts", b"convenience", b"counted", b"cousins", b"criteria", b"cruel", b"cues", b"curator", b"dangerously", b"daughters", b"deer", b"denial", b"deserts", b"desperate", b"destination", b"determination", b"devastated", b"difficulties", b"digging", b"directed", b"displaced", b"dive", b"documented", b"documenting", b"donna", b"dropping", b"ease", b"educators", b"egg", b"el", b"emails", b"encoded", b"encounter", b"enforcement", b"entered", b"enters", b"enzymes", b"equilibrium", b"ethical", b"excitement", b"exhausted", b"exile", b"exit", b"expecting", b"expressed", b"extinction", b"famously", b"favor", b"fixing", b"fluctuations", b"fluent", b"folded", b"foundations", b"fragile", b"friendship", b"fulfill", b"functioning", b"funds", b"gandhi", b"gaps", b"genocide", b"glaciers", b"governance", b"gowanus", b"guards", b"guilt", b"hackers", b"halfway", b"harry", b"harvest", b"heaven", b"highways", b"hip", b"holmes", b"honduras", b"hop", b"hunter", b"iceberg", b"idiot", b"illustrate", b"imagery", b"indicate", b"inherently", b"initiative", b"injuries", b"innovators", b"inorganic", b"install", b"instinct", b"institutional", b"intention", b"intentions", b"interpret", b"interviews", b"intrigued", b"inventing", b"inventor", b"investigation", b"investing", b"invisibility", b"irrigation", b"journal", b"jr", b"keys", b"lamb", b"laser", b"lee", b"lhc", b"limbs", b"livestock", b"loans", b"logical", b"lolcats", b"looting", b"louder", b"manner", b"marduk", b"maria", b"marriages", b"marrow", b"meerkats", b"membrane", b"messy", b"metric", b"millennia", b"mimic", b"mirrors", b"mm", b"mo", b"monitoring", b"moreover", b"newspapers", b"noises", b"nonetheless", b"obtained", b"officials", b"operated", b"orb", b"orbiting", b"originals", b"outward", b"pan", b"parallel", b"participating", b"partly", b"pee", b"phenomenal", b"phoenix", b"plasma", b"players", b"polarization", b"politically", b"prayer", b"prediction", b"princess", b"procedures", b"professors", b"programmed", b"promising", b"prosthetics", b"protest", b"psychotic", b"punishment", b"purely", b"purple", b"radically", b"reaches", b"reactor", b"realistic", b"rebuilding", b"recession", b"recovered", b"reduces", b"refer", b"refugee", b"reinventing", b"reject", b"reminds", b"repeated", b"researcher", b"reserve", b"resilient", b"responds", b"reuse", b"revolutionary", b"rises", b"roger", b"rough", b"rubber", b"sake", b"salt", b"scaling", b"scholar", b"scream", b"screening", b"select", b"selected", b"senior", b"server", b"session", b"settled", b"settlement", b"sheets", b"shifted", b"shifts", b"shining", b"shower", b"sierra", b"simpler", b"sized", b"skeptical", b"slower", b"smartphone", b"sms", b"snow", b"spark", b"specimens", b"squared", b"stake", b"stigma", b"stockholm", b"storage", b"stressed", b"stretched", b"struggled", b"stuxnet", b"supporting", b"surgeries", b"surgical", b"surplus", b"surveys", b"surviving", b"systematically", b"tagged", b"talented", b"tanks", b"tehran", b"temporary", b"tenderness", b"threatening", b"thriving", b"toes", b"tolerance", b"touches", b"touching", b"towns", b"toxic", b"trafficking", b"transferred", b"tribal", b"tribe", b"trillions", b"trivial", b"typing", b"un", b"uploaded", b"ups", b"vacation", b"verbal", b"viewed", b"viewing", b"volunteer", b"voyager", b"wasted", b"wine", b"wing", b"wound", b"wrapped", b"yale", b"yr", b"zimbabwe", b"##13", b"##40", b"##85", b"##aid", b"##ainable", b"##alized", b"##ament", b"##ao", b"##arse", b"##ases", b"##ashed", b"##atch", b"##att", b"##avor", b"##aw", b"##azz", b"##bbie", b"##bian", b"##bled", b"##bm", b"##born", b"##bow", b"##cidence", b"##cinations", b"##cist", b"##claim", b"##coming", b"##con", b"##crew", b"##cs", b"##ctuation", b"##cture", b"##dar", b"##dest", b"##dic", b"##diction", b"##die", b"##dified", b"##doors", b"##down", b"##ees", b"##efficient", b"##eg", b"##elves", b"##ensation", b"##ensible", b"##eological", b"##eous", b"##eping", b"##ese", b"##estimate", b"##eze", b"##fa", b"##faction", b"##fall", b"##fes", b"##ffle", b"##fic", b"##field", b"##force", b"##gic", b"##give", b"##giving", b"##gle", b"##graphy", b"##ground", b"##harks", b"##heads", b"##heat", b"##here", b"##hold", b"##hou", b"##ials", b"##iaries", b"##iating", b"##ich", b"##icted", b"##ides", b"##ilation", b"##ilized", b"##imal", b"##iness", b"##ingly", b"##ini", b"##iny", b"##iously", b"##iped", b"##ipes", b"##isa", b"##istress", b"##itative", b"##itched", b"##iture", b"##ivable", b"##ivate", b"##ivor", b"##ix", b"##lain", b"##lash", b"##lave", b"##lee", b"##lizing", b"##lly", b"##mart", b"##mas", b"##mate", b"##meter", b"##mill", b"##moving", b"##nging", b"##nity", b"##nna", b"##not", b"##ntly", b"##oc", b"##ographic", b"##ologically", b"##olving", b"##omic", b"##ood", b"##ops", b"##orate", b"##oric", b"##oss", b"##ought", b"##ows", b"##passed", b"##phone", b"##plete", b"##plicitly", b"##plicity", b"##pling", b"##ploded", b"##ply", b"##posal", b"##production", b"##quez", b"##rading", b"##rals", b"##rands", b"##ras", b"##rations", b"##read", b"##rection", b"##requent", b"##rest", b"##ret", b"##rick", b"##ride", b"##riving", b"##rture", b"##rude", b"##ship", b"##sional", b"##stitute", b"##structive", b"##style", b"##sume", b"##suming", b"##tach", b"##tar", b"##tarian", b"##tch", b"##tent", b"##think", b"##tical", b"##tie", b"##uana", b"##ubble", b"##ucted", b"##uding", b"##uiz", b"##ulsion", b"##ulus", b"##umble", b"##umbling", b"##umen", b"##ums", b"##unes", b"##unts", b"##uo", b"##urbs", b"##usable", b"##vans", b"##vator", b"##venient", b"##version", b"##vine", b"##vio", b"##vitation", b"##wind", b"##ws", b"##xed", b"##xious", b"##ym", b"##zone", b"##zzing", b"1900", b"1930s", b"1950", b"1980", b"1991", b"1995", b"2002", b"48", b"60s", b"abused", b"accurately", b"acknowledge", b"ad", b"advantages", b"adventures", b"afghan", b"aiming", b"ak", b"alter", b"ambition", b"amino", b"angels", b"animated", b"annoying", b"answering", b"appreciation", b"argentina", b"assembled", b"assembling", b"associations", b"asylum", b"attract", b"attributes", b"automatic", b"backyard", b"bali", b"battles", b"bedroom", b"ben", b"bimetal", b"biologist", b"bizarre", b"blew", b"blocking", b"boiling", b"bonds", b"bonica", b"boundary", b"brick", b"bucks", b"butler", b"buttons", b"calculator", b"camps", b"captain", b"chad", b"checks", b"chernobyl", b"choosing", b"christian", b"cinema", b"clothing", b"clusters", b"collaborators", b"collapsed", b"collision", b"colorful", b"coma", b"comparing", b"competing", b"complained", b"conductive", b"confession", b"confronted", b"consciously", b"container", b"continents", b"continuous", b"copied", b"corrupt", b"couch", b"coworkers", b"cream", b"creatively", b"crimes", b"cultivate", b"da", b"dances", b"database", b"daycare", b"defending", b"delighted", b"describes", b"deserves", b"destroying", b"diabetes", b"diagnostic", b"dinosaur", b"disrupt", b"distances", b"domingos", b"draws", b"dude", b"dvd", b"dynamics", b"eager", b"editing", b"editor", b"editors", b"elementary", b"embraced", b"emergence", b"empower", b"enemies", b"enjoying", b"entrepreneurial", b"ethic", b"evaluating", b"evolving", b"ex", b"exhausting", b"expanded", b"experimental", b"expressing", b"facilitate", b"fails", b"fantasy", b"fashioned", b"filming", b"filmmaker", b"fingerprint", b"fingertips", b"firm", b"fled", b"flexibility", b"florence", b"fluorescence", b"fog", b"fooled", b"format", b"formation", b"fracking", b"frames", b"fraud", b"fright", b"ft", b"futureless", b"gasoline", b"genetically", b"geometry", b"gigantic", b"glimpse", b"golf", b"grading", b"grains", b"grandma", b"graphics", b"gratitude", b"greenhouse", b"gross", b"guided", b"gulf", b"hacking", b"hal", b"hardest", b"harmful", b"heavens", b"heel", b"hence", b"hill", b"hoped", b"hug", b"hybrid", b"ian", b"icons", b"identities", b"imperative", b"implanted", b"implication", b"impose", b"impressed", b"inadequate", b"inches", b"indoors", b"infect", b"inform", b"integrity", b"intensive", b"intricate", b"intrinsic", b"introverts", b"irony", b"italian", b"jb", b"jealous", b"jeff", b"jet", b"joining", b"josh", b"kepler", b"kills", b"kiss", b"koran", b"lastly", b"laughed", b"le", b"leaning", b"lethal", b"limitation", b"lion", b"liver", b"loan", b"lobby", b"logistics", b"longing", b"magnets", b"mankind", b"manual", b"margins", b"maslow", b"maximum", b"measurement", b"measurements", b"meets", b"messaging", b"metabolic", b"milky", b"millennium", b"minded", b"miners", b"modest", b"mud", b"multiply", b"muscles", b"mutation", b"mysterious", b"nanotechnology", b"newton", b"ngo", b"nickname", b"nightmare", b"noisy", b"noticing", b"numeracy", b"nutritious", b"occurring", b"olympics", b"opec", b"opera", b"operator", b"org", b"origami", b"overseas", b"packaged", b"palm", b"pants", b"paralysis", b"parenting", b"passively", b"perceived", b"phd", b"physicians", b"pie", b"pill", b"pipe", b"piracy", b"pizza", b"planes", b"platforms", b"polls", b"pops", b"portrait", b"portuguese", b"posed", b"poses", b"pot", b"potter", b"pour", b"pouring", b"prioritize", b"problematic", b"processed", b"profoundly", b"prone", b"protocol", b"publish", b"publishing", b"pulse", b"punched", b"raises", b"reacting", b"realizing", b"records", b"recruit", b"rectangle", b"reducing", b"reefs", b"regions", b"regulations", b"reinforced", b"relentless", b"remaining", b"renaissance", b"replacement", b"requirement", b"requirements", b"researching", b"resistant", b"responsibilities", b"restaurants", b"restricted", b"revenge", b"rides", b"robert", b"rockets", b"roles", b"rolled", b"roofs", b"rooted", b"rosetta", b"scares", b"scotland", b"seas", b"sections", b"sectors", b"sensing", b"sensory", b"serving", b"shares", b"shocking", b"shoe", b"shots", b"shout", b"shrink", b"sidewalk", b"slope", b"smallest", b"soap", b"spare", b"specialist", b"speeding", b"spill", b"spiral", b"spoofer", b"spreads", b"stakes", b"stephen", b"stepped", b"strike", b"strive", b"suite", b"supplies", b"sustained", b"symbolic", b"tanzania", b"targeted", b"taskrabbit", b"taxi", b"temples", b"temporal", b"terrific", b"theft", b"thermal", b"thermo", b"throat", b"tick", b"tight", b"tk", b"toast", b"topics", b"touched", b"trailer", b"transfers", b"transformative", b"transmission", b"transported", b"tricky", b"troll", b"tropical", b"ubuntu", b"uk", b"ukraine", b"undergraduate", b"unpredictable", b"vantage", b"vary", b"veteran", b"vice", b"virginia", b"virtues", b"visited", b"vocabulary", b"volumes", b"volunteered", b"voted", b"vr", b"wash", b"weaving", b"wheat", b"woke", b"wooden", b"zealand", b"##!", b"###", b"##$", b"##%", b"##&", b"##'", b"##(", b"##)", b"##+", b"##,", b"##-", b"##.", b"##/", b"##:", b"##;", b"##=", b"##?", b"##@", b"##[", b"##]", b"##^", b"##_", b"##`",
]

## continue

In [201]:
_START_TOKEN = _VOCAB.index(b"[CLS]")
_END_TOKEN = _VOCAB.index(b"[SEP]")
_MASK_TOKEN = _VOCAB.index(b"[MASK]")
_RANDOM_TOKEN = _VOCAB.index(b"[RANDOM]")
_UNK_TOKEN = _VOCAB.index(b"[UNK]")
_MAX_SEQ_LEN = 8
_MAX_PREDICTIONS_PER_BATCH = 5

_VOCAB_SIZE = len(_VOCAB)

lookup_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(
      keys=_VOCAB,
      key_dtype=tf.string,
      values=tf.range(
          tf.size(_VOCAB, out_type=tf.int64), dtype=tf.int64),
          value_dtype=tf.int64
        ),
      num_oov_buckets=1,
)

In [272]:
subtokenizer = tf_text.WhitespaceTokenizer()
tokenizer = tf_text.BertTokenizer(lookup_table, token_out_type=tf.string)

In [273]:
subtokens = subtokenizer.tokenize(['everything not saved will be lost.'])
tokens = tokenizer.tokenize(subtokens)

In [277]:
print(tokens.to_list())

[[[[b'everything']], [[b'not']], [[b'saved']], [[b'will']], [[b'be']], [[b'lost'], [b'.']]]]


In [None]:
[[[[b'everything']], [[b'not']], [[b'saved']], [[b'will']], [[b'be']],
  [[b'lost'],
   [b'.']]   ]]

In [263]:
# the padding is done automatically after the tensor 
tensor_tokens = tf.RaggedTensor.to_tensor(tokens)

In [264]:
tensor_tokens.shape

TensorShape([1000, 816, 3, 20])

In [246]:
# Import AutoTokenizer and create tokenizer object
from transformers import AutoTokenizer
checkpoint = 'bert-base-cased'
tokernizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json: 100%|██████████| 49.0/49.0 [00:00<00:00, 307kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 3.46MB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.08MB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.44MB/s]


In [247]:
tokernizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [248]:
tokernizer('hello world')

{'input_ids': [101, 19082, 1362, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

## example

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
import os
import re
import nltk
import requests

In [21]:
import tensorflow as tf
import tensorflow_text as tf_text

In [22]:
_START_TOKEN = _VOCAB.index(b"[CLS]")
_END_TOKEN = _VOCAB.index(b"[SEP]")
_MASK_TOKEN = _VOCAB.index(b"[MASK]")
_RANDOM_TOKEN = _VOCAB.index(b"[RANDOM]")
_UNK_TOKEN = _VOCAB.index(b"[UNK]")
_MAX_SEQ_LEN = 8
_MAX_PREDICTIONS_PER_BATCH = 5

_VOCAB_SIZE = len(_VOCAB)

lookup_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(
      keys=_VOCAB,
      key_dtype=tf.string,
      values=tf.range(
          tf.size(_VOCAB, out_type=tf.int64), dtype=tf.int64),
          value_dtype=tf.int64
        ),
      num_oov_buckets=1,
)

In [26]:
subtokenizer = tf_text.WhitespaceTokenizer()
tokenizer = tf_text.BertTokenizer(lookup_table, token_out_type=tf.string)

subtokens = subtokenizer.tokenize(clean_data[0])
tokens = tokenizer.tokenize(subtokens)

In [28]:
tokens = tokens.merge_dims(-2,-1)
ids = tf.ragged.map_flat_values(lookup_table, tokens)

TypeError: 'StaticVocabularyTable' object is not callable

In [49]:
# preprocessing

data = {
    'data': clean_data[0]
}

dataset = tf.data.Dataset.from_tensor_slices(data)

In [68]:
bert_tokenizer = tf_text.BertTokenizer(lookup_table, token_out_type=tf.int64)
segment_a = bert_tokenizer.tokenize(data["data"])

In [69]:
segment_a = segment_a.merge_dims(-2, -1)

In [72]:
random_selector = tf_text.RandomItemSelector(
    max_selections_per_batch=_MAX_PREDICTIONS_PER_BATCH,
    selection_rate=0.2,
    unselectable_ids=[_START_TOKEN, _END_TOKEN, _UNK_TOKEN]
)
selected = random_selector.get_selection_mask(
    segment_a, axis=1)

In [73]:
mask_values_chooser = tf_text.MaskValuesChooser(_VOCAB_SIZE, _MASK_TOKEN, 0.8)
mask_values_chooser.get_mask_values(segment_a)

<tf.RaggedTensor [[285, 1, 1, 1, 1, 1, 1, 4695, 1, 1],
 [5168, 1, 1, 4174, 1, 6354, 1, 1, 1, 1, 3113, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 84, 1, 1, 1, 1, 4853, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24,
  1370, 1194, 1, 25, 1, 1, 5708]                                            ,
 [1], [448],
 [1, 990, 1, 1, 1108, 1, 1, 1, 2402, 1, 4888, 1, 1, 1, 5024, 3078, 6799, 1,
  1, 1, 1, 1, 1, 1, 1730, 1, 1, 1, 1857, 1, 4436, 1, 1, 1, 507, 2200, 372,
  1666, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 639, 1, 1, 1, 1, 1, 1201, 1]      ,
 [1, 1, 1, 1, 1, 1, 1, 922, 1, 1, 1], [201, 1, 1121, 4821, 1], [1, 1],
 [244, 1, 1, 1, 1, 1, 1, 1, 1, 1], [632, 1, 1, 1],
 [1, 1, 1, 1, 1, 174, 1, 980, 1, 1, 123, 146, 1, 1, 5903, 1028, 1, 1, 1, 1,
  2622, 1]                                                                 ,
 [1, 58, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 174, 1, 1177],
 [1, 1, 2327, 1, 1, 4566, 1, 1, 4126, 1, 1, 1, 1, 1, 1, 1, 1],
 [5124, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 236, 2626, 1, 1, 1, 1,

In [75]:
masked_token_ids, masked_pos, masked_lm_ids = tf_text.mask_language_model(
  segment_a, item_selector=random_selector, mask_values_chooser=mask_values_chooser)

In [90]:
type(masked_lm_ids)

tensorflow.python.framework.ops.EagerTensor

In [92]:
# Prepare and pad combined segment inputs

masked_lm_ids_ragged = tf.RaggedTensor.from_tensor(masked_lm_ids)

input_word_ids, input_mask = tf_text.pad_model_inputs(
  masked_token_ids, max_seq_length=_MAX_SEQ_LEN)

# Prepare and pad masking task inputs
masked_lm_positions, masked_lm_weights = tf_text.pad_model_inputs(
  masked_pos, max_seq_length=_MAX_PREDICTIONS_PER_BATCH)
masked_lm_ids_ragged, _ = tf_text.pad_model_inputs(
  masked_lm_ids_ragged, max_seq_length=_MAX_PREDICTIONS_PER_BATCH)

model_inputs = {
    "input_word_ids": input_word_ids,
    "input_mask": input_mask,
    "masked_lm_ids_ragged": masked_lm_ids_ragged,
    "masked_lm_positions": masked_lm_positions,
    "masked_lm_weights": masked_lm_weights,
}

In [94]:
import functools

dataset = (
    tf.data.Dataset.from_tensors(data)
    .map(functools.partial(bert_pretrain_preprocess, lookup_table))
)

next(iter(dataset))

NameError: name 'bert_pretrain_preprocess' is not defined

## example

In [95]:
def _CreateTable(vocab, num_oov=1):
    init = tf.lookup.KeyValueTensorInitializer(
        vocab,
        tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64),
        key_dtype=tf.string,
        value_dtype=tf.int64)
    return tf.lookup.StaticVocabularyTable(
        init, num_oov, lookup_key_dtype=tf.string)

In [96]:
## loading the data without validation set (maybe in a second example)

# training and testing data
with open('X_train.txt') as train, open('X_test.txt') as test:
    X_train = train.read()
    X_test = test.read()

# training output
y_train = np.load('y_train.npy')

In [97]:
## cleaning the data

stop = nltk.corpus.stopwords.words('english')

# inputing the stopwords in the english vocab using regular expressions for avoiding for loops 
pattern = r'\b(?:' + '\s*|'.join(map(re.escape, stop)) + r')\b' + r'|[^\w\s]\s*'

clean_data = [None]
for _ in range(1):
    clean_data[0] = X_train.lower().split('\n')
    clean_data[0] = list(map(lambda x: re.sub(pattern, '', x), clean_data[0]))

In [102]:
train_x = tf.constant(X_train)
train_y = tf.constant(y_train.flatten())

In [103]:
subtokenizer = tf_text.WhitespaceTokenizer()
tokenizer = tf_text.BertTokenizer(lookup_table, token_out_type=tf.string)

subtokens = subtokenizer.tokenize(clean_data[0])
tokens = tokenizer.tokenize(subtokens)

In [119]:
tokens_list = tokens.merge_dims(-2, -1)
tokens_list = tokens_list.to_list()

In [125]:
from itertools import chain

def flatten_chain(element):
    return list(chain.from_iterable(element))

tokens_flat = flatten_chain(flatten_chain(tokens_list))


In [None]:
def preprocess(data, labels):
    t = tf_text.WhitespaceTokenizer()
    data = t.tokenize(data)
    data = data.merge_dims(-2,-1)
    ids = tf.ragged.map_flat_values(a.lookup, data)
    return (ids, labels)

train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y)).batch(2)
train_dataset = train_dataset.map(preprocess)

In [134]:
train_x = tf.constant(clean_data[0])
train_y = tf.constant(y_train)
tf.data.Dataset.from_tensor_slices((train_x, train_y)).batch(2)

<_BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 1), dtype=tf.int64, name=None))>

In [None]:
def stopWord(text):
    stop = nltk.corpus.stopwords.words('english')

    # inputing the stopwords in the english vocab using regular expressions for avoiding for loops 
    pattern = r'\s*\b(?:' + '|'.join(map(re.escape, stop)) + r')\b' + r'|[^\w\s]'

    clean_data = [None]
    for _ in range(1):
        clean_data[0] = text.lower().split('\n')
        clean_data[0] = list(map(lambda x: re.sub(pattern, '', 
                                                x.replace("'", '"')), clean_data[0]))
        
    return clean_data[0]

In [None]:
text = "This is a sample text with some stop words."
stopWord(text)

[' sample text stop words']