<a href="https://colab.research.google.com/github/jamalex/alignment-algorithms/blob/master/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print('GPU device not found, using CPU')
else:
    print('Found GPU at: {}'.format(device_name))
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

Found GPU at: /device:GPU:0


In [3]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

from tensorflow.keras.layers import dot, Dense, Input, Lambda
from tensorflow.keras.models import Model

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Import the Universal Sentence Encoder's TF Hub module
usenet_module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed"
usenet_embed = hub.Module(usenet_module_url)

session = tf.Session()
session.run([tf.global_variables_initializer(), tf.tables_initializer()])

# using placeholder and slotting data into it speeds up inference by orders of magnitude:
# SEE: https://github.com/tensorflow/hub/blob/master/docs/common_issues.md#running-inference-on-a-pre-initialized-module
text_input = tf.placeholder(dtype=tf.string, shape=[None])
embedded_text = usenet_embed(text_input)


def universal_sentence_encoder(messages):
    return session.run(embedded_text, feed_dict={text_input: messages})


def preprocess(entries, objectives=[]):
    """Takes a list of curricular entries and objectives, and produces an array of
    vectors to input into the encoding model.
    TODO: rather than a list of strings, these should be rows from the dumped CSV.
    """
    return universal_sentence_encoder(entries)


# The preprocessing output size (TODO: compute with sample data)
PREPROCESSED_SIZE = 512

# Encoding model: take preprocessed inputs and generate embeddings.
item = Input(shape=(PREPROCESSED_SIZE,), dtype="float32", name="item")
encoded = Dense(16, activation="relu")(item)
encoder = Model(inputs=item, outputs=encoded)
# Retrieve the encoder output size
ENCODED_SIZE = encoder.output.shape[1].value

# Relevance model: Encode two entries and compute cosine similarity of embeddings
item_a = Input(shape=(PREPROCESSED_SIZE,), dtype="float32", name="item_a")
item_b = Input(shape=(PREPROCESSED_SIZE,), dtype="float32", name="item_b")
encoded_a = encoder(item_a)
encoded_b = encoder(item_b)
cosine = dot(
    [
        tf.reshape(encoded_a, (1, ENCODED_SIZE)),
        tf.reshape(encoded_b, (1, ENCODED_SIZE)),
    ],
    axes=1,
    normalize=True,
    name="cosine_similarity",
)
relevance = Model(inputs=[item_a, item_b], outputs=[cosine])


def get_relevance(row1, row2):
    input1, input2 = preprocess([row1, row2])
    score = session.run(relevance(inputs=[input1, input2]))
    return float(score)


def train(entries, judgments, objectives=[]):
    x = preprocess(entries, objectives=objectives)
    y = judgments
    encoder.fit(x=x, y=y, batch_size=32, epochs=3, validation_split=0.1, shuffle=True)


titles = [
    "citizen kane",
    "all about eve",
    "singin' in the rain",
    "the maltese falcon",
    "the adventures of robin hood",
    "rear window",
    "seven samurai (shichinin no samurai)",
    "the treasure of the sierra madre",
    "12 angry men (twelve angry men)",
    "the 400 blows (les quatre cents coups)",
    "tokyo story (tôkyô monogatari)",
    "the grapes of wrath",
    "toy story",
    "the terminator",
    "playtime",
    "the third man",
    "metropolis",
    "north by northwest",
    "selma",
    "rosemary's baby",
    "chinatown",
    "aliens",
    "lady bird",
    "the wizard of oz",
    "get out",
    "inside out",
    "moonlight",
    "e.t. the extra-terrestrial",
    "it happened one night",
    "the godfather",
    "a hard day's night",
    "king kong",
    "snow white and the seven dwarfs",
    "sunset boulevard",
    "bicycle thieves (ladri di biciclette) (the bicycle thief)",
    "taxi driver",
    "dr. strangelove or how i learned to stop worrying and love the bomb",
    "lawrence of arabia",
    "on the waterfront",
    "roman holiday",
    "the searchers",
    "let the right one in",
    "the french connection",
    "city lights",
    "the rules of the game (la règle du jeu)",
    "badlands",
    "the manchurian candidate",
    "pather panchali",
    "forbidden planet",
    "the player",
    "evil dead 2: dead by dawn",
    "repo man",
    "say anything...",
    "jaws",
    "black panther",
    "mad max: fury road",
    "casablanca",
    "psycho",
    "alien",
    "the good, the bad and the ugly",
    "the godfather, part ii",
    "monty python and the holy grail",
    "la dolce vita",
    "double indemnity",
    "schindler's list",
    "annie hall",
    "apocalypse now",
    "breathless",
    "airplane!",
    "the princess bride",
    "ghostbusters (1984 original)",
    "goldfinger",
    "some like it hot",
    "crouching tiger, hidden dragon",
    "once",
    "spirited away",
    "spider-man: into the spider-verse",
    "don't look now",
    "vertigo",
    "high noon",
    "the red shoes",
    "the best years of our lives",
    "goodfellas",
    "back to the future",
    "solaris",
    "groundhog day",
    "the silence of the lambs",
    "boyz n the hood",
    "ghost in the shell",
    "star wars: episode v - the empire strikes back",
    "pan's labyrinth",
    "the bridge on the river kwai",
    "raging bull",
    "lost in translation",
    "raiders of the lost ark",
    "l'avventura",
    "this is spinal tap",
    "monsoon wedding",
    "the adventures of priscilla, queen of the desert",
    "the dark knight",
    "carol",
    "casino royale",
    "the apartment",
    "one flew over the cuckoo's nest",
    "west side story",
    "heathers",
    "the diary of a teenage girl",
    "wonder woman",
    "star wars: episode iv - a new hope",
    "it's a wonderful life",
    "iron man",
    "2001: a space odyssey",
    "godzilla (gojira)",
    "the lion king",
    "amadeus",
    "saving private ryan",
    "enter the dragon",
    "eternal sunshine of the spotless mind",
    "rocky",
    "all the president's men",
    "die hard",
    "fargo",
    "terminator 2: judgment day",
    "boogie nights",
    "gone with the wind",
    "duck soup",
    "drive",
    "children of men",
    "pulp fiction",
    "shaun of the dead",
    "the general",
    "do the right thing",
    "y tu mama tambien",
    "the piano",
    "jurassic park",
    "zero dark thirty",
    "slumdog millionaire",
    "the lord of the rings: the fellowship of the ring",
    "dazed and confused",
    "to kill a mockingbird",
    "national lampoon's animal house",
    "blazing saddles",
    "cidade de deus (city of god)",
    "the shawshank redemption",
    "fish tank",
    "crazy rich asians",
    "blade runner",
    "bridesmaids",
    "harry potter and the prisoner of azkaban",
    "cinema paradiso (nuovo cinema paradiso)",
    "a clockwork orange",
    "butch cassidy and the sundance kid",
    "trainspotting",
    "edward scissorhands",
    "in the mood for love",
    "titanic",
    "almost famous",
    "amélie",
    "robocop",
    "the usual suspects",
    "the breakfast club",
    "shrek",
    "the matrix",
    "boys don't cry",
    "clerks",
    "akira",
    "inception",
    "brokeback mountain",
    "easy rider",
    "superbad",
    "the shining",
    "the naked gun: from the files of police squad!",
    "the sixth sense",
    "heat",
    "john wick",
    "the sound of music",
    "the exorcist",
    "wendy and lucy",
    "harold and maude",
    "wayne's world",
    "thelma & louise",
    "the thing",
    "mean girls",
    "mulholland drive",
    "lethal weapon",
    "the big lebowski",
    "gattaca",
    "love and basketball",
    "scarface",
    "seven (se7en)",
    "oldboy",
    "better luck tomorrow",
    "the royal tenenbaums",
    "rumble in the bronx",
    "fight club",
    "clueless",
    "requiem for a dream",
    "fast times at ridgemont high",
    "black hawk down",
    "frida",
]
embeddings = universal_sentence_encoder(titles)
judgments = np.inner(embeddings, embeddings)


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 9.6 GB  | Proc size: 3.9 GB
GPU RAM Free: 481MB | Used: 10960MB | Util  96% | Total 11441MB


In [16]:
%timeit get_relevance("hello world " * 10, "goodbye world " * 10)

ValueError: ignored

This is some text. **Nifty!**

In [17]:
d = titles * 200
%timeit dd = universal_sentence_encoder(d) 

1 loop, best of 3: 11.4 s per loop
