## Tokenizer

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map = "cuda",
    torch_dtype = "auto",
    trust_remote_code = True
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [7]:
prompt = "Write an email apologizing to Sarah for the tragic gardening misap. Explain how it happend.<|assistant|>"

# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

# Generate the text
generation_output = model.generate(
    input_ids = input_ids,
    max_new_tokens=400
)

# Print the output
print(tokenizer.decode(generation_output[0]))

Write an email apologizing to Sarah for the tragic gardening misap. Explain how it happend.<|assistant|> Subject: Sincere Apologies for the Gardening Misadventure


Dear Sarah,


I hope this message finds you well. I am writing to express my deepest apologies for the unfortunate incident that occurred in your garden during our recent visit.


As you know, I have always admired your green thumb and the lush oasis you've cultivated. It was with great enthusiasm that I offered to help with the gardening, eager to learn from your expertise and contribute to the beauty of your space.


Unfortunately, in my eagerness, I misjudged the strength of the new fertilizer I was using. In my haste, I applied it in a manner that was not recommended, leading to an unintended and regrettable outcome. The fertilizer was too potent for the delicate plants, and as a result, several of them suffered from the over-fertilization.


Please know that this was not my intention, and I am truly sorry for any distr

In [8]:
colors_list = [
    '102;194;165', '252;141;98', '141;160;203', 
    '231;138;195', '166;216;84', '255;217;47'
]

def show_tokens(sentence, tokenizer_name):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer(sentence).input_ids
    for idx, t in enumerate(token_ids):
        print(
            f'\x1b[0;30;48;2;{colors_list[idx % len(colors_list)]}m' + 
            tokenizer.decode(t) + 
            '\x1b[0m', 
            end=' '
        )

show_tokens("Write an email apologizing to Sarah for the tragic gardening misap. Explain how it happend.<|assistant|> Subject: Sincere Apologies for the Gardening Misadventure", 
            "microsoft/Phi-3-mini-4k-instruct")

[0;30;48;2;102;194;165mWrite[0m [0;30;48;2;252;141;98man[0m [0;30;48;2;141;160;203memail[0m [0;30;48;2;231;138;195mapolog[0m [0;30;48;2;166;216;84mizing[0m [0;30;48;2;255;217;47mto[0m [0;30;48;2;102;194;165mSarah[0m [0;30;48;2;252;141;98mfor[0m [0;30;48;2;141;160;203mthe[0m [0;30;48;2;231;138;195mtrag[0m [0;30;48;2;166;216;84mic[0m [0;30;48;2;255;217;47mgarden[0m [0;30;48;2;102;194;165ming[0m [0;30;48;2;252;141;98mmis[0m [0;30;48;2;141;160;203map[0m [0;30;48;2;231;138;195m.[0m [0;30;48;2;166;216;84mExp[0m [0;30;48;2;255;217;47mlain[0m [0;30;48;2;102;194;165mhow[0m [0;30;48;2;252;141;98mit[0m [0;30;48;2;141;160;203mhapp[0m [0;30;48;2;231;138;195mend[0m [0;30;48;2;166;216;84m.[0m [0;30;48;2;255;217;47m<|assistant|>[0m [0;30;48;2;102;194;165mSub[0m [0;30;48;2;252;141;98mject[0m [0;30;48;2;141;160;203m:[0m [0;30;48;2;231;138;195mS[0m [0;30;48;2;166;216;84minc[0m [0;30;48;2;255;217;47mere[0m [0;30;48;2;102;194;165mAp[0m [0;30;48

## Token Embedding

In [11]:
from transformers import AutoModel, AutoTokenizer

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# Load a language model
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

# Tokenize the sentence
tokens = tokenizer('Hello World', return_tensors='pt')

# Process the token
output = model(**tokens)[0]

In [12]:
output.shape

torch.Size([1, 4, 384])

In [14]:
for token in tokens['input_ids'][0]:
    print(tokenizer.decode(token))

[CLS]
Hello
 World
[SEP]


In [15]:
output

tensor([[[-3.2520,  0.1818, -0.1254,  ..., -0.0502, -0.2334,  0.8897],
         [-0.4673,  0.1730, -0.0206,  ..., -0.5289,  0.7303,  2.1177],
         [-0.4950,  0.0564,  0.2842,  ...,  1.0543, -0.1747,  1.3793],
         [-2.9601,  0.2129, -0.1138,  ...,  0.1518, -0.2094,  1.0494]]],
       grad_fn=<NativeLayerNormBackward0>)

In [17]:
from sentence_transformers import SentenceTransformer

# Load model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Convert text to text embeddings
vector = model.encode("Best movie ever!")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
vector.shape

(768,)

In [20]:
import gensim.downloader as api

# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50)
# Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data

model = api.load("glove-wiki-gigaword-50")

In [29]:
model.most_similar([model['queen']], topn=11)

[('queen', 1.0000001192092896),
 ('princess', 0.8515165448188782),
 ('lady', 0.8050609230995178),
 ('elizabeth', 0.7873042225837708),
 ('king', 0.7839043140411377),
 ('prince', 0.7821860313415527),
 ('coronation', 0.7692778706550598),
 ('consort', 0.7626097202301025),
 ('royal', 0.7442865371704102),
 ('crown', 0.738264799118042),
 ('victoria', 0.7285772562026978)]

## Song Embedding

In [2]:
import pandas as pd
from urllib import request

# Get the playlist dataset file
data = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt')

# Parse the playlist dataset file. Skip the first 2 lines as they only contain metadata
lines = data.read().decode("utf-8").split('\n')[2:]

# Remove playlists with only one song
playlists = [s.rstrip().split() for s in lines if len(s.split()) > 1]

# Load song metadata
songs_file = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt')
songs_file = songs_file.read().decode("utf-8").split('\n')
songs = [s.rstrip().split('\t') for s in songs_file]
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

In [13]:
print('Playlist #1:\n', playlists[0], '\n')
print('Playlist #2:\n', playlists[1], '\n')

Playlist #1:
 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'] 

Playlist #2:
 ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117', '

In [3]:
from gensim.models import Word2Vec

# Train our Word2Vec model
model = Word2Vec(
    playlists, vector_size=32, window=20, negative=50, min_count=1, workers=4
)

In [4]:
song_id = 2172

# Ask the model for songs similar to song #2172
model.wv.most_similar(positive=str(song_id))

[('2849', 0.9974465370178223),
 ('6626', 0.9961413741111755),
 ('6624', 0.9961063265800476),
 ('3094', 0.9958934187889099),
 ('3167', 0.9956844449043274),
 ('2014', 0.9955931305885315),
 ('2976', 0.9955208897590637),
 ('3126', 0.9947397112846375),
 ('5633', 0.9946460127830505),
 ('1922', 0.994563639163971)]

In [5]:
print(songs_df.iloc[2172])

title     Fade To Black
artist        Metallica
Name: 2172 , dtype: object


In [8]:
import numpy as np

def print_recommendation(song_id):
    similar_songs = np.array(
        model.wv.most_similar(positive=str(song_id), topn=5)
    )[:,0]
    return songs_df.iloc[similar_songs]

# Extract recommendations
print_recommendation(2172)

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2849,Run To The Hills,Iron Maiden
6626,Blackout,Scorpions
6624,Everybody Wants Some!!!,Van Halen
3094,Breaking The Law,Judas Priest
3167,Unchained,Van Halen
