### Music Recommendation
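This is an example notebook that shows how to recommend music using word embeddings. It trains Word2Vec on playlists, treating each playlist as a "sentence" and each song ID as a "word", so that songs which appear in similar playlist contexts end up with similar vectors.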

In [21]:
import pandas as pd
from urllib import request
import numpy as np
from gensim.models import Word2Vec

In [32]:
# Download the playlist training set. The context manager makes sure the HTTP
# response is closed even if reading or decoding fails.
PLAYLISTS_URL = 'https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt'
with request.urlopen(PLAYLISTS_URL) as response:
    playlists_text = response.read().decode("utf-8")

# The first two lines of the file are metadata, not playlists.
lines = playlists_text.split('\n')[2:]

# Each remaining line is one playlist of whitespace-separated tokens (used
# later as song IDs). Playlists with fewer than two tokens carry no
# co-occurrence information, so they are dropped.
playlists = [tokens for tokens in (line.split() for line in lines) if len(tokens) > 1]

In [33]:
# Download the song metadata: one song per line, tab-separated as
# id, title, artist. The context manager closes the HTTP response for us.
SONGS_URL = 'https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt'
with request.urlopen(SONGS_URL) as response:
    song_lines = response.read().decode("utf-8").split('\n')

# Skip blank lines (e.g. the one produced by a trailing newline) so the table
# does not end up with a bogus row whose id is the empty string.
songs = [line.rstrip().split('\t') for line in song_lines if line.strip()]

# Index by the (string) song id so rows can be looked up by label.
songs_df = pd.DataFrame(data=songs, columns=['id', 'title', 'artist']).set_index('id')

In [27]:
# Train song embeddings: every playlist is treated as a sentence and every
# song ID as a word.
model = Word2Vec(
    playlists,
    vector_size=32,  # dimensionality of each song vector
    window=20,       # context size: how far apart two songs in a playlist may be
    negative=50,     # number of negative samples drawn per positive example
    min_count=1,     # keep every song, even one that occurs only once
    workers=4,       # number of training threads
)

In [28]:
def print_recommendations(song_id, topn=5):
    """Return the songs most similar to ``song_id`` in the embedding space.

    Args:
        song_id: ID of the query song. It is converted to ``str`` because the
            Word2Vec vocabulary is keyed by the string tokens of the playlists.
        topn: Number of neighbours to return (defaults to 5).

    Returns:
        A slice of ``songs_df`` (title and artist, indexed by id) with one row
        per recommended song, ordered from most to least similar.

    Raises:
        KeyError: If ``song_id`` is not in the model vocabulary, or if a
            recommended ID is missing from ``songs_df``.
    """
    neighbours = model.wv.most_similar(positive=str(song_id), topn=topn)
    similar_ids = [neighbour_id for neighbour_id, _similarity in neighbours]
    # songs_df is indexed by the string song id, so this must be a label-based
    # lookup (.loc). Positional .iloc rejects string indexers and would only be
    # right by accident when ids happen to equal row positions.
    return songs_df.loc[similar_ids]

In [34]:
# ID of the song for which we want to find similar recommendations.
QUERY_SONG_ID = 2172
print_recommendations(QUERY_SONG_ID)

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2849,Run To The Hills,Iron Maiden
5634,Mr. Brownstone,Guns N' Roses
3167,Unchained,Van Halen
5586,The Last In Line,Dio
2976,I Don't Know,Ozzy Osbourne


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Key-value extraction demo: prompt an instruction-tuned Phi-3 checkpoint to
# return the key-value pairs found in a short piece of text as JSON.
LLM_CHECKPOINT = "EmergentMethods/Phi-3-mini-4k-instruct-graph"

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# crashing while loading the model.
LLM_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

torch.random.manual_seed(0)

# Deliberately NOT named `model`: that global holds the Word2Vec model that
# print_recommendations() reads, and rebinding it here would break the
# recommendation cells if they are re-run after this one.
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_CHECKPOINT,
    device_map=LLM_DEVICE,
    torch_dtype="auto",
    # NOTE: this executes Python code shipped with the checkpoint repository;
    # only enable it for sources you trust.
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)

messages = [
    {
        "role": "system",
        "content": """
A chat between a curious user and an artificial intelligence Assistant. The Assistant specializes in extracting key-value pairs from text and responds in JSON format only.

The User provides text in the format:

-------Text begin-------
<User provided text>
-------Text end-------

The Assistant follows these steps before replying to the User:

1. **Extract key-value pairs**: The Assistant identifies key-value pairs in the text. Each key represents a distinct concept, and its value provides detailed information about that concept.

The Assistant outputs the key-value pairs in the following JSON structure:

{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "key": {"type": "string"},
            "value": {"type": "string"}
        },
        "required": ["key", "value"],
        "additionalProperties": false
    }
}

The Assistant uses the text between -------Text begin------- and -------Text end------- as the sole input for identifying key-value pairs. It outputs only unique pairs and never repeats the same key.

The Assistant responds in JSON format strictly adhering to the schema above.
"""
    },
    {
        "role": "user",
        "content": """
-------Text begin-------
My name is Hassan. I am a software engineer.
-------Text end-------
"""
    }
]


pipe = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer,
)

# Greedy decoding (do_sample=False) keeps the output deterministic. No
# `temperature` is passed: it only affects sampling, so it is ignored under
# greedy decoding and merely triggers a configuration warning.
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,  # return only the completion, not the prompt
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

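# NOTE: the sample below is NOT the output of the key-value prompt above. It is
# a knowledge-graph (nodes/edges) example for a text about OpenAI, apparently
# kept from the checkpoint's graph-extraction example. Re-run the cell to see
# the actual key-value output.
# Example graph-extraction output: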

# {
#     "nodes": [
#         {
#             "id": "OpenAI",
#             "type": "organization",
#             "detailed_type": "ai research organization"
#         },
#         {
#             "id": "GPT family",
#             "type": "technology",
#             "detailed_type": "large language models"
#         },
#         {
#             "id": "DALL-E series",
#             "type": "technology",
#             "detailed_type": "text-to-image models"
#         },
#         {
#             "id": "Sora",
#             "type": "technology",
#             "detailed_type": "text-to-video model"
#         },
#         {
#             "id": "ChatGPT",
#             "type": "technology",
#             "detailed_type": "generative ai"
#         },
#         {
#             "id": "San Francisco",
#             "type": "location",
#             "detailed_type": "city"
#         },
#         {
#             "id": "California",
#             "type": "location",
#             "detailed_type": "state"
#         },
#         {
#             "id": "December 2015",
#             "type": "date",
#             "detailed_type": "foundation date"
#         },
#         {
#             "id": "November 2022",
#             "type": "date",
#             "detailed_type": "release date"
#         }
#     ],
#     "edges": [
#         {
#             "from": "OpenAI",
#             "to": "San Francisco",
#             "label": "headquartered in"
#         },
#         {
#             "from": "San Francisco",
#             "to": "California",
#             "label": "located in"
#         },
#         {
#             "from": "OpenAI",
#             "to": "December 2015",
#             "label": "founded in"
#         },
#         {
#             "from": "OpenAI",
#             "to": "GPT family",
#             "label": "developed"
#         },
#         {
#             "from": "OpenAI",
#             "to": "DALL-E series",
#             "label": "developed"
#         },
#         {
#             "from": "OpenAI",
#             "to": "Sora",
#             "label": "developed"
#         },
#         {
#             "from": "OpenAI",
#             "to": "ChatGPT",
#             "label": "released"
#         },
#         {
#             "from": "ChatGPT",
#             "to": "November 2022",
#             "label": "released in"
#         }
#     ]
# }
