In [None]:
%%capture
# update or install the necessary libraries
!pip install --upgrade openai
!pip install --upgrade langchain
!pip install --upgrade python-dotenv

In [None]:
import openai
import os
import IPython
from langchain.llms import OpenAI
from dotenv import load_dotenv

import json
import pandas as pd
from pprint import pprint

# Pick up OPENAI_API_KEY from a local .env file when one exists; variables that
# are already set in the environment are left untouched.
load_dotenv()

# Fail fast when no key is configured. Falling back to a placeholder string only
# postpones the failure to the first API call, where it shows up as an
# authentication error far away from its cause.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment or put it in a "
        ".env file before running this notebook."
    )

client = openai.OpenAI(api_key=_openai_api_key)

In [None]:
# NOTE(review): `files` is imported but not used in the cells visible here.
from google.colab import files, drive

# Mount Google Drive under /content/drive; the dataset is read from that path below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Netflix titles catalogue from the mounted Drive folder.
netflix_csv_path = "/content/drive/MyDrive/netflix_titles.csv"
df = pd.read_csv(netflix_csv_path)

# Preview the first rows as a quick sanity check.
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,48,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,27.612,8.2
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,18.216,7.8
3,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,17.505,7.8
4,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,95.337,7.7


In [None]:
# Summarize column dtypes and non-null counts to see which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5806 entries, 0 to 5805
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5806 non-null   object 
 1   title                 5805 non-null   object 
 2   type                  5806 non-null   object 
 3   description           5788 non-null   object 
 4   release_year          5806 non-null   int64  
 5   age_certification     3196 non-null   object 
 6   runtime               5806 non-null   int64  
 7   genres                5806 non-null   object 
 8   production_countries  5806 non-null   object 
 9   seasons               2047 non-null   float64
 10  imdb_id               5362 non-null   object 
 11  imdb_score            5283 non-null   float64
 12  imdb_votes            5267 non-null   float64
 13  tmdb_popularity       5712 non-null   float64
 14  tmdb_score            5488 non-null   float64
dtypes: float64(5), int64(

<h2>Data preparation</h2>

In [None]:
# List the column names; the prompt-building function below looks rows up by these names.
df.columns

Index(['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity',
       'tmdb_score'],
      dtype='object')

In [None]:
# Function to prepare example conversations based on the dataset
def prepare_example_conversation(row):
    """Build one chat-format fine-tuning example from a dataset row.

    Args:
        row: Mapping-like record (a DataFrame row or a plain dict) providing the
            keys 'title', 'type', 'description', 'release_year', 'runtime',
            'genres', 'seasons', 'imdb_score' and 'imdb_votes'.

    Returns:
        dict: ``{"messages": [system, user, assistant]}`` where each message is
        a ``{"role": ..., "content": ...}`` dict.

    The user message keeps its raw "Field: value" layout. The assistant target is
    built defensively so that missing values never end up in the text the model
    is trained to produce: a NaN score drops the rating clause, a missing or
    empty genre list drops the genre clause, and genres are written as plain
    words ("crime, drama") instead of a Python list literal ("['crime', 'drama']").
    """
    import ast  # local import: only needed to parse the stringified genre list

    def _is_missing(value):
        # Missing fields show up as float NaN, the only float unequal to itself.
        return value is None or (isinstance(value, float) and value != value)

    def _genre_names(raw):
        # Return the genres as a list of plain strings; [] when nothing usable is present.
        if _is_missing(raw):
            return []
        parsed = raw
        if isinstance(raw, str):
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                # Not a list literal: treat the whole string as a single genre name.
                text = raw.strip()
                return [text] if text else []
        if isinstance(parsed, (list, tuple)):
            return [str(name) for name in parsed if not _is_missing(name) and str(name).strip()]
        return [str(parsed)]

    # System message that explains the role of the model
    system_message = "You are an intelligent movie recommendation assistant. You should use the provided information to make content suggestions based on user queries."

    # Prepare the user message with details about the show or movie
    user_message = f"Title: {row['title']} | Type: {row['type']} | Description: {row['description']} | Release Year: {row['release_year']} | Runtime: {row['runtime']} minutes | Genres: {row['genres']} | Seasons: {row['seasons']} | IMDb Score: {row['imdb_score']} | IMDb Votes: {row['imdb_votes']}"

    # Assistant's response: start from the recommendation itself and append only
    # the clauses that are backed by actual data.
    title = "this title" if _is_missing(row['title']) else row['title']
    completion = f"Based on the information, I recommend watching {title}"

    genres = _genre_names(row['genres'])
    if genres:
        completion += f" if you enjoy {', '.join(genres)} genre"

    if not _is_missing(row['imdb_score']):
        completion += f" with a good rating of {row['imdb_score']}"

    completion += "."

    # Prepare the conversation
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": completion},
    ]

    return {"messages": messages}

# Build one conversation per row of the full dataset, then collect them in a plain list.
conversation_series = df.apply(prepare_example_conversation, axis=1)
training_data = conversation_series.tolist()

# Function to write the examples to a jsonl file
def write_jsonl(data_list, filename):
    """Serialize every item of ``data_list`` as one JSON document per line of ``filename``.

    The file is opened in write mode, so an existing file is overwritten.
    """
    with open(filename, 'w') as jsonl_file:
        jsonl_file.writelines(json.dumps(record) + '\n' for record in data_list)

# Persist every conversation to disk, one JSON object per line.
write_jsonl(training_data, 'training_data.jsonl')

# Spot-check the first five conversations.
preview_count = 5
for sample_index in range(min(preview_count, len(training_data))):
    print(training_data[sample_index])

{'messages': [{'role': 'system', 'content': 'You are an intelligent movie recommendation assistant. You should use the provided information to make content suggestions based on user queries.'}, {'role': 'user', 'content': 'Title: Five Came Back: The Reference Films | Type: SHOW | Description: This collection includes 12 World War II-era propaganda films — many of which are graphic and offensive — discussed in the docuseries "Five Came Back." | Release Year: 1945 | Runtime: 48 minutes | Genres: [\'documentation\'] | Seasons: 1.0 | IMDb Score: nan | IMDb Votes: nan'}, {'role': 'assistant', 'content': "Based on the information, I recommend watching Five Came Back: The Reference Films if you enjoy ['documentation'] genre with a good rating of nan."}]}
{'messages': [{'role': 'system', 'content': 'You are an intelligent movie recommendation assistant. You should use the provided information to make content suggestions based on user queries.'}, {'role': 'user', 'content': "Title: Taxi Driver 

In [None]:
# Fine-tune on a contiguous window of rows rather than on the whole catalogue.
TRAIN_START, TRAIN_STOP = 5577, 5777  # 200 rows for training

training_df = df.iloc[TRAIN_START:TRAIN_STOP]
training_data = training_df.apply(prepare_example_conversation, axis=1).tolist()

# Everything after the training window is held out, so the two sets share no row.
validation_df = df.iloc[TRAIN_STOP:]
validation_data = validation_df.apply(prepare_example_conversation, axis=1).tolist()

# Function to write the data to jsonl files
def write_jsonl(data_list, filename):
    """Write ``data_list`` to ``filename`` in JSON Lines form (one JSON value per line).

    NOTE(review): an identical ``write_jsonl`` is defined in an earlier cell; this
    definition replaces it when the cell runs.
    """
    with open(filename, 'w') as sink:
        for item in data_list:
            # print() appends the newline that terminates each record.
            print(json.dumps(item), file=sink)

# Write the training and validation data to jsonl files
# The names are relative paths; the same variables are reused later when the
# files are opened for upload.
training_file_name = "netflix_titles_finetune_training.jsonl"
validation_file_name = "netflix_titles_finetune_validation.jsonl"

# One JSON conversation per line in each file.
write_jsonl(training_data, training_file_name)
write_jsonl(validation_data, validation_file_name)

In [None]:
# Print the first 5 lines of the training file to check
!head -n 5 netflix_titles_finetune_training.jsonl

{"messages": [{"role": "system", "content": "You are an intelligent movie recommendation assistant. You should use the provided information to make content suggestions based on user queries."}, {"role": "user", "content": "Title: Love Tactics | Type: MOVIE | Description: An ad executive and a fashion designer-blogger don't believe in love, so they place a bet to make the other fall head over heels - with unusual tactics. | Release Year: 2022 | Runtime: 97 minutes | Genres: ['romance'] | Seasons: nan | IMDb Score: 5.1 | IMDb Votes: 3436.0"}, {"role": "assistant", "content": "Based on the information, I recommend watching Love Tactics if you enjoy ['romance'] genre with a good rating of 5.1."}]}
{"messages": [{"role": "system", "content": "You are an intelligent movie recommendation assistant. You should use the provided information to make content suggestions based on user queries."}, {"role": "user", "content": "Title: Roohi | Type: MOVIE | Description: Roohi is set in a fictional town

In [None]:
def _upload_for_fine_tuning(path):
    """Upload the file at ``path`` with purpose "fine-tune" and return the API response."""
    with open(path, "rb") as handle:
        return client.files.create(file=handle, purpose="fine-tune")


# Upload both datasets; the returned ids are what the fine-tuning job refers to.
training_response = _upload_for_fine_tuning(training_file_name)
training_file_id = training_response.id

validation_response = _upload_for_fine_tuning(validation_file_name)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-CFoWm7gtPATgkpg02X2JGbep
Validation file ID: file-ZeNM4TO3BNSUX4V8thDgROT0


## Fine-tuning


In [None]:
# Create the fine-tuning job
try:
    response = client.fine_tuning.jobs.create(
        training_file=training_file_id,
        validation_file=validation_file_id,
        model="gpt-3.5-turbo",
        suffix="recommendation"
    )
except Exception as e:
    print("Error creating fine-tuning job:", str(e))
    # Re-raise: the code that follows needs `job_id`. Swallowing the error would
    # either hit a NameError further down or silently reuse the id of an older
    # job that is still in the kernel's namespace.
    raise
else:
    # Only reached when the job was created successfully.
    job_id = response.id
    print("Job ID:", job_id)
    print("Status:", response.status)

# Function to check the status of the fine-tuning job
def check_job_status(job_id):
    """Fetch a fine-tuning job and print its id, status and trained-token count.

    Args:
        job_id: Identifier of the fine-tuning job to look up.

    The "Trained Tokens" line is printed only when the job reports a value.
    """
    response = client.fine_tuning.jobs.retrieve(job_id)
    print("Job ID:", response.id)
    print("Status:", response.status)
    # The job is accessed through attributes (see `.id` / `.status` above), so a
    # membership test with the bare field name never matched and the token count
    # was never shown. Read the attribute and skip it while it is still unset.
    trained_tokens = getattr(response, "trained_tokens", None)
    if trained_tokens is not None:
        print("Trained Tokens:", trained_tokens)

# Call the function to check job status
check_job_status(job_id)

Job ID: ftjob-umRkiIYpheQiN47F3fDeeQL9
Status: validating_files
Job ID: ftjob-umRkiIYpheQiN47F3fDeeQL9
Status: validating_files


We can track the progress of the fine-tune with the events endpoint. You can rerun the cell below a few times until the fine-tune is ready.

In [None]:
# Function to get events of the fine-tuning job
def get_job_events(job_id):
    """Print the message of each event returned for a fine-tuning job.

    Args:
        job_id: Identifier of the fine-tuning job whose events are listed.

    NOTE(review): only the events returned by this single ``list_events`` call
    are printed; no further pages are requested.
    """
    event_page = client.fine_tuning.jobs.list_events(job_id)
    # Walk the returned list back to front. In the recorded run this prints the
    # events in chronological order (oldest first, completion message last).
    for event in reversed(event_page.data):
        print(event.message)

# Optionally, call the function to get job events
get_job_events(job_id)

Step 585/600: training loss=0.00
Step 586/600: training loss=0.00
Step 587/600: training loss=0.00
Step 588/600: training loss=0.00
Step 589/600: training loss=0.00
Step 590/600: training loss=0.00, validation loss=0.00
Step 591/600: training loss=0.00
Step 592/600: training loss=0.00
Step 593/600: training loss=0.00
Step 594/600: training loss=0.00
Step 595/600: training loss=0.00
Step 596/600: training loss=0.00
Step 597/600: training loss=0.00
Step 598/600: training loss=0.00
Step 599/600: training loss=0.00
Step 600/600: training loss=0.00, validation loss=0.00, full validation loss=0.00
Checkpoint created at step 200 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:recommendation:9M7yOybX:ckpt-step-200
Checkpoint created at step 400 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:recommendation:9M7yOLC0:ckpt-step-400
New fine-tuned model created: ft:gpt-3.5-turbo-0125:personal:recommendation:9M7yOOo6
The job has successfully completed


In [None]:
# Function to retrieve the fine-tuned model ID
def get_fine_tuned_model_id(job_id):
    """Return the ``fine_tuned_model`` value reported by a fine-tuning job.

    Args:
        job_id: Identifier of the fine-tuning job to look up.

    Raises:
        RuntimeError: If the job does not report a fine-tuned model yet.
    """
    model_id = client.fine_tuning.jobs.retrieve(job_id).fine_tuned_model
    if model_id is not None:
        return model_id
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

# Once the job is done, retrieve the fine-tuned model ID
fine_tuned_model_id = get_fine_tuned_model_id(job_id)
print("Fine-tuned model ID:", fine_tuned_model_id)

Fine-tuned model ID: ft:gpt-3.5-turbo-0125:personal:recommendation:9M7yOOo6


## Inference

In [None]:
# Use the first 100 rows as a test pool; they do not overlap the rows sliced
# from position 5577 onward for fine-tuning and validation.
test_df = df.iloc[:100]
# The row at position 4 serves as the single test example.
test_row = test_df.iloc[4]

# System message explaining the role of the model
# (same wording as the system message used when building the training examples)
system_message = "You are an intelligent movie recommendation assistant. You should use the provided information to make content suggestions based on user queries."

# Prepare the user message based on the selected row
def create_user_message(row):
    """Render a catalogue row as the single-line, pipe-separated user prompt.

    Args:
        row: Mapping-like record providing 'title', 'type', 'description',
            'release_year', 'runtime', 'genres', 'seasons', 'imdb_score'
            and 'imdb_votes'.

    Returns:
        str: The fields in "Label: value" form, joined by " | ".
    """
    fields = (
        f"Title: {row['title']}",
        f"Type: {row['type']}",
        f"Description: {row['description']}",
        f"Release Year: {row['release_year']}",
        f"Runtime: {row['runtime']} minutes",
        f"Genres: {row['genres']}",
        f"Seasons: {row['seasons']}",
        f"IMDb Score: {row['imdb_score']}",
        f"IMDb Votes: {row['imdb_votes']}",
    )
    return " | ".join(fields)

# Format the selected test row exactly like the user turns of the training examples.
user_message = create_user_message(test_row)

# Prepare the messages for the chat completion request
# (system + user only; the assistant turn is what the model is asked to produce)
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message}
]

# Print the prepared messages for verification
# NOTE(review): pprint is already imported in the notebook's import cell; this re-import is redundant.
from pprint import pprint
pprint(test_messages)

[{'content': 'You are an intelligent movie recommendation assistant. You '
             'should use the provided information to make content suggestions '
             'based on user queries.',
  'role': 'system'},
 {'content': 'Title: The Exorcist | Type: MOVIE | Description: 12-year-old '
             'Regan MacNeil begins to adapt an explicit new personality as '
             'strange events befall the local area of Georgetown. Her mother '
             'becomes torn between science and superstition in a desperate bid '
             'to save her daughter, and ultimately turns to her last hope: '
             'Father Damien Karras, a troubled priest who is struggling with '
             'his own faith. | Release Year: 1973 | Runtime: 133 minutes | '
             "Genres: ['horror'] | Seasons: nan | IMDb Score: 8.1 | IMDb "
             'Votes: 391942.0',
  'role': 'user'}]


In [None]:
# Make a chat completion request to the fine-tuned model
# NOTE: this rebinds `response`, which earlier held the fine-tuning job object.
response = client.chat.completions.create(
    model=fine_tuned_model_id,  # Use your fine-tuned model ID
    messages=test_messages,
    temperature=0,  # Set to 0 for deterministic, consistent responses
    max_tokens=500  # Adjust based on how verbose you expect the model to be
)

# Print the model's response
# Only the first choice's text is shown.
print(response.choices[0].message.content)

Based on the information, I recommend watching The Exorcist if you enjoy ['horror'] genre with a good rating of 8.1.


<h2>Asking the fine-tuned model for recommendations.</h2>

In [None]:
# Function to ask a question to the fine-tuned model using the updated API
def ask_model(question, fine_tuned_model_id):
    """Send one free-form question to the given model.

    Args:
        question: Text placed in the user message.
        fine_tuned_model_id: Model name passed to the chat completions call.

    Returns:
        The chat completion response on success. If the request raises, the
        error is printed and an empty list is returned instead.
    """
    conversation = [
        {"role": "system", "content": "You are an intelligent movie recommendation assistant."},
        {"role": "user", "content": question},
    ]
    request_options = dict(
        model=fine_tuned_model_id,  # Use your fine-tuned model ID
        messages=conversation,
        max_tokens=150,  # Adjust as necessary
        n=1,
        stop=None,  # Define any stopping criteria if needed
    )

    try:
        return client.chat.completions.create(**request_options)
    except Exception as e:
        print("Error in asking model:", str(e))
        return []

# Example questions
questions = [
    "Suggest me a movie in horror genre with imdb rating of 8 and above.",
    "Suggest me a TV show from the fantasy/science fiction genre and has less than 9 seasons.",
    "Suggest me a movie in comedy and romance genre.",
    "Suggest me a TV show from horror genre that was recently released.",
    "Suggest a movie from action genre",
    "Suggest a movie from drama genre with imdb rating of 7 and above."
]

# Ask each question in turn and print the model's first answer.
for count, question in enumerate(questions):
    print('\n')
    print(f"{count+1}. Question: {question}")
    responses = ask_model(question, fine_tuned_model_id)
    # ask_model returns an empty list when the request failed (it has already
    # printed the error). Skip the answer instead of crashing on `.choices`, so
    # one failed request does not abort the remaining questions.
    choices = getattr(responses, "choices", None)
    if not choices:
        print("No answer returned for this question.")
        continue
    print(choices[0].message.content)




1. Question: Suggest me a movie in horror genre with imdb rating of 8 and above.
I recommend watching "The Conjuring" if you enjoy horror genre with a good rating of 8 and above.


2. Question: Suggest me a TV show from the fantasy/science fiction genre and has less than 9 seasons.
I recommend watching Stranger Things if you enjoy the fantasy/science fiction genre and it has 8 seasons.


3. Question: Suggest me a movie in comedy and romance genre.
I recommend watching Crazy, Stupid, Love.


4. Question: Suggest me a TV show from horror genre that was recently released.
I recommend watching "Midnight Mass" if you enjoy horror genre and it was recently released.


5. Question: Suggest a movie from action genre
I recommend watching Die Hard if you enjoy action genre.


6. Question: Suggest a movie from drama genre with imdb rating of 7 and above.
I recommend watching The Shawshank Redemption, if you enjoy drama genre with imbd rating of 7 and above.
