## LLM Enhanced Model Steps
1. Load the Dataset
2. Upload News Data to Vector Store
3. Create an Assistant
4. Get User History
5. Recommend News based on user history

Import Dataset

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks/
file_path = "behaviors_test.tsv"
column_names = ["Impression ID", "User ID", "Time", "History", "Impressions"]

# Parse the behaviors log one line at a time instead of loading every line
# into memory first with readlines().
parsed_data = []
with open(file_path, encoding="utf-8") as f:
    for line in f:
        # Remove only the line terminator. A bare strip() would also remove
        # trailing tabs, i.e. it would silently drop empty trailing fields.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue  # a blank line is not a record; do not emit an all-empty row

        # maxsplit keeps the row at five fields at most: any stray extra tab
        # stays inside the last field, where the whitespace split below
        # handles it, so DataFrame construction cannot fail on a column-count
        # mismatch.
        fields = line.split("\t", len(column_names) - 1)

        # Fill missing fields with empty strings
        fields.extend([""] * (len(column_names) - len(fields)))

        # Split the History and Impressions fields into lists
        # ("".split() already yields an empty list).
        fields[3] = fields[3].split()
        fields[4] = fields[4].split()

        parsed_data.append(fields)

# Convert the parsed data into a DataFrame
behaviors_df = pd.DataFrame(parsed_data, columns=column_names)
print(behaviors_df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks
  Impression ID  User ID                 Time  \
0             5  U239687  2019-11-14 20:03:01   
1            11  U108656  2019-11-14 06:51:18   
2            12  U178651  2019-11-14 14:33:03   
3            14   U95671  2019-11-14 20:28:43   
4            16  U332817  2019-11-14 14:53:15   

                                             History  \
0  [N65250, N122359, N71723, N53796, N41663, N414...   
1  [N4833, N61319, N94639, N50163, N107002, N1120...   
2  [N112192, N82348, N80126, N78767, N7553, N8736...   
3  [N64593, N82779, N33216, N9321, N128643, N6449...   
4  [N32197, N35797, N28326, N127821, N124453, N87...   

                                         Impressions  
0  [N76209-0, N48841-0, N67937-0, N62235-0, N6307...  
1  [N95301-0, N79081-0, N103133-0, N80281-0, N703...  
2  [N76189-0, N103133-0, N20871-0,

In [None]:
from openai import OpenAI
import pandas as pd
import os
import time
# Never embed the API key in the notebook: it ends up in version control and in
# every shared copy. Read it from the environment instead.
# NOTE: the key that was previously hard-coded in this cell must be treated as
# compromised and revoked/rotated in the OpenAI dashboard.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it (or set os.environ['OPENAI_API_KEY']) "
        "before running this cell."
    )
client = OpenAI(api_key=api_key)

In [None]:
def encode_file_to_txt(file_path):
    """Write a tab-separated copy of ``file_path`` with a ``.txt`` extension.

    Parameters:
    - file_path: Path to a header-less, tab-separated file.

    Returns:
    - Path of the ``.txt`` file that was written, next to the input.

    The input is read with ``header=None`` so that its first record is kept
    as data; every field is read as a string with no NA conversion, so the
    values are written back unchanged.
    """
    # Swap only the real extension; str.replace would also rewrite a '.tsv'
    # that happens to appear in a directory name.
    root, _ext = os.path.splitext(file_path)
    txt_file_path = root + '.txt'
    if txt_file_path == file_path:
        # The input already ends in '.txt': never overwrite it, and never
        # return its own path.
        txt_file_path = root + '_copy.txt'

    df = pd.read_csv(file_path, sep='\t', header=None, dtype=str, keep_default_na=False)
    df.to_csv(txt_file_path, sep='\t', index=False, header=False)

    return txt_file_path

Create Vector Store and Assistant

In [None]:
def upload_file_to_vector_store(file_path):
    """Upload a text copy of ``file_path`` to a new vector store.

    Parameters:
    - file_path: Path of the tab-separated file to upload.

    Returns:
    - ID of the vector store that was created.

    The temporary ``.txt`` copy produced by ``encode_file_to_txt`` is removed
    whether or not the upload succeeds.
    """
    txt_file_path = encode_file_to_txt(file_path)

    try:
        # Create the vector store
        vector_store_id = client.beta.vector_stores.create(name=os.path.basename(txt_file_path)).id
        with open(txt_file_path, "rb") as file_stream:
            batch = client.beta.vector_stores.file_batches.upload_and_poll(
                vector_store_id=vector_store_id, files=[file_stream]
            )
        # Surface a failed/cancelled ingestion instead of silently handing back
        # a store that may not contain the file.
        if batch.status != "completed":
            print(f"Warning: vector store upload ended with status '{batch.status}' ({batch.file_counts}).")
    finally:
        # clean up the temporary copy even when the API calls above raise
        os.remove(txt_file_path)

    return vector_store_id

def create_assistant_with_vector_store(vector_store_id, model="gpt-3.5-turbo"):
    """Create the news recommendation assistant bound to a vector store.

    Parameters:
    - vector_store_id: ID of the vector store the file_search tool should use.
    - model: Name of the model the assistant runs on. Defaults to the model
      this notebook originally hard-coded.

    Returns:
    - ID of the newly created assistant.
    """
    instructions = """
      Recommend 20 news articles based on the user's history:
      1. Match subcategories of user's news.
      2. Find similar content.
      3. Prioritize based on engagement history.

      Write the reason why you have chosen the recommendation
      Do not give me any other information

      Output format:
      | News ID   | Title                        | Reason                         |
      """
    assistant = client.beta.assistants.create(
        name="news_recommendation_assistant",
        instructions=instructions,
        model=model,
        tools=[{"type": "file_search"}],
        tool_resources={
            "file_search": {"vector_store_ids": [vector_store_id]}
        }
    )
    return assistant.id


Get User History

In [None]:
def get_user_history(user_id, behaviors_df, news_df):
    """Format the news a user has in their history as a prompt message.

    Parameters:
    - user_id: Value to match against the 'User ID' column of behaviors_df.
    - behaviors_df: DataFrame with 'User ID' and 'History' columns; 'History'
      entries that are lists of News IDs are used, anything else is ignored.
    - news_df: DataFrame with 'News ID', 'Title' and 'Abstract' columns.

    Returns:
    - A string starting with "User history:" followed by one line per matching
      row of news_df (in news_df order). A missing Title or Abstract is
      rendered as an empty string rather than the text "nan".
    """
    user_behaviors = behaviors_df[behaviors_df['User ID'] == user_id]

    # Collect every News ID that appears in any of the user's history lists.
    user_news_ids = set()
    for history in user_behaviors['History']:
        if isinstance(history, list):
            user_news_ids.update(history)

    user_history_df = news_df.loc[
        news_df['News ID'].isin(user_news_ids), ['News ID', 'Title', 'Abstract']
    ]

    def _text(value):
        # NaN would otherwise be formatted as the literal string "nan".
        return "" if pd.isna(value) else value

    # Build the lines once and join them instead of growing a string in a loop.
    history_lines = [
        f"News ID: {news_id}, Title: {_text(title)}, Abstract: {_text(abstract)}\n"
        for news_id, title, abstract in user_history_df.itertuples(index=False, name=None)
    ]

    return "User history:\n" + "".join(history_lines)

Recommendation System

In [None]:
def recommend_news_to_user(assistant_id, user_history_message):
    """Ask the assistant for recommendations and save its reply.

    Parameters:
    - assistant_id: ID of the assistant to run.
    - user_history_message: Text sent as the user message of a new thread.

    Returns:
    - The assistant's reply text, which is also written to
      recommended_news.txt; None if the run did not complete, no assistant
      reply was found, or an error occurred (errors are printed, not raised).
    """
    try:
        # Create a thread and send the user history message
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=user_history_message
        )

        # Run the assistant
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant_id
        )

        # Poll only while the run is still progressing. Looping until
        # "completed"/"failed" would never end for a run that finishes as
        # cancelled, expired, incomplete or requires_action.
        while run.status in ("queued", "in_progress", "cancelling"):
            time.sleep(2)  # Introduce a 2-second delay between checks
            run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
            print(f"Run status: {run.status}")

        if run.status == "failed":
            print(f"Run failed with error: {run.last_error}")
            return None
        if run.status != "completed":
            print(f"Run ended without completing (status: {run.status}).")
            return None

        print("Run completed successfully.")

        # Fetch all messages from the thread
        messages = list(client.beta.threads.messages.list(thread_id=thread.id))
        for i, message in enumerate(messages):
            print(f"Message {i} - Role: {message.role}, Content: {message.content}")

        # Take the first message authored by the assistant instead of assuming
        # that messages[0] is the reply (it could be the user's own message).
        assistant_response = next(
            (message.content for message in messages if message.role == "assistant"),
            None,
        )
        if assistant_response is None:
            print("No messages found.")
            return None

        # extract text
        if isinstance(assistant_response, list):
            extracted_text = "".join(
                block.text.value for block in assistant_response if hasattr(block, 'text')
            )
        else:
            extracted_text = assistant_response

        print("Assistant response:", extracted_text)

        # Save the response to a file; explicit encoding so non-ASCII titles
        # do not depend on the platform default.
        with open("recommended_news.txt", "w", encoding="utf-8") as f:
            f.write(extracted_text)
        print("Recommendations written to recommended_news.txt.")

        return extracted_text

    except Exception as e:
        # Best-effort by design: report the problem and let the notebook continue.
        print(f"Error during recommendation: {e}")
        return None

Main Execution

In [None]:
if __name__ == "__main__":
    # Inputs for this run: the user to recommend for and the news catalogue file.
    specific_user_id = 'U239687'
    news_file_path = "news.tsv"

    # Step 1: build both DataFrames (behaviors come from the rows parsed earlier).
    column_names = ["Impression ID", "User ID", "Time", "History", "Impressions"]
    behaviors_df = pd.DataFrame(parsed_data, columns=column_names)

    news_columns = [
        "News ID", "Category", "SubCategory", "Title", "Abstract", "URL",
        "Title Entities", "Abstract Entities"
    ]
    news_df = pd.read_csv(news_file_path, sep='\t', header=None, names=news_columns)

    # Step 2: push the news file into a vector store.
    vector_store_id = upload_file_to_vector_store(news_file_path)

    # Step 3: create the assistant that searches that vector store.
    assistant_id = create_assistant_with_vector_store(vector_store_id)

    # Step 4: turn the chosen user's history into a prompt message.
    user_history_message = get_user_history(specific_user_id, behaviors_df, news_df)

    # Step 5: ask the assistant for recommendations.
    recommend_news_to_user(assistant_id, user_history_message)

In [None]:
def evaluate_accuracy(behaviors_df, recommendations):
    """
    Score recommendations against the clicked impressions in behaviors_df.

    Parameters:
    - behaviors_df: DataFrame with 'User ID' and 'Impressions' columns; each
      impression is a string, and those containing '-1' count as clicked.
    - recommendations: Dictionary mapping User ID to a list of recommended News IDs.

    Returns:
    - accuracy: Number of clicked articles that were recommended divided by the
      number of clicked articles, summed over rows whose user has
      recommendations; 0 when there are no clicked articles.
    """
    hits = 0
    clicked_total = 0

    for _, record in behaviors_df.iterrows():
        uid = record['User ID']
        # Rows for users without recommendations do not contribute.
        if uid not in recommendations:
            continue

        # Clicked articles: keep the part of the impression before the first '-'.
        clicked = [item.split('-')[0] for item in record['Impressions'] if '-1' in item]

        hits += len(set(recommendations[uid]) & set(clicked))
        clicked_total += len(clicked)

    if clicked_total > 0:
        return hits / clicked_total
    return 0


In [None]:
import re

# `recommendations` was never defined, so this cell raised NameError. Build it
# from the assistant output saved in recommended_news.txt: the News ID is the
# first cell of each table row.
with open("recommended_news.txt", encoding="utf-8", errors="replace") as f:
    recommended_text = f.read()

recommended_ids = re.findall(r"^\s*\|\s*(N\d+)\s*\|", recommended_text, flags=re.MULTILINE)
if not recommended_ids:
    # The reply did not follow the table format; fall back to any News ID token.
    recommended_ids = re.findall(r"\bN\d+\b", recommended_text)

# Drop duplicates but keep order; keyed by the user the recommendations were made for.
recommendations = {specific_user_id: list(dict.fromkeys(recommended_ids))}

accuracy = evaluate_accuracy(behaviors_df, recommendations)
print(f"Accuracy: {accuracy:.2%}")


NameError: name 'recommendations' is not defined