# Approche de classification avec un LLM en utilisant l'API d'Ollama

## Chargement des données

In [None]:
from pathlib import Path
from dataset import load_data_film


# Relative paths to the train and test splits of the aclImdb dataset.
DATASET_PATH_TRAIN = Path("data/aclImdb/train")
DATASET_PATH_TEST = Path("data/aclImdb/test")

# Experiment configuration.
NB_DOC_MAX = 1000  # per class
IMDB_CLASSES = ["neg", "pos"]
# NOTE(review): VOC_SIZE and BATCH_SIZE are not used in the cells visible here.
VOC_SIZE = 10000
BATCH_SIZE = 32


# Load both splits with the same size cap.
# NOTE(review): the three returned values are presumably the texts, the file
# names and the labels -- confirm against dataset.load_data_film.
txts, files, filelabels = load_data_film(
    DATASET_PATH_TRAIN,
    max_size=NB_DOC_MAX,
)
txts_test, files_test, filelabels_test = load_data_film(
    DATASET_PATH_TEST,
    max_size=NB_DOC_MAX,
)

## Connexion avec l'API Ollama

In [27]:
import ollama
import re

def classification_zero_shot(exemple, model='qwen:0.5b'):
    """Classify the sentiment of a text with a zero-shot prompt sent to an Ollama model.

    Args:
        exemple: Text to classify; it is inserted verbatim at the end of the prompt.
        model: Name of the Ollama model to query. Defaults to 'qwen:0.5b', so
            existing single-argument calls keep their behavior.

    Returns:
        1 if the first sentiment word found in the model's answer is
        "positive", 0 if it is "negative", and None when the answer contains
        neither word (non-conclusive answer).
    """
    prompt = f"Give the sentiment of the following text in one word: 'positive' or 'negative' :\n\n{exemple}"
    response = ollama.generate(model=model, prompt=prompt)
    # Case-insensitive search: the model may answer "Positive", "NEGATIVE.", or
    # wrap the word in a longer sentence; only the first occurrence is used.
    match = re.search(r"positive|negative", response.response, flags=re.I)
    if match is None:
        return None
    return 1 if match.group(0).lower() == "positive" else 0

# Sanity check: show one review from the training split before classifying.
print(txts[1])


Alex D. Linz replaces Macaulay Culkin as the central figure in the third movie in the Home Alone empire. Four industrial spies acquire a missile guidance system computer chip and smuggle it through an airport inside a remote controlled toy car. Because of baggage confusion, grouchy Mrs. Hess (Marian Seldes) gets the car. She gives it to her neighbor, Alex (Linz), just before the spies turn up. The spies rent a house in order to burglarize each house in the neighborhood until they locate the car. Home alone with the chicken pox, Alex calls 911 each time he spots a theft in progress, but the spies always manage to elude the police while Alex is accused of making prank calls. The spies finally turn their attentions toward Alex, unaware that he has rigged devices to cleverly booby-trap his entire house. Home Alone 3 wasn't horrible, but probably shouldn't have been made, you can't just replace Macauley Culkin, Joe Pesci, or Daniel Stern. Home Alone 3 had some funny parts, but I don't like 

In [26]:
# Classify a single training review as a quick manual check.
# NOTE(review): the recorded output of this cell (False and a raw
# GenerateResponse) does not match the 0/1/None return values of the function
# as defined, and its execution count precedes the definition cell's --
# re-run after Restart & Run All.
classification_zero_shot(txts[102])

False


GenerateResponse(model='qwen:0.5b', created_at='2025-01-28T15:33:19.873117486Z', done=True, done_reason='stop', total_duration=2979248567, load_duration=60739802, prompt_eval_count=258, prompt_eval_duration=2856000000, eval_count=2, eval_duration=60000000, response='negative', context=[151644, 872, 198, 35127, 279, 25975, 315, 279, 2701, 1467, 304, 825, 3409, 25, 220, 6, 30487, 6, 476, 220, 6, 42224, 6, 220, 25, 198, 198, 40, 6551, 419, 5700, 264, 220, 20, 700, 315, 10526, 56943, 13, 3017, 14602, 374, 537, 311, 20524, 5489, 6, 82, 23611, 11, 1576, 358, 6, 586, 3884, 11, 438, 358, 6, 586, 78569, 2061, 1526, 1008, 6042, 11, 429, 419, 5700, 374, 5008, 25808, 553, 1657, 13, 8325, 11, 432, 374, 537, 5802, 894, 86078, 11, 323, 358, 1977, 419, 1576, 358, 6, 586, 3884, 279, 4024, 22809, 11, 65970, 3187, 11, 323, 419, 5700, 557, 971, 432, 17247, 13, 18765, 421, 498, 1490, 34449, 18337, 7484, 11, 498, 1410, 1075, 432, 13, 1084, 374, 31080, 4152, 311, 279, 1632, 12, 331, 460, 90573, 24554, 11, 31

In [30]:
from tqdm import tqdm

# Smoke test: classify the first 10 training texts, with a progress bar, and
# show the raw predictions (1 = positive, 0 = negative, None = non-conclusive).
output = [classification_zero_shot(txt) for txt in tqdm(txts[:10])]
print(output)


100%|██████████| 10/10 [00:38<00:00,  3.85s/it]

[0, 0, None, None, 0, 1, 0, 0, 0, 1]





## Performance du modèle 0

In [37]:
import random as rd
import numpy as np


# Evaluation subset configuration.
EVAL_SAMPLE_SIZE = 100  # number of training documents to evaluate on
EVAL_SEED = 42          # fixed seed so the subset is the same on every run

# Draw distinct indices over the documents that were actually loaded (rather
# than assuming 2 * NB_DOC_MAX of them). A dedicated seeded generator keeps the
# draw reproducible without altering the global `random` state.
idx = rd.Random(EVAL_SEED).sample(range(len(txts)), EVAL_SAMPLE_SIZE)

# Fancy-index texts and labels with the same indices so they stay aligned.
data = np.array(txts)[idx]
labels = np.array(filelabels)[idx]



In [38]:
# Collect predictions in a plain list: NumPy arrays have no `.append` method,
# so starting from `np.array([])` raised AttributeError on the first iteration.
# Each entry is 1 (positive), 0 (negative) or None (non-conclusive answer).
output = [classification_zero_shot(txt) for txt in tqdm(data)]
print(output)

100%|██████████| 100/100 [06:06<00:00,  3.67s/it]

[1, None, 0, 0, 0, None, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, None, None, 1, 1, 0, 1, None, 0, 1, 0, 0, 1, 1, 1, 0, 1, None, 0, 1, None, 1, 1, 0, 0, 1, None, 1, 0, 1, 0, 1, 0, 1, None, 0, 0, None, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, None, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1]





In [51]:
# Work on an object array so that None entries survive the conversion and the
# comparison with `labels` is element-wise whether `output` is a list or an array.
predictions = np.asarray(output, dtype=object)
n_eval = len(predictions)

# Share of predictions equal to the label; non-conclusive (None) answers never
# equal a label and therefore count as errors.
# NOTE(review): assumes `labels` uses the same 0/1 encoding as the predictions
# -- confirm against load_data_film.
accuracy = np.sum(predictions == labels) / n_eval

# Share of answers in which no sentiment word was found. An explicit `is None`
# test is used because `list == None` is a single bool, not an element-wise mask.
non_conclusive = sum(pred is None for pred in predictions) / n_eval

## Campagne de fine-tuning
Objectifs de la campagne :
 - tester plusieurs prompts : prompt naïf / prompt généré par ChatGPT
 - format de sortie : texte + regexp / JSON
 - modèle : qwen / qwen2 / qwen2.5
