# Approche de classification avec un llm en utilisant l'API de ollama

## 1 Chargement des données

In [11]:
from pathlib import Path
from dataset import load_data_film


DATASET_PATH_TRAIN = Path("data/aclImdb/train")
DATASET_PATH_TEST = Path("data/aclImdb/test")

NB_DOC_MAX_Tr = 1000 # par classe
NB_DOC_MAX_Te = 100
IMDB_CLASSES  = ['neg','pos']
VOC_SIZE = 10000
BATCH_SIZE = 32


txts, files, filelabels = load_data_film(DATASET_PATH_TRAIN, max_size = NB_DOC_MAX_Tr)
txts_test, files_test, labels_test = load_data_film(DATASET_PATH_TEST, max_size = NB_DOC_MAX_Te)

## 2 Connexion avec API ollama

In [30]:
import ollama
import re

def classification_zero_shot(exemple):
    prompt = f"Give the sentiment of the following text in one word: 'positive' or 'negative' :\n\n{exemple}"
    response = ollama.generate(model='qwen:0.5b',prompt=prompt)
    pattern = r"positive|negative"
    match = re.search("positive|negative", response.response, flags=re.I)
    if match:
        if match.group(0).lower() == "positive":
            return 1
        else : 
            return 0
    return response.response


text_pos = 'Fair drama/love story movie that focuses on the lives of blue collar people\
finding new life thru new love.The acting here is good but the film fails in cinematography,screenplay,\
directing and editing.The story/script is only average at best.This film will be enjoyed by Fonda and De\
Niro fans and by people who love middle age love stories where in the coartship is on a more wiser and\
cautious level.It would also be interesting for people who are interested on the subject matter regarding illiteracy.......' #6_10 train

text_neg = "If I had not read Pat Barker's 'Union Street' before seeing this film, I would have liked it. \
Unfortuntately this is not the case. It is actually my kind of film, it is well made, and in no way do I\
want to say otherwise, but as an adaptation, it fails from every angle.<br /><br />The harrowing novel\
about the reality of living in a northern England working-class area grabbed hold of my heartstrings\
and refused to let go for weeks after I had finished. I was put through tears, repulsion, shock,\
anger, sympathy and misery when reading about the women of Union Street. Excellent. A novel \
that at times I felt I could not read any more of, but I novel I simply couldn't put down.\
Depressing yes, but utterly gripping.<br /><br />The film. Oh dear. Hollywood took Barker's truth and reality, \
and showered a layer of sweet icing sugar over the top of it. A beautiful\
film, an inspiring soundtrack, excellent performances, a tale of hope and romance...yes. An adaptation of \
'Union Street'...no.<br /><br />The women of Union Street and their stories are condensed into Fonda's character,\
their stories are touched on, but many are discarded. I accept that some of Barker's tales are sensitive issues and\
are too horrific for mass viewing, and that a film with around 7 leading protagonists just isn't practical, but\
the content is not my main issue. The essence and the real gut of the novel is lost - darkness and rain,\
broken windows covered with cardboard, and the graphically described stench of poverty is replaced with sunshine,\
pretty houses, and a twinkling William's score.<br /><br />If you enjoyed the film for its positivity and hope in the\
face of 'reality', I advise that you hesitate to read the book without first preparing yourself for something more like\
'Schindler's List'...but without the happy ending."


def classification_few_shot(exemple):
    prompt = f"Give the sentiment of the following text in one word 'positive' or 'negative'. Here is some example \n Input : {text_pos[:1000]} - Expected output : positive \n Input : {text_neg[:1000]} - Expected output : negative \n \n\n{exemple}"
    response = ollama.generate(model='qwen:0.5b',prompt=prompt)
    pattern = r"positive|negative"
    match = re.search("positive|negative", response.response, flags=re.I)
    if match:
        if match.group(0).lower() == "positive":
            return 1
        else : 
            return 0
    return response.response


print(txts[1])


Alex D. Linz replaces Macaulay Culkin as the central figure in the third movie in the Home Alone empire. Four industrial spies acquire a missile guidance system computer chip and smuggle it through an airport inside a remote controlled toy car. Because of baggage confusion, grouchy Mrs. Hess (Marian Seldes) gets the car. She gives it to her neighbor, Alex (Linz), just before the spies turn up. The spies rent a house in order to burglarize each house in the neighborhood until they locate the car. Home alone with the chicken pox, Alex calls 911 each time he spots a theft in progress, but the spies always manage to elude the police while Alex is accused of making prank calls. The spies finally turn their attentions toward Alex, unaware that he has rigged devices to cleverly booby-trap his entire house. Home Alone 3 wasn't horrible, but probably shouldn't have been made, you can't just replace Macauley Culkin, Joe Pesci, or Daniel Stern. Home Alone 3 had some funny parts, but I don't like 

In [18]:
classification_zero_shot(txts[102])

0

## 3 Performance du modèle zero-shot

In [13]:
import random as rd
import numpy as np


rd.seed(42)

test_idx = rd.sample([i for i in range(2*NB_DOC_MAX_Te)], k = int(0.3 * NB_DOC_MAX_Te * 2))

test_txt = txts_test[test_idx]; val_txt = np.delete(txts_test,test_idx)
test_label = labels_test[test_idx]; val_label = np.delete(labels_test,test_idx) 

print(test_idx)

data = val_txt
labels = val_label



[163, 28, 6, 189, 70, 62, 57, 35, 188, 26, 173, 139, 22, 151, 108, 8, 7, 23, 55, 59, 129, 154, 197, 143, 50, 166, 191, 107, 56, 114, 150, 71, 1, 40, 185, 87, 168, 39, 181, 86, 190, 182, 97, 24, 91, 88, 67, 11, 117, 137, 31, 96, 20, 141, 75, 92, 49, 17, 152, 58]


In [31]:
from tqdm import tqdm
NUMBER_VAL = 100

output = []
for txt in tqdm(val_txt[:100]):
    output.append(classification_zero_shot(txt))

output = np.array(output)
print(output)

100%|██████████| 100/100 [05:50<00:00,  3.51s/it]

['1'
 'mentiment towards his father, Sandy, who has been caught using marijuana.'
 'subways have made me even jadged to movie gore.' '0' '0' '1' '0' '0'
 "like the movie serious, but it doesn't feel too much like a parody." '0'
 '0' '0' '0'
 "father.\n\nI watched that movie yesterday with my father.\n\nThat's the best thriller of 1999!"
 '0' '0' '1' '0' '0' '0' '0' '0' '0' '0'
 '是一部充满刺激和痛苦的电影，讲述了主角面对危险和困境时的内心挣扎和无助。总的来说，这部电影充满了紧张感和挑战性，同时也反映了人类对于未知和恐惧的追求。'
 'realize that they are not different in their field of study. This creates a sense of confusion and frustration among the characters.\n\nAdditionally, the use of college shootings as the focal point for its script highlights the lack of critical thinking skills among the characters.\n\nTherefore, the text expresses disappointment and frustration among the characters.'
 '0' '0' '0' '1' '0' '1' '0' '0' '0' 'for the love of their country.' '钝.'
 '0' '0' 'shocking'
 'is still present, so we copy it from another world multiple times.' '0'





In [22]:
accuracy = (output == val_label[:100]).sum()/100
non_conclusive = (output == None).sum()/100

In [23]:
print(accuracy, non_conclusive)

0.56 0.27


avec un autre modèle ?

## 4 Performance d'un modèle few-shot

In [26]:
from tqdm import tqdm
NUMBER_VAL = 100

output = []
for txt in tqdm(val_txt[:100]):
    output.append(classification_few_shot(txt))

output = np.array(output)
print(output)

100%|██████████| 100/100 [11:23<00:00,  6.83s/it]

[0 None 1 None 0 1 0 1 None 0 0 0 0 None None 0 1 None 0 None None 0 0 0
 None 1 0 0 0 1 0 None 0 None None 0 None 0 0 None None None 0 0 None None
 0 0 None None None 0 None 1 0 0 0 0 None 0 0 0 0 0 None 0 0 0 None None 0
 None 0 1 None 0 0 1 0 None None None 1 0 1 None 1 0 None 1 None None 1
 None None 1 1 0 1 None]





In [27]:
accuracy = (output == val_label[:100]).sum()/100
non_conclusive = (output == None).sum()/100

In [28]:
print(accuracy,non_conclusive)

0.44 0.38


## Campagne de fine-tuning
Objectif de la campagne : 
 - tester plusieurs prompts : prompt naif/ prompt généré par chatgpt
 - format de sorti : text + regexp / json 
 - modèle : qwen / qwen2 / qwen2.5


#### Plot les variations de perf en fonction de l'approche choisie