<a href="https://colab.research.google.com/github/jppaolim/homer-22/blob/master/V6c_Eval_Gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Ce notebook charge un modèle PyTorch et l'évalue sur des tâches de génération de texte.


In [1]:
# ---------------------------------------------------------------------------
# Run configuration: everything a user may want to tune lives in this cell.
# ---------------------------------------------------------------------------

# Set to True to have the setup cell install the Python dependencies.
NEED_PIP = False

# Model checkpoint name and label of the evaluation run.
ModelName = "v61_Large_2E"
RunName = "r1"

BaseRepoPath = "/home/studio-lab-user/homer22/homer-22/"

# TGT_SAMPLES is the total number of stories we want to generate.
TGT_SAMPLES = 5000

# MAXRUN_SAMPLES is the number of stories we want to generate in this run.
MAXRUN_SAMPLES = 1000

# Locations derived from the settings above.
dataPath = "{}data/".format(BaseRepoPath)
Model2Load = "jppaolim/{}".format(ModelName)

seed = 42

do_generate = True

# MAUVE is computed only when this run is not generating stories.
do_mauve = not do_generate

#Override do mauve if needed 
#do_mauve = True 




# Init

In [2]:
# Optional environment setup: installs the libraries imported by this notebook.
# Runs only when NEED_PIP is True. %pip is an IPython magic, so this cell is
# valid in a notebook kernel only, not as plain Python.
if NEED_PIP:
    %pip install torch torchvision
    %pip install transformers
    %pip install datasets
    %pip install mauve-text
    

In [3]:
import torch

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_metric,load_from_disk
from tqdm import tqdm

import random
import time
import re
import csv
import json
import sys



ISGPU = torch.cuda.is_available()

if ISGPU:
    !nvidia-smi

device = "cuda:0" if ISGPU else "cpu"
random.seed(seed)


In [4]:
# Sampling settings shared with the generation loop. Every entry holds an
# empty-string placeholder until Set_Sampling_parameters() is called.
Sampling_parameters = dict.fromkeys(
    ('top_p', 'top_k', 'temperature', 'repetition_penalty'), ''
)


def Set_Sampling_parameters(top_p, top_k, temperature, repetition_penalty):
    """Store the given values in the module-level ``Sampling_parameters`` dict.

    The dict is modified in place and nothing is returned.
    """
    Sampling_parameters.update(
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
    )

def regularizeDot(str):
    """Return ``str`` with a space inserted after every '.' or ',' that is
    directly followed by a non-whitespace character."""
    # Zero-width match: right after a dot/comma and right before a non-space.
    glued_punctuation = re.compile(r'(?<=[.,])(?=[^\s])')
    return glued_punctuation.sub(r' ', str)

def BuildFileName(StoryOrigin):
    """Return the path of the CSV file holding the stories of one origin.

    The path is built from the global ModelName and RunName plus StoryOrigin.
    StoryOrigin must be "Machine" or "Human"; any other value stops execution
    through sys.exit with an error message.
    """
    if StoryOrigin not in ("Machine", "Human"):
        sys.exit("Error:  should be either Machine or Human")

    path_parts = ["./Stories/", ModelName, RunName, StoryOrigin, "Stories.csv"]
    return "".join(path_parts)
   

#return a list of existing stories
def readExistingStory(StoryOrigin):
    """Load the stories already saved for the given origin.

    Args:
        StoryOrigin: "Machine" or "Human"; forwarded to BuildFileName to get
            the CSV path.

    Returns:
        A list with the first column of every non-empty CSV row, in file
        order. An empty list when the file does not exist yet.
    """
    csvFileName = BuildFileName(StoryOrigin)

    try:
        # newline='' is what the csv module expects: it lets the reader handle
        # line endings itself, so line breaks embedded in a quoted story are
        # returned unchanged.
        with open(csvFileName, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, quoting=csv.QUOTE_ALL)
            # A blank line is parsed as an empty row ([]); skip it instead of
            # failing with IndexError on row[0]. An empty story is still
            # kept because it is stored as "" and parsed as [''].
            return [row[0] for row in reader if row]
    except FileNotFoundError:
        # Nothing has been saved yet for this model / run.
        return []

# Maintenance helper; no active code visible in this notebook calls it.
def writeFromTo(List, Destination, startIndex, endIndex):
    """Append a slice of stories to a CSV file, one fully quoted story per row.

    Args:
        List: sequence of stories.
        Destination: path of the CSV file; opened in append mode.
        startIndex: index of the first story written.
        endIndex: index one past the last story written (slice semantics).
    """
    # newline='' keeps the platform from translating the "\r\n" row terminator
    # emitted by the csv writer, as required by the csv module documentation.
    with open(Destination, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        writer.writerows([story] for story in List[startIndex:endIndex])


def SaveResultData():
    """Overwrite EvalResult.json with the JSON form of the global ResultData."""
    with open('EvalResult.json', 'w') as results_file:
        json.dump(ResultData, fp=results_file)

    

In [5]:
# Pick up the stories that earlier runs already saved for this model / run.
MachineStories = readExistingStory("Machine")
HumanStories = readExistingStory("Human")

# The two lists are expected to have the same length; stop if they do not.
if not len(MachineStories) == len(HumanStories):
    sys.exit("Warning, there is not the same numbers of stories, please review") 

#writeFromTo(HumanStories, BuildFileName("Human"),0,699) #it will write the human stories till index 698 so 699 stories 

# Reload previously saved evaluation results; start from an empty list when
# no result file exists yet.
try:
    results_file = open('EvalResult.json')
except FileNotFoundError:
    ResultData = []
else:
    with results_file:
        ResultData = json.load(results_file)
    
def updateResultData(ResultData, CurrentResultIdx, Metric, Value):
    """Set ``Metric`` to ``Value`` on the entry at position ``CurrentResultIdx``
    of ``ResultData``. The entry is changed in place; nothing is returned."""
    entry = ResultData[CurrentResultIdx]
    entry[Metric] = Value

#print(ResultData) 
# Positions of the saved results that belong to this model / run pair.
MatchingResultIdx = [idx for idx, element in enumerate(ResultData) if element['model']==Model2Load and element['Run']==RunName ]

if len(MatchingResultIdx) > 1:
    sys.exit("Warning, there are multiple run with same model, same run")

elif len(MatchingResultIdx) == 1:
    # Resume: continue from the counters stored by the previous run.
    print("Resuming from last time")
    CurrentResultIdx = MatchingResultIdx[0]
    print(ResultData[CurrentResultIdx])
    AccumulatedGentime = ResultData[CurrentResultIdx]['Gentime']
    AccumulatedSamples = ResultData[CurrentResultIdx]['Samples']

else:
    # No entry yet: create one with zeroed counters and persist it right away.
    print("First time we launch this couple Model / Run")
    AccumulatedGentime = 0
    # Must be defined here too: the bookkeeping after generation reads it, and
    # leaving it unset made the very first run fail with a NameError.
    AccumulatedSamples = 0
    CurrentResult = {'model' : Model2Load, 'Run' : RunName, 'Samples' :  int(0),  'mauve' : float(0), 'Gentime' : float(0), 'MauvTime' : float(0) }
    CurrentResultIdx = len(ResultData)
    ResultData.append(CurrentResult)
    SaveResultData()


# Reference point for the timings computed later; Gentime stays 0 unless the
# generation bookkeeping overwrites it.
StartTime = time.time()
Gentime = 0
        


Resuming from last time
{'model': 'jppaolim/v61_Large_2E', 'Run': 'r1', 'Samples': 4536, 'mauve': 0.0, 'Gentime': 143607.32665753365, 'MauvTime': 0.0}


# Generate

In [6]:
if do_generate:
    # Fetch the tokenizer and the causal language model for Model2Load, then
    # move the model to the selected device.
    tokenizer = AutoTokenizer.from_pretrained(Model2Load)
    model = AutoModelForCausalLM.from_pretrained(Model2Load).to(device)

In [7]:
if do_generate:

    # Stories saved by previous runs; generation resumes right after them.
    start_index = len(MachineStories)
    print("already loaded {} stories".format(start_index))

    # Load the human titles and stories from disk.
    ds = load_from_disk(dataPath+"TitleAndStory/")

    # Draw TGT_SAMPLES rows at random. The draw uses the global `random`
    # state seeded in the setup cell, so a resumed run gets the same selection
    # only if the cells are executed in order on a fresh kernel.
    index = random.sample(range(ds['train'].num_rows),TGT_SAMPLES)
    ds4Compute=ds['train'].select(index)

    # Put the stories in HumanStories and the titles in PromptList.
    # NOTE: this replaces the HumanStories list read from the CSV file with
    # the full selection of TGT_SAMPLES stories.
    HumanStories = ds4Compute["fullstory"]
    PromptList = ds4Compute["storytitle"]

    print("Here is story 4 ")
    print(PromptList[3])
    print(HumanStories[3])
    # MachineStories[3] exists only once at least 4 stories have been
    # generated; the previous `>= 3` test raised IndexError with exactly 3.
    if start_index > 3:
        print(MachineStories[3])
    
    


already loaded 4536 stories
Here is story 4 
Empty Club.
Empty Club. Kate and her friends were at a dead club in Las Vegas. There were less than 100 people inside. The dance floor was almost empty. But they decided to make the most of it. They stood on the dance floor pretending to enjoy themselves.
Empty Club. The boys were bored at school. They decided to go to the empty club to talk. After a few minutes, nobody wanted to talk to them. They decided to go home instead. They didn't have a good time that day.


In [None]:
if do_generate:

    # Generate at most MAXRUN_SAMPLES stories in this run, never going past
    # the overall target.
    end_index = min(TGT_SAMPLES, start_index + MAXRUN_SAMPLES)
    # Stays 0 when the loop below is interrupted; set to 1 once it completes.
    IsNotInterrupted = 0

    # The sampling settings are identical for every story: set them once.
    Set_Sampling_parameters(0.9,50,1,1)

    # newline='' is required by the csv module so that row terminators are
    # written untouched on every platform.
    with open(BuildFileName("Machine"), 'a', newline='') as csvfile1, open(BuildFileName("Human"), 'a', newline='') as csvfile2 :

        writerMachine = csv.writer(csvfile1, quoting=csv.QUOTE_ALL)
        writerHuman = csv.writer(csvfile2, quoting=csv.QUOTE_ALL)

        ############ Main Generation Loop ############@
        for i in tqdm(range(start_index,end_index)):

            # The naive model is prompted with the bare title; the others
            # expect the title wrapped in their control tokens.
            if ModelName == "v36_Naive":
                prompt = PromptList[i]
            else:
                prompt = "<|Title|>" + PromptList[i] + " <|Step 1|>"

            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

            output = model.generate(
                input_ids,
                max_length=100,
                do_sample=True,
                top_k=Sampling_parameters["top_k"],
                top_p=Sampling_parameters["top_p"],
                repetition_penalty=Sampling_parameters["repetition_penalty"],
                temperature=Sampling_parameters["temperature"],
                pad_token_id=tokenizer.eos_token_id,
                early_stopping=True
            )

            generated = tokenizer.decode(output[0], skip_special_tokens=True)
            # Only for the old homer model: re-insert the spaces after dots
            # and commas.
            if ModelName == "homerGPT2":
                generated = regularizeDot(generated)

            MachineStories.append(generated)
            writerMachine.writerow([generated])
            writerHuman.writerow([HumanStories[i]])
            # Push both rows to disk together so that a killed kernel cannot
            # leave the two files with a different number of stories, which
            # would block the next resume.
            csvfile1.flush()
            csvfile2.flush()
        ############ End Loop & Close file ############@


    IsNotInterrupted = 1
       

  5%|▍         | 22/464 [14:29<4:44:14, 38.58s/it]

In [None]:
if do_generate:
    # Wall-clock time elapsed since StartTime was taken.
    Gentime = time.time()- StartTime 
    # Count the stories produced in this run from the list itself. The loop
    # variable `i` is undefined when the loop ran zero iterations (target
    # already reached), which made the previous formula raise a NameError.
    GeneratedThisRun = len(MachineStories) - start_index
    updateResultData(ResultData, CurrentResultIdx, "Samples", GeneratedThisRun + AccumulatedSamples)
    updateResultData(ResultData, CurrentResultIdx, "Gentime", Gentime + AccumulatedGentime)

In [None]:
# Write the current ResultData to EvalResult.json.
SaveResultData()

In [None]:
# Show the result entry of the current model / run.
print(ResultData[CurrentResultIdx])

# Eval

In [None]:
if do_mauve:

    # Compare the machine stories with the human stories using MAUVE.
    metric = load_metric('mauve')

    out = metric.compute(predictions=MachineStories, references=HumanStories)
    print(out.mauve)

    # Time elapsed since StartTime, minus the generation time of this run.
    # The variable is `Gentime`; the previous spelling `GenTime` did not
    # exist and raised a NameError before anything was saved.
    MauvTime = time.time() - Gentime - StartTime

    # float() guards the JSON dump in case the score is not a plain Python
    # float (presumably a numpy scalar - TODO confirm).
    updateResultData(ResultData, CurrentResultIdx, "mauve", float(out.mauve))
    updateResultData(ResultData, CurrentResultIdx, "MauvTime", MauvTime)

    print(ResultData[CurrentResultIdx])

    SaveResultData()
    

    