In [1]:
import json

import numpy as np

from tqdm import tqdm
from datasets import load_dataset
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the 'socratic' configuration of GSM8K; without a `split` argument this
# returns every available split (displayed in the next cell).
gsm8k = load_dataset('gsm8k', 'socratic')

In [4]:
gsm8k

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [7]:
gsm8k['train'][0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nHow many clips did Natalia sell altogether in April and May? ** Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [20]:
# Alpaca-style prompt template for GSM8K. It is filled with str.format():
# {question} receives the problem statement and {answer} the reference solution
# (the test-time formatter passes an empty string for {answer}).
alpaca_prompt_gsm8k = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: {question}

### Response:
Answer: {answer}
"""

def formatting_prompts_func_train(examples):
  """Render a batch of GSM8K rows into complete training prompts.

  `examples` is a batch mapping with parallel "question" and "answer"
  sequences. Returns {'text': [...]} with one filled-in prompt per pair.
  """
  # NOTE: no EOS token is appended here; the `+ EOS_TOKEN` suffix is disabled
  # in this notebook, so add it before using these prompts for fine-tuning.
  prompts = [
    alpaca_prompt_gsm8k.format(question=q, answer=a)
    for q, a in zip(examples["question"], examples["answer"])
  ]
  return {'text': prompts}


def formatting_prompts_func_test(examples):
  """Render a batch of GSM8K questions into prompts with an empty answer slot.

  `examples` is a batch mapping with a "question" sequence. Returns
  {'text': [...]} with one prompt per question; the {answer} placeholder is
  filled with an empty string so the model can complete it.
  """
  return {
    'text': [
      alpaca_prompt_gsm8k.format(question=q, answer="")
      for q in examples["question"]
    ]
  }
# Train split: add a 'text' column holding the fully rendered prompt (question + reference answer).
dataset = load_dataset('gsm8k', 'socratic', split='train')
dataset = dataset.map(formatting_prompts_func_train, batched=True)

# Test split: the 'text' column holds the same prompt with the answer slot left empty.
dataset_test = load_dataset('gsm8k', 'socratic', split='test')
dataset_test = dataset_test.map(formatting_prompts_func_test, batched=True)

Using the latest cached version of the dataset since gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'socratic' at C:\Users\ellen\.cache\huggingface\datasets\gsm8k\socratic\0.0.0\e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Tue Apr 30 21:07:35 2024).
Map: 100%|██████████| 7473/7473 [00:00<00:00, 52571.61 examples/s]
Using the latest cached version of the dataset since gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'socratic' at C:\Users\ellen\.cache\huggingface\datasets\gsm8k\socratic\0.0.0\e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Tue Apr 30 21:12:27 2024).
Map: 100%|██████████| 1319/1319 [00:01<00:00, 821.36 examples/s]


In [21]:
dataset[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nHow many clips did Natalia sell altogether in April and May? ** Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n\n### Instruction:\nYou are a helpful math teacher.\n\n### Input:\nQuestion: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\n\n### Response:\nAnswer: How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nHow many clips did Natalia sell altogether in April and 

In [22]:
dataset_test[0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': "How many eggs does Janet sell? ** Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nHow much does Janet make at the farmers' market? ** She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18",
 'text': "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n\n### Instruction:\nYou are a helpful math teacher.\n\n### Input:\nQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much i

In [23]:
dataset_test['text'][0]

"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n\n### Instruction:\nYou are a helpful math teacher.\n\n### Input:\nQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\n\n### Response:\nAnswer: \n"

In [45]:
import os
import pandas as pd
from datasets import load_dataset
# Relative location of the essay-scoring training CSV.
data_path = "kaggle_automated_essay_scoring_2/"
data_file = os.path.join(data_path, "train.csv")

# Read the CSV through the `datasets` CSV loader.
# NOTE: this rebinds `dataset`, which previously held the GSM8K train split.
dataset = load_dataset("csv", data_files=data_file, split='train')





In [46]:
dataset

Dataset({
    features: ['essay_id', 'full_text', 'score'],
    num_rows: 17307
})

In [59]:
from datasets import load_dataset
# EOS_TOKEN = tokenizer.eos_token

# Alpaca-style prompt template for essay scoring. It is filled with str.format():
# {input} receives the essay text and {output} the response section
# ("Score: <n>" for training prompts, a bare "Score:" for test prompts).
alpaca_prompt_essay_score = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are an English teacher. Given the following essay, you give a score.

### Input:
{input}

### Response:
{output}
"""

def formatting_prompts_func_train(examples):
  """Render a batch of scored essays into training prompts.

  NOTE: this rebinds the name used earlier in the notebook for the GSM8K
  formatter; from this cell on the name refers to the essay version.

  Args:
    examples: batch mapping with parallel "full_text" and "score" sequences.

  Returns:
    dict mapping 'text' to a list of prompts, one per essay, whose response
    section reads "Score: <score>".
  """
  essays = examples["full_text"]
  scores = examples["score"]
  # NOTE: no EOS token is appended here (the `+ EOS_TOKEN` suffix is disabled
  # in this notebook); add it before using these prompts for fine-tuning.
  texts = [
    alpaca_prompt_essay_score.format(input=essay, output=f"Score: {score}")
    for essay, score in zip(essays, scores)
  ]
  return {'text': texts}


def formatting_prompts_func_test(examples):
  """Render a batch of essays into inference prompts with the score left open.

  Only the "full_text" column is read, so the function also works on
  unlabeled data that has no "score" column.

  Args:
    examples: batch mapping with a "full_text" sequence.

  Returns:
    dict mapping 'text' to a list of prompts, one per essay, whose response
    section is the bare label "Score:" for the model to complete.
  """
  texts = [
    alpaca_prompt_essay_score.format(input=essay, output="Score:")
    for essay in examples["full_text"]
  ]
  return {'text': texts}

data_path = "kaggle_automated_essay_scoring_2/"
data_file = os.path.join(data_path, "train.csv")

dataset = load_dataset("csv", data_files=data_file, split='train')

# Fixed seed so the 80/20 split is reproducible across kernel restarts.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test

# Access the train and test sets
dataset_train = train_test_split['train']
dataset_test = train_test_split['test']
# Each split is formatted in a single batch sized by its OWN length
# (the test split previously borrowed len(dataset_train)).
dataset_train = dataset_train.map(formatting_prompts_func_train, batched=True, batch_size=len(dataset_train))
dataset_test = dataset_test.map(formatting_prompts_func_test, batched=True, batch_size=len(dataset_test))


Map: 100%|██████████| 13845/13845 [00:00<00:00, 32109.33 examples/s]
Map: 100%|██████████| 3462/3462 [00:00<00:00, 29113.68 examples/s]


In [60]:
dataset_train[0]

{'essay_id': 'ea26dc4',
 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a presiden

In [61]:
dataset_test[0]

{'essay_id': '44eff2d',
 'full_text': 'Based on the many differences of positive and negative comments towards the use of driverless cars, I wish to pick neither side in the disscusion.\n\nI believe there are many reasons that driverless cars should take over the roads, but a handfull of reasons that driverless cars on our roads today would lead to dissaster.\n\nTo start off, there was a point made in paragraph one, where the author tells us that new smart cars or driverless cars would use half of the gas as todays taxis.\n\nWith this, there could be many things that move in a positive direction.\n\nFor example, if cars are running on half of the ammount of fuel that todays taxis use, then there is no doubt about the air polution in our cities and other countries being limmitted.\n\nFor years people have been trying to find ways to limit the air polution around us for future generations.\n\nThey want their kids to grow up in a health environment.\n\nHere is one way to accomplish those 

In [56]:
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test

# Access the train and test sets
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']
# Format only the training split. Mapping the full `dataset` here would put the
# held-out test rows into `dataset_train`.
dataset_train = train_dataset.map(formatting_prompts_func_train, batched=True, batch_size=len(train_dataset))

In [57]:
train_dataset

Dataset({
    features: ['essay_id', 'full_text', 'score', 'text'],
    num_rows: 13845
})

In [63]:
response = """
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are an English teacher. Given the following essay, you give a score.

### Input:
Based on the many differences of positive and negative comments towards the use of driverless cars, I wish to pick neither side in the disscusion.

I believe there are many reasons that driverless cars should take over the roads, but a handfull of reasons that driverless cars on our roads today would lead to dissaster.

To start off, there was a point made in paragraph one, where the author tells us that new smart cars or driverless cars would use half of the gas as todays taxis.

With this, there could be many things that move in a positive direction.

For example, if cars are running on half of the ammount of fuel that todays taxis use, then there is no doubt about the air polution in our cities and other countries being limmitted.

For years people have been trying to find ways to limit the air polution around us for future generations.

They want their kids to grow up in a health environment.

Here is one way to accomplish those disires.

However, if people want to start driving driverless cars, then the roads could become a lot more dangerous.

Yes there may be a decrease in the possibility for human error, but do people really trust computers, that break, to do a better job at a human job, than a human?

One of the big "no's" in using driverless cars is the cost.

As mentioned in paragraph three, the first step was seeing if modifying the roads would be a better option.

In my opinion, altering the roads to be formed into some kind of track would be the right way to go.

There is less of an opourtunity for collisions to happen if cars are on a track that has a certain path.

However, this is not an option because of the money.

It was said that "These smart-road systems worked surprisingly well, but they required massive upgrades to existing roads, something that was simply too expensive to be practical."

Also, if smart-cars were to be made a universal product allowed on the roads, how much would they cost?

Who is to say they arent just as outrageously priced as the roads.

The author mentions almost everything about these driverless cars in his paper, except the cost of them.

With my experience I am driven to believe that the reasoning for this is the cars are so expensive that nobody would buy them anyways.

You could read the whole paper and then see the price and you wouldn't care if the cars drove themselves because you wouldn't be able to afford it anyways.

"Why would anyone want a driverless car that still needs a driver?"

This sentence from paragraph eight is the basis for all driverless cars.

Supposedly, the car, when needing to get through tough places or navigate around construction, will alert the driver either by a vibrating seat or simply a voice command.

Sure this could be great!

You could let your car drive you around while you sit in the seat and text your friend or eat some food, but when doing so you are still putting yourself under a hunge risk.

If your car needs you to take over and your hands are covered in katchup and mustard from a hot dog you just ate and youre on youre phone and not paying attention, you are going to have a hard time reacting.

Where as if you were just driving the car, youre reaction time will be very fast and you will know what is oing on before it happens.

If youre in a driverless car and it swerves, you may grab the wheel, but have no idea where you are on the road or anything because you were too buys eating your hotdog and texting.

I believe that there are both good things and bad things that could come from having driverless cars.

For every good aspect of them, there is one equally as bad.

### Response:
Score:
3
Reasoning:
The author does a good job of explaining both sides of the argument, but in the end he does not pick a side.

### Instruction:
You are an English teacher. Given the following essay, you give a score.

### Input:
The author of the article "Driverless Cars" does a good job of explaining both sides of the argument, but in the end he does not pick a side.

The author does a good job of explaining both sides of the argument, but in the end he does not pick a side.

The author does a good job of explaining both sides of the argument, but in the end
"""

import re 
# Find the digit that follows "### Response:" and "Score:" in the generated text

def extrat_score(responses):
    """Extract the predicted score digit from one or more generated responses.

    Args:
        responses: a single response string, or an iterable (list, tuple, ...)
            of response strings.

    Returns:
        A list with one entry per response: the first digit that follows
        "### Response:" and "Score:" as a one-character string, or None when
        the response does not contain that pattern.
    """
    # A bare string is ONE response, not a sequence of characters. Any other
    # iterable is taken as a collection of responses.
    if isinstance(responses, str):
        responses = [responses]
    pattern = re.compile(r'### Response:\s+Score:\s*(\d)')
    scores = []
    for r in responses:
        # search() returns the first occurrence, i.e. the score for the
        # original prompt even if the model keeps generating further blocks.
        match = pattern.search(r)
        scores.append(match.group(1) if match else None)
    return scores




score = extrat_score(response)
score

'3'

In [65]:
sample

{'essay_id': '44eff2d',
 'full_text': 'Based on the many differences of positive and negative comments towards the use of driverless cars, I wish to pick neither side in the disscusion.\n\nI believe there are many reasons that driverless cars should take over the roads, but a handfull of reasons that driverless cars on our roads today would lead to dissaster.\n\nTo start off, there was a point made in paragraph one, where the author tells us that new smart cars or driverless cars would use half of the gas as todays taxis.\n\nWith this, there could be many things that move in a positive direction.\n\nFor example, if cars are running on half of the ammount of fuel that todays taxis use, then there is no doubt about the air polution in our cities and other countries being limmitted.\n\nFor years people have been trying to find ways to limit the air polution around us for future generations.\n\nThey want their kids to grow up in a health environment.\n\nHere is one way to accomplish those 

In [66]:
# Generate one completion per test sample and store the parsed score next to
# the sample's original fields.
# NOTE(review): `tokenizer` and `model` are not defined in any visible cell —
# presumably loaded elsewhere; confirm before a Restart & Run All.
results = []
for idx, sample in enumerate(dataset_test):
    # print(idx, sample['score'], sample['text'])
    text = sample['text']
    score = sample['score']
    # Tokenize a single-prompt batch and move the tensors to the GPU.
    inputs = tokenizer([
        text
    ], return_tensors='pt').to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    outputs = tokenizer.batch_decode(outputs)
    # extrat_score returns one entry per decoded string (None when no score is found).
    predictions = extrat_score(outputs)
    # Copy the sample so the prediction is added without touching the dataset row.
    result = {k: v for k, v in sample.items()}
    result['prediction'] = predictions[0]
    results.append(result)

destination_dir = '/content/drive/MyDrive/Colab Notebooks/unsloth/Llama3'

import json 
# Create the output folder first: if it is missing, open() would raise and the
# predictions collected by the (expensive) generation loop would not be saved.
output_dir = os.path.join(destination_dir, 'auto-scoring')
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'results.json'), 'w') as f:
    json.dump(results, f)

# Exact-match accuracy of the parsed predictions against the gold scores.
# `prediction` is a one-character string (or None when nothing was parsed) while
# `score` is an int, so the prediction must be converted BEFORE comparing;
# comparing the raw str with the int is always False and yields a rate of 0.
correct_count = 0
for r in results:
    if r['prediction'] is not None and int(r['prediction']) == r['score']:
        correct_count += 1
print(f"rate: {correct_count/len(results)}")

# date time suffix
from datetime import datetime 
suffix = datetime.now().strftime("%Y%m%d-%H%M%S")
# write rate into file
# The file must be opened with mode 'w': the default mode is read-only, which
# fails on this new timestamped path and could not be written to anyway.
with open(os.path.join(destination_dir, 'auto-scoring', f'experiment_{suffix}.txt'), 'w') as f:
    f.write(f"rate: {correct_count/len(results)}")
    f.write('\n')


0 3 Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are an English teacher. Given the following essay, you give a score.

### Input:
Based on the many differences of positive and negative comments towards the use of driverless cars, I wish to pick neither side in the disscusion.

I believe there are many reasons that driverless cars should take over the roads, but a handfull of reasons that driverless cars on our roads today would lead to dissaster.

To start off, there was a point made in paragraph one, where the author tells us that new smart cars or driverless cars would use half of the gas as todays taxis.

With this, there could be many things that move in a positive direction.

For example, if cars are running on half of the ammount of fuel that todays taxis use, then there is no doubt about the air polution in our cities and other countries being l

In [67]:
# Peek at the first two test samples: the rendered prompt, then its gold score.
count = 0
for count, sample in enumerate(dataset_test, start=1):
    text, score = sample['text'], sample['score']
    print(text)
    print(score)
    if count > 1:
        break



Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are an English teacher. Given the following essay, you give a score.

### Input:
Based on the many differences of positive and negative comments towards the use of driverless cars, I wish to pick neither side in the disscusion.

I believe there are many reasons that driverless cars should take over the roads, but a handfull of reasons that driverless cars on our roads today would lead to dissaster.

To start off, there was a point made in paragraph one, where the author tells us that new smart cars or driverless cars would use half of the gas as todays taxis.

With this, there could be many things that move in a positive direction.

For example, if cars are running on half of the ammount of fuel that todays taxis use, then there is no doubt about the air polution in our cities and other countries being limmi

In [68]:
from collections import Counter 
# Frequency of each gold score in the test split (shows the class imbalance).
score_freq = Counter(dataset_test['score'])
score_freq

Counter({3: 1295, 2: 948, 4: 770, 1: 240, 5: 183, 6: 26})

In [74]:
import re
# Anchored at both ends: the whole string must consist of ASCII letters and digits.
pattern = re.compile(r'^[0-9a-zA-Z]+$')
# 'abc;' contains ';', so match() returns None and the cell displays nothing.
pattern.match('abc;')

In [76]:
# Show, character by character, how str.isalpha() and str.isdigit() classify the input.
s = 'abc123'
for c in s:
    flags = (c.isalpha(), c.isdigit())
    print(flags)

(True, False)
(True, False)
(True, False)
(False, True)
(False, True)
(False, True)


In [79]:
import ollama

In [80]:
ollama.list()

{'models': [{'name': 'llama2:latest',
   'model': 'llama2:latest',
   'modified_at': '2024-02-19T20:11:26.161604+11:00',
   'size': 3826793677,
   'digest': '78e26419b4469263f75331927a00a0284ef6544c1975b826b15abdaef17bb962',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '7B',
    'quantization_level': 'Q4_0'}}]}

In [2]:
from ollama import Client
# Client bound to an Ollama server listening on localhost:11434.
client = Client(host='http://localhost:11434')
# Single-turn chat request to the 'llama3' model; the reply is displayed in the next cell.
# NOTE: this rebinds `response`, which earlier held a sample generation string.
response = client.chat(model='llama3', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])

In [3]:
response

{'model': 'llama3',
 'created_at': '2024-06-02T04:32:18.4086578Z',
 'message': {'role': 'assistant',
  'content': "What a great question!\n\nThe sky appears blue to our eyes because of a phenomenon called scattering. Scattering occurs when sunlight interacts with tiny particles in the atmosphere, such as gases like nitrogen and oxygen, as well as aerosols like dust, water vapor, and pollutants.\n\nHere's what happens:\n\n1. When sunlight enters Earth's atmosphere, it encounters these tiny particles.\n2. The shorter (blue) wavelengths of light are scattered more than the longer (red) wavelengths because they have a stronger interaction with the particles. This is known as Rayleigh scattering, named after the British physicist Lord Rayleigh who first described the phenomenon in the late 19th century.\n3. As a result of this scattering, the blue light is dispersed throughout the atmosphere, reaching our eyes from all directions.\n4. Our brains perceive this scattered blue light as the col

In [1]:
import requests
import json

# Define the URL and payload
url = "http://localhost:11434/api/chat"
payload = {
    "model": "llama3",
    "messages": [
        {
            "role": "user",
            "content": "why is the sky blue?"
        }
    ],
    # Ask for one complete JSON reply instead of a stream of chunks.
    "stream": False
}

# Set the headers
headers = {
    "Content-Type": "application/json"
}

# Send the POST request
# `json=` lets requests serialise the payload itself. The timeout matters
# because requests waits forever by default, so a stalled server would hang
# the kernel; it is generous since a non-streamed generation can take a while.
response = requests.post(url, headers=headers, json=payload, timeout=300)

# Print the response
if response.status_code == 200:
    print("Response:", response.json())
else:
    print("Failed to get a response. Status code:", response.status_code)
    print("Response body:", response.text)

Response: {'model': 'llama3', 'created_at': '2024-06-02T04:29:54.9568812Z', 'message': {'role': 'assistant', 'content': "What a great question!\n\nThe short answer: The sky appears blue because of a phenomenon called scattering, which is how sunlight interacts with tiny molecules of gases in our atmosphere.\n\nHere's a more detailed explanation:\n\n1. **Sunlight**: When the sun emits light, it includes all colors of the visible spectrum (red, orange, yellow, green, blue, indigo, and violet).\n2. **Atmosphere**: As sunlight travels through space to reach Earth, it encounters tiny molecules of gases like nitrogen (N2) and oxygen (O2). These molecules are much smaller than the wavelength of light.\n3. **Scattering**: When sunlight hits these gas molecules, they scatter the shorter (blue) wavelengths more efficiently than the longer (red) wavelengths. This is because the small molecule size allows them to deflect the shorter wavelengths more effectively.\n4. **Blue dominance**: As a result