# Pre-trained model: Microsoft/phi-2

### Loading & cleaning

In [1]:

# Imports
import os
import re
import pandas as pd
from functions import *

# Load data
data = '../data/ready_for_phi-2/'

print(os.path.exists(data))
print(os.listdir(data))

True
['eval', 'test_01.csv', 'test_02.csv', 'train_01.csv', 'train_02.csv']


In [2]:
# Load
train10k = pd.read_csv(data + 'train_01.csv')
train5k = pd.read_csv(data + 'train_02.csv')

test2k = pd.read_csv(data + 'test_01.csv')
test1k = pd.read_csv(data + 'test_02.csv')

train10k.shape, train5k.shape, test2k.shape, test1k.shape

((9999, 7), (4998, 7), (1998, 7), (999, 7))

In [3]:
# Selecting only the columns we need for fine-tuning

# 15k training samples
train10k = train10k[['label', 'prompt', 'response']]
train5k = train5k[['label', 'prompt', 'response']]

# 3k test samples
test2k = test2k[['label', 'prompt', 'response']]
test1k = test1k[['label', 'prompt', 'response']]

# Rename columns
train10k.columns = ['label', 'input', 'mistral_output']
train5k.columns = ['label', 'input', 'mistral_output']
test2k.columns = ['label', 'input', 'mistral_output']
test1k.columns = ['label', 'input', 'mistral_output']

# Check
train10k.shape, train5k.shape, test2k.shape, test1k.shape

((9999, 3), (4998, 3), (1998, 3), (999, 3))

In [4]:
dfs = [train10k, train5k, test2k, test1k]

In [5]:
for df in dfs:
  print(df.duplicated().sum())

0
0
0
0


In [6]:
# remove INST tags
for df in dfs:
  df['input'] = df['input'].str.replace('[INST]', '')
  df['input'] = df['input'].str.replace('[/INST]', '')
  df['mistral_output'] = df['mistral_output'].str.replace('[INST]', '')
  df['mistral_output'] = df['mistral_output'].str.replace('[/INST]', '')

In [7]:
train10k.head()

Unnamed: 0,label,input,mistral_output
0,2,A customer left us a 2-star review: 'the cash...,A customer left us a 2-star review: 'the cashi...
1,2,A customer left us a 2-star review: 'here's w...,A customer left us a 2-star review: 'here's wh...
2,2,A customer left us a 2-star review: 'went to ...,A customer left us a 2-star review: 'went to t...
3,2,A customer left us a 2-star review: 'went to ...,A customer left us a 2-star review: 'went to c...
4,2,A customer left us a 2-star review: 'a whole ...,A customer left us a 2-star review: 'a whole l...


In [8]:
for df in dfs:
  df['input'] = df['input'].str.strip()
  df['mistral_output'] = df['mistral_output'].str.strip()

In [9]:
def clean_output(input_text, output_text):
    # Normalize whitespace
    input_norm = re.sub(r'\s+', ' ', input_text.strip())
    output_norm = re.sub(r'\s+', ' ', output_text.strip())

    # Escape special regex characters in input text
    pattern = re.escape(input_norm)

    # Remove input from output using regex (case-insensitive)
    cleaned_output = re.sub(pattern, '', output_norm, flags=re.IGNORECASE).strip()

    # Final cleanup: remove extra spaces
    return re.sub(r'\s+', ' ', cleaned_output).strip()

In [10]:
# Apply this function
for df in dfs:
    df['mistral_output'] = df.apply(lambda row: clean_output(row['input'], row['mistral_output']), axis=1)

In [11]:
train10k.head()

Unnamed: 0,label,input,mistral_output
0,2,A customer left us a 2-star review: 'the cashi...,To improve the customer's experience and addre...
1,2,A customer left us a 2-star review: 'here's wh...,"To improve the customer's experience, the foll..."
2,2,A customer left us a 2-star review: 'went to t...,"To improve the customer's experience, we can f..."
3,2,A customer left us a 2-star review: 'went to c...,"To improve the customer's experience, we could..."
4,2,A customer left us a 2-star review: 'a whole l...,"To improve the customer's experience, we can a..."


### Initialize the `phi-2` in 8-bit

In [12]:
import torch
import gc
import pickle
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb

gc.collect()
torch.cuda.empty_cache()

In [13]:

# Define 8-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

# Load model and tokenizer
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'


model = AutoModelForCausalLM.from_pretrained(model_name,
                                             config=bnb_config,
                                             device_map="auto",
                                             offload_folder="offload_folder",
                                             offload_buffers=True)

# Get model device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"==========\n! Model loaded on: {device}\n")
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


! Model loaded on: cuda



PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((256

##### Testing Inference

In [14]:
# Prepare input text
text_test = train10k['input'][11]
inputs = tokenizer(text_test, return_tensors="pt").to(device)  # Move input tensors to the model's device

# Output
with torch.no_grad():
  output = model.generate(**inputs,
                          max_new_tokens=200,
                          pad_token_id=tokenizer.eos_token_id)
  decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
  
print(decoded_output)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


A customer left us a 2-star review: 'totally dont uunderstand this place........at all....! n nare you a mexican restaurant....are you a pub......what are you? the food is ok the wine is questionable for the price ther service is ok. n nits just a bit pointless great location going to waste in my opinion.' The customer feels disapproval, annoyance, confusion. Concisely, how can we best improve our services for this customer's experience?

<|Question|>
1. What are the customer's feelings and thoughts about the restaurant?
2. What could be the possible reasons for the customer's dissatisfaction?
3. How can the restaurant improve its services based on the customer's feedback?

<|Answer|>
1. The customer is dissatisfied and confused. They feel that the restaurant is not what they expected, and they are unsure about its identity. They are also annoyed by the perceived lack of understanding from the staff.
2. The customer might be dissatisfied due to the restaurant's unclear identity, the qu

In [15]:
# Call model once
def generate_1response(input_col):
    # Create prompt
    prompt = input_col
    
    # Tokenize the input prompt and move to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate output using the model
    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_new_tokens=200,
                                 pad_token_id=tokenizer.eos_token_id
                                 )

    # Decode the output and remove the prompt part from the response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Clean response
    response = clean_response(generated_text)
    
    return response

In [16]:
print(generate_1response(train10k['input'][11]))

A customer left us a 2-star review: 'totally dont uunderstand this place........at all....! n nare you a mexican restaurant....are you a pub......what are you? the food is ok the wine is questionable for the price ther service is ok. n nits just a bit pointless great location going to waste in my opinion.' The customer feels disapproval, annoyance, confusion. Concisely, how can we best improve our services for this customer's experience?

<|Question|>
1. What are the customer's feelings and thoughts about the restaurant?
2. What could be the possible reasons for the customer's dissatisfaction?
3. How can the restaurant improve its services based on the customer's feedback?

<|Answer|>
1. The customer is dissatisfied and confused. They feel that the restaurant is not what they expected, and they are unsure about its identity. They are also annoyed by the perceived lack of understanding from the staff.
2. The customer might be dissatisfied due to the restaurant's unclear identity, the qu

#### Batch Inference

In [17]:
# Define function to generate responses
def generate_response(prompts, batch_size=8, checkpoint_path='inference_checkpoint.pkl'):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    all_responses = [""] * len(prompts)  # Pre-allocate list
    start_batch = 0
    prompts = list(prompts)

    # Load checkpoint if it exists
    if os.path.exists(checkpoint_path):
        try:
            with open(checkpoint_path, 'rb') as f:
                checkpoint = pickle.load(f)
                all_responses = checkpoint['responses']
                start_batch = checkpoint['next_batch']
                print(f"Resuming from checkpoint: {start_batch * batch_size}/{len(prompts)} items processed")
        except Exception as e:
            print(f"Error loading checkpoint: {e}. Starting fresh...")
            all_responses = [""] * len(prompts)
            start_batch = 0

    try:
        for i in tqdm(range(start_batch * batch_size, len(prompts), batch_size), desc="Processing batches"):
            batch_prompts = prompts[i:i+batch_size]
            batch_idx = i // batch_size  # Track batch number

            try:
                inputs = tokenizer(batch_prompts, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=512, 
                                   return_tensors="pt").to(device)

                with torch.no_grad():
                    outputs = model.generate(**inputs, 
                                             max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)

                batch_responses = []
                for j, output in enumerate(outputs):
                    try:
                        generated_text = tokenizer.decode(output, skip_special_tokens=True)
                        response = clean_response(generated_text)  # Clean response
                    except Exception as e:
                        response = "ERROR_GENERATING_RESPONSE"
                        print(f"Error processing response {i+j}: {e}")

                    batch_responses.append(response)

                # Assign responses to correct indices
                all_responses[i:i+batch_size] = batch_responses

                # Save progress to checkpoint (AFTER a batch is completed)
                checkpoint = {'responses': all_responses, 'next_batch': batch_idx + 1}
                with open(checkpoint_path, 'wb') as f:
                    pickle.dump(checkpoint, f)

                # Free GPU memory
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    gc.collect()

            except Exception as e:
                print(f"Error in batch {batch_idx}: {e}")
                
                # Assign error responses only to the failed batch
                all_responses[i:i+batch_size] = ["ERROR_GENERATING_RESPONSE"] * len(batch_prompts)

                # Save progress to checkpoint even if batch fails
                checkpoint = {'responses': all_responses, 'next_batch': batch_idx + 1}
                with open(checkpoint_path, 'wb') as f:
                    pickle.dump(checkpoint, f)

    except KeyboardInterrupt:
        print("\nProcess interrupted. Saving progress...")
        with open(checkpoint_path, 'wb') as f:
            pickle.dump({'responses': all_responses, 'next_batch': i // batch_size + 1}, f)

    # Fill any missing responses
    if "" in all_responses:
        all_responses = ["ERROR_GENERATING_RESPONSE" if r == "" else r for r in all_responses]

    # Remove checkpoint if processing completes successfully
    if len(all_responses) == len(prompts) and os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
        print("Processing completed, checkpoint removed.")

    return all_responses

### Model output before fine-tuning

In [18]:
# Stratified sampling on train10k, I need 1k samples to have some sort of Benchmark for the non-fine-tuned phi-2 model. I'll keep them in the train10k dataframe.
from sklearn.model_selection import train_test_split

_, train10k_sample = train_test_split(train10k, test_size=0.1, 
                                      random_state=42, 
                                      stratify=train10k['label'])
train10k_sample.shape, train10k_sample['label'].value_counts()

((1000, 3),
 label
 3    334
 1    333
 2    333
 Name: count, dtype: int64)

In [21]:
# Freeing up some memory
gc.collect()
torch.cuda.empty_cache()

In [22]:
train10k_sample['phi-2_output'] = generate_response(train10k_sample['input'],
                                                    batch_size=5)

train10k_sample.head()

Processing batches:   0%|          | 0/200 [00:00<?, ?it/s]

Processing completed, checkpoint removed.


Unnamed: 0,label,input,mistral_output,phi-2_output
5382,1,A customer left us a 1-star review: 'i had the...,"To improve the customer's experience, we can t...",A customer left us a 1-star review: 'i had the...
9236,3,A customer left us a 3-star review: 'i've been...,"To improve the customer's experience, consider...",A customer left us a 3-star review: 'i've been...
6735,3,A customer left us a 3-star review: 'a problem...,"To improve the customer's experience, consider...",A customer left us a 3-star review: 'a problem...
6123,1,A customer left us a 1-star review: 'i'd never...,"To improve the customer's experience, we can f...",A customer left us a 1-star review: 'i'd never...
1627,2,A customer left us a 2-star review: 'oh rue 21...,"To improve the customer's experience at Rue21,...",A customer left us a 2-star review: 'oh rue 21...


In [23]:
# Save sample
eval_path = '../data/ready_for_phi-2/eval'
if not os.path.exists(eval_path):
    os.makedirs(eval_path)

In [24]:
train10k_sample.to_csv(eval_path + '/train10k_sample.csv', index=False)

if os.path.exists(eval_path + '/train10k_sample.csv'):
    print("Sample saved successfully.")

Sample saved successfully.


### Eval