# Representation

In [1]:
# Imports
import os
import pandas as pd

# Load data
save = "../data/for_step_2/"

print(os.path.exists(save))
print(os.listdir(save))

True
['test_samples_1.csv', 'test_samples_2.csv', 'train_samples_1.csv', 'train_samples_2.csv']


#### Loading our prepped Yelp dataset.

> We're now trying to get an output from *mistral 7B* that we can fine-tune *phi-2* with.
> - Read more about how this came about in the decision making process in the [project journal](../Project_Journal.md).

In [2]:
# Load train and test data
train10k = pd.read_csv(save + 'train_samples_1.csv')
train5k = pd.read_csv(save + 'train_samples_2.csv')
test2k = pd.read_csv(save + 'test_samples_1.csv')
test1k = pd.read_csv(save + 'test_samples_2.csv')

print('Train:', train10k.shape, train5k.shape)
print('Test:', test2k.shape, test1k.shape)

Train: (9999, 5) (4998, 5)
Test: (1998, 5) (999, 5)


#### Setting up `Mistral-7B-Instruct-v0.3` for inference

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb

torch.cuda.empty_cache()

In [4]:
# Define 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load in 4-bit to reduce VRAM usage
    bnb_4bit_compute_dtype=torch.float16,  # Use float16 for computations
    bnb_4bit_use_double_quant=True,  # Double quantization for further memory reduction
    bnb_4bit_quant_type="nf4",  # Use NF4 quantization (optimized for LLMs)
)

# Load model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Get the model's device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()  # Set to evaluation mode

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNo

In [5]:
# Print the current GPU being used
current_device = torch.cuda.current_device()
print(f"Using GPU: {current_device} - {torch.cuda.get_device_name(current_device)}")

print(f"PyTorch version: {torch.__version__}")
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Check your installation.")

# Print GPU memory usage
print(f"GPU memory allocated: {torch.cuda.memory_allocated()/1e6:.2f} MB")
print(f"GPU memory cached: {torch.cuda.memory_reserved()/1e6:.2f} MB")

Using GPU: 0 - NVIDIA GeForce RTX 3060
PyTorch version: 2.5.1+cu121
CUDA Available: True
CUDA device count: 1
CUDA device name: NVIDIA GeForce RTX 3060
GPU memory allocated: 4138.59 MB
GPU memory cached: 4160.75 MB


#### Testing Inference

In [47]:
# Prepare input text
text_test = "[INST] Explain why customer feedback is important for improving business processes. [/INST]"
inputs = tokenizer(text_test, return_tensors="pt").to(device)  # Move input tensors to the model's device

# Generate output
with torch.no_grad():
    outputs = model.generate(**inputs, 
                             max_new_tokens=150,
                             pad_token_id=tokenizer.eos_token_id)

# Decode and print the response
print(tokenizer.decode(outputs[0], 
                       skip_special_tokens=True))

Explain why customer feedback is important for improving business processes.  Customer feedback is crucial for improving business processes for several reasons:

1. Identifying Strengths and Weaknesses: Feedback provides insights into what the business is doing well and where it needs improvement. This information can help businesses focus on their strengths and address areas that need improvement.

2. Understanding Customer Needs: Feedback helps businesses understand their customers' needs, expectations, and preferences. This understanding can guide product development, service improvements, and marketing strategies.

3. Enhancing Customer Satisfaction: By addressing issues raised in feedback, businesses can improve their products or services, leading to increased customer satisfaction. Satisfied customers are more likely to remain loyal and recommend the business to others


In [48]:
# Prompt
def create_prompt(text, emotions, label):
    text =  f"A customer left us a {label}-star review: '{text}' The customer feels {emotions}. Concisely, how can we best improve our services for this customer's experience?"
    return f"[INST] {text} [/INST]"

In [49]:
# Test inference
sample_text = train10k['clean_text'][11]
sample_emotions = train10k['emotions'][11]
sample_stars = train10k['label'][11]

prompt = create_prompt(sample_text, sample_emotions, sample_stars)
print(prompt)

[INST] A customer left us a 2-star review: 'totally dont uunderstand this place........at all....! n nare you a mexican restaurant....are you a pub......what are you? the food is ok the wine is questionable for the price ther service is ok. n nits just a bit pointless great location going to waste in my opinion.' The customer feels disapproval, annoyance, confusion. Concisely, how can we best improve our services for this customer's experience? [/INST]


In [None]:
# Tokenize the input prompt and move to the model's device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate output using the model
with torch.no_grad():
    outputs = model.generate(**inputs,
                             max_new_tokens=180,
                             pad_token_id=tokenizer.eos_token_id)

# Decode the output and remove the prompt part from the response
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

response = generated_text

def clean_response(text):
    import re
    
    # Find last period
    last_period = text.rfind('.')
    if last_period == -1:
        return text
    
    # Get text up to last period
    cleaned = text[:last_period+1]
    
    # Check for numbered list pattern at the end (like "5." with nothing after it)
    pattern = r'\n\d+\.\s*$'
    cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

response = clean_response(response)
print("\nGenerated Response (without prompt):")
print(response)


Generated Response (without prompt):
To improve the customer's experience, we can take the following steps:

1. Clarify our establishment's identity: Ensure that our online presence clearly communicates our restaurant's concept, cuisine, and atmosphere to avoid confusion.

2. Enhance menu offerings: Review our menu to ensure a consistent quality of food and competitive pricing, especially for wine.

3. Improve service: Train our staff to provide friendly, attentive, and efficient service that meets customer expectations.

4. Address cleanliness: Ensure the restaurant is clean and free of any distractions, such as nits, to create a more pleasant dining experience.

5. Engage with the customer: Respond to the review, acknowledging their feedback and expressing our commitment to improving their experience in the future.


### Functions

In [6]:
# Create prompt # Redefining here for clarity, but same as above.
def create_prompt(text, emotions, label):
    text =  f"A customer left us a {label}-star review: '{text}' The customer feels {emotions}. Concisely, how can we best improve our services for this customer's experience?"
    return f"[INST] {text} [/INST]"

In [2]:
# Apply to all samples
train10k['prompt'] = train10k.apply(lambda x: create_prompt(x['clean_text'], x['emotions'], x['label']), axis=1)
train5k['prompt'] = train5k.apply(lambda x: create_prompt(x['clean_text'], x['emotions'], x['label']), axis=1)
test2k['prompt'] = test2k.apply(lambda x: create_prompt(x['clean_text'], x['emotions'], x['label']), axis=1)
test1k['prompt'] = test1k.apply(lambda x: create_prompt(x['clean_text'], x['emotions'], x['label']), axis=1)

# Print shapes
print('Train:', train10k.shape, train5k.shape)
print('Test:', test2k.shape, test1k.shape)

NameError: name 'train10k' is not defined

In [9]:
def clean_response(text):
    import re
    
    # Remove any occurrences of '</s>' (special token for end of sequence)
    text = text.replace("</s>", "").strip()
    
    # Find last period
    last_period = text.rfind('.')
    if last_period == -1:
        return text
    
    # Get text up to last period
    cleaned = text[:last_period + 1]
    
    # Check for numbered list pattern at the end (like "5." with nothing after it)
    pattern = r'\n\d+\.\s*$'
    cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

In [None]:
# Call model once
def generate_1response(input_col):
    # Create prompt
    prompt = input_col
    
    # Tokenize the input prompt and move to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate output using the model
    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_new_tokens=150,
                                 pad_token_id=tokenizer.eos_token_id
                                 )

    # Decode the output and remove the prompt part from the response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Clean response
    response = clean_response(generated_text)
    
    return response

In [11]:
# Test the function
print(generate_1response(train10k['prompt'][11]))

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


To improve the customer's experience, we can take the following steps:

1. Clarify our establishment's identity: Ensure that our online presence clearly communicates our restaurant's concept, cuisine, and atmosphere to avoid confusion.

2. Enhance menu offerings: Review our menu to ensure a consistent quality of food and competitive pricing, especially for wine.

3. Improve service: Train our staff to provide friendly, attentive, and efficient service that meets customer expectations.

4. Address cleanliness: Ensure the restaurant is clean and free of any distractions, such as nits, to create a more pleasant dining experience.


Batch

In [12]:
import gc
import pickle
from tqdm.auto import tqdm

In [None]:
# Define function to generate responses
def generate_response(prompts, batch_size=8, checkpoint_path='inference_checkpoint.pkl'):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    all_responses = [""] * len(prompts)  # Pre-allocate list
    start_batch = 0
    prompts = list(prompts)

    if os.path.exists(checkpoint_path):
        try:
            with open(checkpoint_path, 'rb') as f:
                checkpoint = pickle.load(f)
                all_responses = checkpoint['responses']
                start_batch = checkpoint['next_batch']
                print(f"Resuming from checkpoint: {start_batch * batch_size}/{len(prompts)} items processed")
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
            all_responses = [""] * len(prompts)
            start_batch = 0

    try:
        for i in tqdm(range(start_batch * batch_size, len(prompts), batch_size), desc="Processing batches"):
            batch_prompts = prompts[i:i+batch_size]
            batch_idx = i // batch_size

            try:
                inputs = tokenizer(batch_prompts, padding=True, truncation=True, return_tensors="pt").to(device)

                with torch.no_grad():
                    outputs = model.generate(**inputs, max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)

                batch_responses = []
                for j, output in enumerate(outputs):
                    generated_text = tokenizer.decode(output, skip_special_tokens=True)

                    # Clean the response using the cleaning function
                    response = clean_response(response)

                    # Append cleaned response
                    batch_responses.append(response)
                
                all_responses[i:i+batch_size] = batch_responses  # Assign responses in correct indices

                # Save progress to checkpoint
                checkpoint = {'responses': all_responses, 'next_batch': batch_idx + 1}
                with open(checkpoint_path, 'wb') as f:
                    pickle.dump(checkpoint, f)

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    gc.collect()

            except Exception as e:
                print(f"Error in batch {batch_idx}: {e}")
                all_responses[i:i+batch_size] = ["ERROR_GENERATING_RESPONSE"] * len(batch_prompts)

                checkpoint = {'responses': all_responses, 'next_batch': batch_idx + 1}
                with open(checkpoint_path, 'wb') as f:
                    pickle.dump(checkpoint, f)

    except KeyboardInterrupt:
        print("\nProcess interrupted. Saving progress...")
        with open(checkpoint_path, 'wb') as f:
            pickle.dump({'responses': all_responses, 'next_batch': i // batch_size + 1}, f)

    # If any responses are still empty, fill them with error messages
    if "" in all_responses:
        all_responses = ["ERROR_GENERATING_RESPONSE" if r == "" else r for r in all_responses]  # Fill missing responses

    # Remove checkpoint file after processing completes
    if len(all_responses) == len(prompts) and os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
        print("Processing completed, checkpoint removed")

    return all_responses

## Full inference

In [14]:
# Refresher
print('Train:', train10k.shape, train5k.shape)
print('Test:', test2k.shape, test1k.shape)

Train: (9999, 6) (4998, 6)
Test: (1998, 6) (999, 6)


In [18]:
# Save the chunks
chunks = '../data/chunks/'
if not os.path.exists(chunks):
    os.makedirs(chunks)

### Part 1 <span style="font-size:12px;">Close this for less scrolling...</span>

In [None]:
# Split train10k into 10 chunks of ~1000 records each
train1 = train10k[:1000].copy()
train2 = train10k[1000:2000].copy()
train3 = train10k[2000:3000].copy()
train4 = train10k[3000:4000].copy()
train5 = train10k[4000:5000].copy()
train6 = train10k[5000:6000].copy()
train7 = train10k[6000:7000].copy()
train8 = train10k[7000:8000].copy()
train9 = train10k[8000:9000].copy()
train10 = train10k[9000:].copy()

# Split 2k test into 2 chunks of ~1000 records each
test1 = test2k[:1000].copy()
test2 = test2k[1000:].copy()

# Not the most efficient way, but it works...

##### Train 10k

In [None]:
train1['response'] = generate_response(train1['prompt'], batch_size=50)
train1.to_csv(chunks + 'train1.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing completed, checkpoint removed
e customer's experience and address their concerns, we can take the following steps:

1. Acknowledge the issue: Respond to the review by acknowledging the customer's frustration and apologizing for the poor experience they had with our cashier.

2. Investigate the incident: Speak with the cashier involved to understand the situation from their perspective and determine if there was a misunderstanding or if the cashier was indeed rude.

3. Provide training: Offer additional training to the cashier on customer service skills, focusing on communication, empathy, and problem-solving.


In [None]:
train2['response'] = generate_response(train2['prompt'], batch_size=50)
train2.to_csv(chunks + 'train2.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
e customer's experience, we can focus on the following areas:

1. Improve Thai Cuisine: The customer mentioned that the Thai food was not up to par. We could consider hiring a Thai chef or providing additional training for our current staff to improve the authenticity and quality of our Thai dishes.

2. Address Pricing Concerns: The customer felt that the prices were too high. We could review our pricing strategy and ensure that it is competitive with other establishments in the Bay Area.

3. Highlight German Wines: The customer appreciated the selection of German wines.


In [None]:
train3['response'] = generate_response(train3['prompt'], batch_size=50)
train3.to_csv(chunks + 'train3.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
e customer's experience, we can focus on the following areas:

1. Improve Drive-Thru Service: Train staff to be more friendly and welcoming, as the customer felt the service was a bit brusque.

2. Enhance Food Packaging: Investigate why the hot dogs are coming apart in the wrappers, causing mustard to spill. This could involve changing the type of wrapper or improving the way the food is wrapped.

3. Quality Control: Ensure that the hot dogs are properly prepared and not overly wet or juicy, which could contribute to the mustard spilling.


In [None]:
train4['response'] = generate_response(train4['prompt'], batch_size=50)
train4.to_csv(chunks + 'train4.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
e customer's experience, we can focus on enhancing the quality and consistency of our food, specifically the fries, burgers, and sauces. We can aim to make them more flavorful, crispy, and moist, respectively. Additionally, we can work on improving the presentation of our food to make it more appealing.

Regarding the customer's comment about the packaging, we can ensure that any religious or personal messages are not visible on the packaging. This will help maintain a neutral and welcoming environment for all customers.

Lastly, we can strive to provide exceptional customer service, addressing any concerns or feedback promptly and professionally.


In [None]:
train5['response'] = generate_response(train5['prompt'], batch_size=50)
train5.to_csv(chunks + 'train5.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
e customer's experience at your Panda Express location, consider the following steps:

1. Freshness: Ensure that food is prepared and served as fresh as possible. This may involve adjusting cooking times or procedures to maintain quality.

2. Service: Train staff to provide friendly, efficient, and attentive service. This includes greeting customers promptly, addressing any concerns or questions they may have, and maintaining a clean and organized dining area.

3. Cleanliness: Regularly clean and sanitize all areas of the restaurant, including kitchen equipment, dining tables, and restrooms. This will help to maintain a hygienic environment and improve the overall dining experience.


In [None]:
train6['response'] = generate_response(train6['prompt'], batch_size=50)
train6.to_csv(chunks + 'train6.csv', index=False)

NameError: name 'generate_response' is not defined

In [None]:
train7['response'] = generate_response(train7['prompt'], batch_size=50)
train7.to_csv(chunks + 'train7.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
6000    e customer's experience, we can take the follo...
6001    e customer's experience, we can focus on the f...
6002    liability: Ensure timely delivery of orders, e...
6003    e customer's experience, we can take the follo...
6004    e customer's experience, we can focus on the f...
Name: response, dtype: object


In [None]:
train8['response'] = generate_response(train8['prompt'], batch_size=50)
train8.to_csv(chunks + 'train8.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
7000    e customer's experience, we can focus on the f...
7001    e customer's experience, we can focus on the f...
7002    e customer's experience, we can focus on the f...
7003    e customer's experience, we could consider the...
7004    e customer's experience at the Tempe location,...
Name: response, dtype: object


In [None]:
train9['response'] = generate_response(train9['prompt'], batch_size=50)
train9.to_csv(chunks + 'train9.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
8000    e customer's experience, consider staffing mor...
8001    e customer's experience, we can focus on the f...
8002    e customer's experience, we can focus on the f...
8003    e customer's experience, we could consider the...
8004    e customer's experience, consider the followin...
Name: response, dtype: object


In [None]:
train10['response'] = generate_response(train10['prompt'], batch_size=50)
train10.to_csv(chunks + 'train10.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
9000    e customer's experience, we could consider exp...
9001    e customer's experience, it would be beneficia...
9002    e customer's experience, consider implementing...
9003    e customer's experience, we could focus on the...
9004    e customer's experience, we can focus on the f...
Name: response, dtype: object


#### Test 2k

In [None]:
test1['response'] = generate_response(test1['prompt'], batch_size=50)
test1.to_csv(chunks + 'test1.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
e customer's experience, we can focus on the following areas:

1. Phone etiquette: Train staff to answer calls professionally, with a friendly and courteous tone. Standard phrases such as "please," "thank you," and "you're welcome" should be used consistently.

2. Multilingual support: Ensure that staff can communicate effectively with customers who speak languages other than English. Providing clear instructions on how to switch languages or offering bilingual staff can help improve the customer experience.

3. Order accuracy and timeliness: Improve the efficiency of the order process to ensure that orders are prepared and ready for pickup as promised.


In [None]:
test2['response'] = generate_response(test2['prompt'], batch_size=50)
test2.to_csv(chunks + 'test2.csv', index=False)

e customer's experience, we can focus on the following areas:

1. Phone etiquette: Train staff to answer calls professionally, with a friendly and courteous tone. Standard phrases such as "please," "thank you," and "you're welcome" should be used consistently.

2. Multilingual support: Ensure that staff can communicate effectively with customers who speak languages other than English. Providing clear instructions on how to switch languages or offering bilingual staff can help improve the customer experience.

3. Order accuracy and timeliness: Improve the efficiency of the order process to ensure that orders are prepared and ready for pickup as promised.


### Part 2 <span style="font-size:12px;">Close this for less scrolling...</span>

In [None]:
# Split train5k into 5 chunks of ~1000 records each
train11 = train5k[:1000].copy()
train12 = train5k[1000:2000].copy()
train13 = train5k[2000:3000].copy()
train14 = train5k[3000:4000].copy()
train15 = train5k[4000:].copy()

#### Train5k

In [None]:
train11['response'] = generate_response(train11['prompt'], batch_size=50)
train11.to_csv(chunks + 'train11.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
0    e customer's experience and address their conc...
1    e customer's experience, the following steps c...
2    e customer's experience, we can focus on the f...
3    e customer's experience, we could focus on mod...
4    e customer's experience, we can address the fo...
Name: response, dtype: object


In [None]:
train12['response'] = generate_response(train12['prompt'], batch_size=50)
train12.to_csv(chunks + 'train12.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
1000    e customer's experience, we can focus on the f...
1001    e taste of the Pad Thai: Enhance the flavor by...
1002    e customer's experience, we can take the follo...
1003    e customer's experience, we can focus on the f...
1004    e customer's experience, we can take the follo...
Name: response, dtype: object


In [None]:
train13['response'] = generate_response(train13['prompt'], batch_size=50)
train13.to_csv(chunks + 'train13.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
2000    e customer's experience, consider implementing...
2001    e customer's experience and potentially win th...
2002    e customer's experience and address their conc...
2003    e customer's experience and address their conc...
2004    e customer's experience, consider the followin...
Name: response, dtype: object


In [None]:
train14['response'] = generate_response(train14['prompt'], batch_size=50)
train14.to_csv(chunks + 'train14.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
3000    e customer's experience, we can take the follo...
3001    e customer's experience, we can focus on the f...
3002    e customer's experience, we can focus on the f...
3003    e customer's experience, we can take the follo...
3004    e customer's experience at your establishment,...
Name: response, dtype: object


In [None]:
train15['response'] = generate_response(train15['prompt'], batch_size=50)
train15.to_csv(chunks + 'train15.csv', index=False)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]

Processing completed, checkpoint removed
4000    e customer's experience, consider the followin...
4001    e customer's experience and potentially increa...
4002    e customer's experience and potentially increa...
4003    e customer's experience, we can focus on the f...
4004    e customer's experience, consider the followin...
Name: response, dtype: object


#### Test 1k

In [None]:
test1k['response'] = generate_response(test1k['prompt'], batch_size=50)
test1k.to_csv(chunks + 'test1k.csv', index=False)

Resuming from checkpoint: 500/999 items processed


Processing batches:   0%|          | 0/10 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing completed, checkpoint removed
0    e customer's experience, we can focus on the f...
1    e customer's experience, the following steps c...
2    e customer's experience, consider implementing...
3    e customer's experience, we can take the follo...
4    e customer's experience, we can focus on the f...
Name: response, dtype: object


### Saving

In [None]:
# Load chunks
train1 = pd.read_csv(chunks + 'train1.csv')
train2 = pd.read_csv(chunks + 'train2.csv')
train3 = pd.read_csv(chunks + 'train3.csv')
train4 = pd.read_csv(chunks + 'train4.csv')
train5 = pd.read_csv(chunks + 'train5.csv')
train6 = pd.read_csv(chunks + 'train6.csv')
train7 = pd.read_csv(chunks + 'train7.csv')
train8 = pd.read_csv(chunks + 'train8.csv')
train9 = pd.read_csv(chunks + 'train9.csv')
train10 = pd.read_csv(chunks + 'train10.csv')

test1 = pd.read_csv(chunks + 'test1.csv')
test2 = pd.read_csv(chunks + 'test2.csv')

train11 = pd.read_csv(chunks + 'train11.csv')
train12 = pd.read_csv(chunks + 'train12.csv')
train13 = pd.read_csv(chunks + 'train13.csv')
train14 = pd.read_csv(chunks + 'train14.csv')
train15 = pd.read_csv(chunks + 'train15.csv')

test1k = pd.read_csv(chunks + 'test1k.csv')

# Print shapes
print('Train:', train1.shape, train2.shape, train3.shape, train4.shape, train5.shape, train6.shape, train7.shape, train8.shape, train9.shape, train10.shape, train11.shape, train12.shape, train13.shape, train14.shape, train15.shape)
print('Test:', test1.shape, test2.shape, test1k.shape)

Train: (1000, 7) (1000, 7) (1000, 7) (1000, 7) (1000, 7) (1000, 7) (1000, 7) (1000, 7) (1000, 7) (999, 7) (1000, 7) (1000, 7) (1000, 7) (1000, 7) (998, 7)
Test: (1000, 7) (1000, 7) (999, 7)


In [None]:
# Concatenate the chunks
train10k = pd.concat([train1, train2, train3, train4, train5, train6, train7, train8, train9, train10], ignore_index=True)

train5k = pd.concat([train11, train12, train13, train14, train15], ignore_index=True)

test2k = pd.concat([test1, test2], ignore_index=True)

In [22]:
# Create the directory if it doesn't exist
save_here = "../data/ready_for_phi-2/"
if not os.path.exists(save_here):
    os.makedirs(save_here)

In [None]:
train_dfs = [train10k, train5k]
test_dfs = [test2k, test1k]

# Save the splits
for i, df in enumerate(train_dfs):
    df.to_csv(save_here + f'train_samples_{i+1}.csv', index=False)
    if save_here + f'train_0{i+1}.csv':
        print(f"File train_0{i+1}.csv saved successfully!")

print("\n")
        
for i, df in enumerate(test_dfs):
    df.to_csv(save_here + f'test_0{i+1}.csv', index=False)
    if save_here + f'test_0{i+1}.csv':
        print(f"File test_0{i+1}.csv saved successfully!")

File train_samples_1.csv saved successfully!
File train_samples_2.csv saved successfully!



File test_samples_1.csv saved successfully!
File test_samples_2.csv saved successfully!
