In [None]:
# Mount Google Drive at /gdrive so the project files are reachable from this Colab session.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Switch the working directory to the project folder on the mounted drive, then list its contents.
# Later cells read 'dataframe_1.csv' by relative path, so they depend on this directory change.
%cd /gdrive/MyDrive/Colab Notebooks/CSE519/Project
!ls

/gdrive/MyDrive/Colab Notebooks/CSE519/Project
dataframe_1.csv		      preprocess_sample_data.ipynb  sample_data_new.gsheet
huggingface_tokenizers_cache  sample_data_analysis.csv	    sampled_data.csv
ny_plate_process2.ipynb       sample_data_new.csv


In [None]:
import json
import re
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from scipy.stats import multivariate_normal
from transformers import TextStreamer

In [None]:
# Load the full plate dataset from the working directory.
data = pd.read_csv('dataframe_1.csv')

# Keep only the rows that carry a 'status' label.
data_without_random = data[data['status'].notna()]

# Split the labelled rows by outcome.
accepted_data = data_without_random.loc[data_without_random['status'].eq('accepted')]
rejected_data = data_without_random.loc[data_without_random['status'].eq('rejected')]

# Draw 50 rows from each outcome; the fixed seed keeps the draw reproducible.
sample_accepted = accepted_data.sample(50, random_state=42)
sample_rejected = rejected_data.sample(50, random_state=42)

# Stack both draws, shuffle the rows (frac=1 returns all of them in random
# order), and renumber the index from 0.
sample = (
    pd.concat((sample_accepted, sample_rejected))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

sample


Unnamed: 0,date,plate,status,personalized,matches_pattern,correct_prediction
0,2013-04-05,I1I1II,rejected,Y,Y,True
1,2011-05-21,SN0MAN,rejected,Y,Y,True
2,2013-01-25,1911COLT,rejected,Y,Y,True
3,2011-04-28,CYBELLE,accepted,Y,Y,True
4,2013-04-13,TSGT,accepted,Y,Y,True
...,...,...,...,...,...,...
95,2014-03-01,FD9II,rejected,Y,Y,True
96,2011-12-01,L3G10N,rejected,Y,Y,True
97,2014-02-06,SCAM1,accepted,Y,Y,True
98,2014-03-22,DATAS5,rejected,Y,Y,True


In [None]:
# Size of the small sample drawn below.
sample_size = 2

# Rows with a known 'status' (same filter as the balanced-sample cell).
data_without_random = data[data['status'].notna()]

# Draw `sample_size` rows (seed 2), shuffle them (seed 42), and renumber from 0.
sample2 = (
    data_without_random
    .sample(n=sample_size, random_state=2)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
sample2

Unnamed: 0,date,plate,status,personalized,matches_pattern,correct_prediction
0,2012-09-20,DGB,accepted,Y,Y,True
1,2012-11-20,PEPE5012,accepted,Y,Y,True


In [None]:
%%capture
# Install the released package first; the reinstall below uses --no-deps, so this
# step presumably provides the dependencies -- TODO confirm.
!pip install unsloth
# Also get the latest nightly Unsloth!
# NOTE(review): neither install is version-pinned, so reruns may pull different code.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# Load the language model and its tokenizer through Unsloth.
# Produces the notebook-level `model` and `tokenizer` used by every later inference cell.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in this cell reads `fourbit_models`.
# NOTE(review): the `model_name` passed below is not one of these entries -- confirm
# that loading it with load_in_4bit=True is the intended configuration.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# The settings defined above are passed through unchanged.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

In [None]:
# Alpaca-style template with three positional slots, filled in this order:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of records into complete prompt strings.

    Args:
        examples: Mapping that provides parallel "instruction", "input" and
            "output" sequences.

    Returns:
        dict: {"text": [...]} holding one filled-in template per record, each
        terminated with EOS_TOKEN.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # Without the EOS suffix the generation would go on forever.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, answer) + EOS_TOKEN
            for instruction, context, answer in records
        ],
    }

In [None]:
# Instruction text that fills the first slot of `alpaca_prompt`: it sets the reviewer
# role, the criteria for an illegal plate, and the Plate / Valid / Explanation answer format.
# NOTE(review): the `mna_news_` prefix does not describe this license-plate content;
# consider renaming once all references can be updated together.
mna_news_instruction = """
You are a DMV clerical staff member responsible for reviewing custom license plate applications to determine their legality.

Follow these steps:
1. **Analyze** the potential meaning of the license plate.
2. **Evaluate** the meaning to decide if it is legal or illegal. A plate is considered **illegal** if the meaning includes or implies references to:
   - Sexual content
   - Violence
   - Racism or hate speech
   - Religion in a derogatory or inappropriate manner

Your response should adhere to the following format:
Plate: [License Plate Text]
Valid: [Legal or Illegal]
Explanation: [Detailed explanation of the plate’s meaning and why it is legal or illegal]

Note: Explanation should be within 50 words.
"""

# Example plate text used as the prompt input by the single-plate test cell.
mna_news_input = """LTL1968"""

In [None]:
# Single-plate smoke test: run one prompt through the model and stream the output.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            mna_news_instruction,
            mna_news_input,
            "",  # Empty response slot: the model writes the answer.
        )
    ],
    return_tensors="pt",
).to("cuda")

# TextStreamer comes from the notebook's import cell; it prints tokens as they are generated.
text_streamer = TextStreamer(tokenizer)

# Cap the completion at 500 tokens and stop at EOS, matching runModel. The previous
# 10000-token budget was far beyond the 2048-token max_seq_length the model was loaded with.
response = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=500,
    eos_token_id=tokenizer.eos_token_id,
)

# `response` is the tensor of token ids returned by generate; show only its shape
# instead of dumping every id into the cell output.
response.shape

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:

You are a DMV clerical staff member responsible for reviewing custom license plate applications to determine their legality.

Follow these steps:
1. **Analyze** the potential meaning of the license plate.
2. **Evaluate** the meaning to decide if it is legal or illegal. A plate is considered **illegal** if the meaning includes or implies references to:
   - Sexual content
   - Violence
   - Racism or hate speech
   - Religion in a derogatory or inappropriate manner

Your response should adhere to the following format:
Plate: [License Plate Text]  
Valid: [Legal or Illegal]  
Explaination: [Detailed explanation of the plate’s meaning and why it is legal or illegal]

Note: Explaination should be within 50 words.


### Input:
LTL1968

### Response:
Plate: LTL1968  
Valid: Illegal  
Explanation: LTL1

tensor([[128000,  39314,    374,    459,   7754,    430,  16964,    264,   3465,
             11,  35526,    449,    459,   1988,    430,   5825,   4726,   2317,
             13,   9842,    264,   2077,    430,  36001,  45695,    279,   1715,
            382,  14711,  30151,   1473,   2675,    527,    264,  20804,     53,
          57460,    950,   5687,   4562,   8647,    369,  34988,   2587,   5842,
          12235,   8522,    311,   8417,    872,  89846,    382,  12763,   1521,
           7504,    512,     16,     13,   3146,   2127,  56956,    334,    279,
           4754,   7438,    315,    279,   5842,  12235,    627,     17,     13,
           3146,  83445,    334,    279,   7438,    311,  10491,    422,    433,
            374,   5897,    477,  12079,     13,    362,  12235,    374,   6646,
           3146,  77098,    334,    422,    279,   7438,   5764,    477,  24897,
          15407,    311,    512,    256,    482,  39767,   2262,    198,    256,
            482,  47732,    

In [None]:
# Check the type of the object returned by model.generate in the single-plate test.
type(response)

torch.Tensor

In [None]:
def parse_response(response):
    """
    Parse the `### Response` section to extract Plate, Valid, and Explanation fields.

    The decoded model output echoes the whole prompt, and the prompt's own format
    template contains `Plate:` and `Explanation:` lines. To avoid returning that
    template text, only the text after the first `### Response:` marker (up to the
    next `###` heading, if any) is searched. When the marker is absent the whole
    string is searched, so bare answers are still parsed.

    Args:
    response (str): The full response text from the LLM, with or without the prompt.

    Returns:
    dict: A dictionary containing 'Plate', 'Valid', and 'Explanation'.
          Missing fields fall back to "N/A" ('Plate', 'Explanation') or "Unknown" ('Valid').
    """
    _, marker, tail = response.partition("### Response:")
    if marker:
        # Drop any further "### ..." block the model may have generated after its
        # answer, so the fields are never read from a repeated template.
        section = re.split(r"^###\s", tail, maxsplit=1, flags=re.MULTILINE)[0]
    else:
        section = response

    result = {}

    # Extract the plate
    plate_match = re.search(r"Plate:\s*(.+)", section)
    result["Plate"] = plate_match.group(1).strip() if plate_match else "N/A"

    # Extract the validity status
    valid_match = re.search(r"Valid:\s*(Legal|Illegal)", section, re.IGNORECASE)
    result["Valid"] = valid_match.group(1).strip().capitalize() if valid_match else "Unknown"

    # Extract the explanation; the optional "i" also accepts the "Explaination" spelling.
    explanation_match = re.search(r"Explai?nation:\s*(.+)", section, re.IGNORECASE)
    result["Explanation"] = explanation_match.group(1).strip() if explanation_match else "N/A"

    return result

In [None]:
# decoded_response = tokenizer.decode(response[0], skip_special_tokens=True)
# parsed_result = parse_response(decoded_response)
# print(type(parsed_result))

<class 'dict'>


In [None]:
# Batch runner: classify every plate in a DataFrame with the loaded model.
def runModel(sample, max_new_tokens=500):
    """Run the model on each plate and record its verdict and explanation.

    Relies on the notebook-level `model`, `tokenizer`, `alpaca_prompt`,
    `mna_news_instruction` and `parse_response`.

    Args:
        sample (pd.DataFrame): Frame with a 'plate' column. It is modified in
            place: 'Valid' and 'Explanation' columns are added or overwritten.
        max_new_tokens (int): Upper bound on generated tokens per plate.

    Returns:
        pd.DataFrame: The same `sample` object, with 'Valid' set to
        "Legal", "Illegal" or "Unknown" and 'Explanation' set to the parsed
        text or "N/A".
    """
    # Switching to inference mode does not depend on the plate, so do it once.
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    verdicts = []
    explanations = []
    for plate in sample['plate'].tolist():
        inputs = tokenizer(
            [
                alpaca_prompt.format(
                    mna_news_instruction,
                    plate,
                    "",  # Empty response slot: the model writes the answer.
                )
            ],
            return_tensors="pt",
        ).to("cuda")

        text_streamer = TextStreamer(tokenizer)
        output = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
        )

        # generate() returns prompt + completion. Decode only the completion so the
        # prompt's own "Explanation: [Detailed explanation ...]" placeholder can
        # never be mistaken for the model's answer.
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

        parsed = parse_response(generated_text)
        verdicts.append(parsed["Valid"])
        explanations.append(parsed["Explanation"])

    # Assign whole columns by position. Writing with .at[enumerate_position, ...]
    # only hit the right rows when the index happened to be 0..n-1.
    sample['Valid'] = verdicts
    sample['Explanation'] = explanations

    return sample



# Classify every plate in the 50/50 accepted/rejected sample; runModel writes the
# 'Valid' and 'Explanation' columns into the frame and returns it.
sample = runModel(sample)


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:

You are a DMV clerical staff member responsible for reviewing custom license plate applications to determine their legality.

Follow these steps:
1. **Analyze** the potential meaning of the license plate.
2. **Evaluate** the meaning to decide if it is legal or illegal. A plate is considered **illegal** if the meaning includes or implies references to:
   - Sexual content
   - Violence
   - Racism or hate speech
   - Religion in a derogatory or inappropriate manner

Your response should adhere to the following format:
Plate: [License Plate Text]
Valid: [Legal or Illegal]
Explaination: [Detailed explanation of the plate’s meaning and why it is legal or illegal]

Note: Explaination should be within 50 words.


### Input:
I1I1II

### Response:
Plate: I1I1II
Valid: Illegal
Explaination: This plate co

In [None]:
# Inspect the sample after the model's columns were added.
sample

Unnamed: 0,date,plate,status,personalized,matches_pattern,correct_prediction,Valid,Explaination
0,2013-04-05,I1I1II,rejected,Y,Y,True,Illegal,
1,2011-05-21,SN0MAN,rejected,Y,Y,True,Illegal,
2,2013-01-25,1911COLT,rejected,Y,Y,True,Illegal,"This license plate references the year 1911, w..."
3,2011-04-28,CYBELLE,accepted,Y,Y,True,Illegal,
4,2013-04-13,TSGT,accepted,Y,Y,True,Illegal,"TSGT is an abbreviation for the rank of ""Senio..."
...,...,...,...,...,...,...,...,...
95,2014-03-01,FD9II,rejected,Y,Y,True,Legal,
96,2011-12-01,L3G10N,rejected,Y,Y,True,Legal,"The plate's meaning is ""Legal"" and it is legal..."
97,2014-02-06,SCAM1,accepted,Y,Y,True,Illegal,SCAM1 refers to the act of scamming or cheatin...
98,2014-03-22,DATAS5,rejected,Y,Y,True,Legal,The plate is legal as it does not contain any ...


In [None]:
# Class balance of the ground-truth labels next to the model's predicted labels.
tuple(sample[column].value_counts() for column in ('status', 'Valid'))

(status
 rejected    50
 accepted    50
 Name: count, dtype: int64,
 Valid
 Illegal    63
 Legal      37
 Name: count, dtype: int64)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Encode both columns as binary labels, with "rejected"/"Illegal" as the positive class:
# 'status' -> 0 for 'accepted', 1 for 'rejected' (ground truth)
# 'Valid'  -> 0 for 'Legal',    1 for 'Illegal'  (predictions)
status_binary = sample['status'].map({'accepted': 0, 'rejected': 1})
valid_binary = sample['Valid'].map({'Legal': 0, 'Illegal': 1})

# Values outside the mappings (e.g. a prediction of 'Unknown') become NaN, which
# the sklearn metrics reject. Score only rows where both sides are defined and
# report how many rows were left out.
scorable = status_binary.notna() & valid_binary.notna()
n_excluded = int((~scorable).sum())
y_true = status_binary[scorable].astype(int)
y_pred = valid_binary[scorable].astype(int)

# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Confusion matrix (rows: true label, columns: predicted label). Passing `labels`
# keeps it 2x2 even if one class is absent.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Display the results
if n_excluded:
    print(f"Excluded {n_excluded} row(s) with an unmapped label or prediction")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")


Accuracy: 0.5100
Precision: 0.5079
Recall: 0.6400
F1-Score: 0.5664
Confusion Matrix:
[[19 31]
 [18 32]]


In [None]:
# Rows whose ground-truth status is 'accepted' but which the model labelled 'Illegal'.
filtered_row = sample.loc[
    sample['status'].eq('accepted') & sample['Valid'].eq('Illegal')
]
filtered_row


Unnamed: 0,date,plate,status,personalized,matches_pattern,correct_prediction,Valid,Explaination
2,2012-11-12,EVILEYEZ,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
5,2011-11-09,ACETIRE,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
6,2011-01-27,XGAMBITX,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
10,2010-12-23,SUNYAB,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
11,2014-06-14,BRBIII,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
...,...,...,...,...,...,...,...,...
91,2014-08-26,PACELEC4,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
92,2014-09-16,WFEGAN,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
94,2011-05-20,HMICUDA,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
95,2012-02-15,W2TAC,accepted,Y,Y,True,Illegal,[Detailed explanation of the plate’s meaning a...
