# Library

In [10]:
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm
from PIL import ImageEnhance, Image
from pathlib import Path
import io
import random
import cv2

# Load dataset

In [3]:
# Fetch the dataset with `datasets.load_dataset`, then materialise its "train"
# split as a pandas DataFrame for the filtering and splitting steps below.
original_cars_df = load_dataset('khoadole/cars_8k_balance_dataset')
train_split = original_cars_df["train"]
cars_df = pd.DataFrame(train_split)

### Remove images in which no car is detected

In [5]:
# Load model
def load_model(model_name='yolov5x', conf=0.1):
	"""Load a pretrained YOLOv5 detector through ``torch.hub``.

	Args:
		model_name: Entry point requested from the ``ultralytics/yolov5`` hub
			repository. Defaults to ``'yolov5x'``.
		conf: Value assigned to the model's ``conf`` attribute (its detection
			confidence threshold). A lower value makes the detector report more
			objects, so fewer images end up flagged as "no car". Defaults to 0.1.

	Returns:
		The model object returned by ``torch.hub.load`` with ``conf`` applied.
	"""
	model = torch.hub.load('ultralytics/yolov5', model_name, pretrained=True, trust_repo='check')
	model.conf = conf
	return model
	
def get_no_cars_rows(df : pd.DataFrame, no_cars_path : str, no_cars_rows : list):
	"""Collect the index labels of rows whose image yields no 'car' detection.

	Every image is decoded from ``row['image']['bytes']``, resized to 640x480,
	has its brightness and then its contrast scaled by 0.5, and is passed to the
	detector returned by ``load_model()``.

	Args:
		df: Frame with an ``image`` column whose values expose a ``"bytes"`` entry
			holding the encoded image.
		no_cars_path: Directory meant for dumping the undetected images. It is
			currently unused (the dump is disabled) and only kept so existing
			callers keep working.
		no_cars_rows: List that is extended in place with the index label of each
			row for which the detector reports no ``'car'``.

	Returns:
		The same ``no_cars_rows`` list that was passed in.
	"""
	model = load_model()
	for index, row in tqdm(df.iterrows(), total=len(df)):
		# Normalise to RGB so palette, CMYK or greyscale inputs are enhanced and
		# detected the same way as ordinary colour images.
		image = Image.open(io.BytesIO(row['image']["bytes"])).convert("RGB")
		image = image.resize((640, 480), Image.LANCZOS)

		# Halve brightness first, then contrast, before running detection.
		image = ImageEnhance.Brightness(image).enhance(0.5)
		image = ImageEnhance.Contrast(image).enhance(0.5)

		# One DataFrame of detections per input image; a single image is passed.
		detections = model(image).pandas().xyxy[0]

		# Only rows without any 'car' detection are recorded; nothing is printed
		# for the (much more common) rows where a car is found.
		if 'car' not in detections['name'].values:
			no_cars_rows.append(index)

	return no_cars_rows

# Save row indices
def save_file(filename: str, data : list):
	"""Write ``data`` to ``filename``, one item per line.

	Items are converted with ``str`` and joined by newlines; no trailing newline
	is written, so an empty ``data`` produces an empty file.

	Args:
		filename: Destination path. Missing parent directories are created so a
			long computation is not lost to a missing output folder.
		data: Values to persist.
	"""
	target = Path(filename)
	target.parent.mkdir(parents=True, exist_ok=True)
	with open(target, "w") as file:
		file.write("\n".join(map(str, data)))

# Load row indices
def load_cols(filename: str):
	"""Read a file of integers, one per line, and return them as a list.

	Surrounding whitespace is stripped from every line. Blank lines (for example
	a trailing newline added by an editor) are skipped instead of raising.

	Args:
		filename: Path of the text file to read.

	Returns:
		List of ``int`` values in file order; empty if the file has no entries.

	Raises:
		ValueError: If a non-blank line is not a valid integer.
	"""
	with open(filename, "r") as file:
		return [int(text) for text in map(str.strip, file) if text]

# Load dataframe
# Cache of the index labels flagged as "no car". Building it runs the detector
# over every image, so it is only computed when the file is missing.
NO_CARS_ROWS_FILE = "../data/no_cars_rows.txt"

## Load the flagged row labels from file (if it exists)
if Path(NO_CARS_ROWS_FILE).exists():
	no_cars_rows = load_cols(NO_CARS_ROWS_FILE)
else:
	# Directory handed to get_no_cars_rows for its optional image dump.
	no_cars_dir = Path.cwd() / 'no_cars_detected'

	no_cars_rows = get_no_cars_rows(cars_df, no_cars_dir, [])

	save_file(NO_CARS_ROWS_FILE, no_cars_rows)

# Drop the flagged rows into a new frame. `cars_df` itself is left untouched, so
# this cell can be re-run without a KeyError for labels that are already gone.
cars_short = cars_df.drop(index=no_cars_rows)

# Split train, validation, test

In [9]:
# First hold out 40% of the rows, then cut that hold-out in half:
# 60% train / 20% validation / 20% test, each stratified on the "name" column.
train, temp = train_test_split(
    cars_short,
    test_size=0.4,
    stratify=cars_short["name"],
    random_state=14,
)
validation, test = train_test_split(
    temp,
    test_size=0.5,
    stratify=temp["name"],
    random_state=14,
)

# Build question-answer pairs

In [11]:
# Load questions
def _read_question_lines(path):
    """Return the lines of the text file at ``path`` without line terminators."""
    with open(path, 'r') as handle:
        return handle.read().splitlines()

# One pool of question phrasings per attribute.
brand_questions = _read_question_lines('../local_data/question/brand_questions.txt')
color_questions = _read_question_lines('../local_data/question/color_questions.txt')
type_questions = _read_question_lines('../local_data/question/type_questions.txt')

###
def pipeline_create_qa_dataset(df : pd.DataFrame, seed=None):
    """Expand every row of ``df`` into three question/answer records.

    For each row one question is drawn at random from each of the module-level
    ``brand_questions``, ``color_questions`` and ``type_questions`` lists and is
    paired with the row's ``brand``, ``Exterior color`` and ``name`` value
    respectively.

    Args:
        df: Frame providing the ``id``, ``image``, ``brand``, ``Exterior color``
            and ``name`` columns.
        seed: Optional seed. When given, questions are drawn from a private
            ``random.Random(seed)``, which makes the result reproducible and
            leaves the global ``random`` state untouched. When ``None`` (the
            default) the global ``random`` module is used.

    Returns:
        A list of dicts with the keys ``id``, ``image``, ``question`` and
        ``answer``: three consecutive entries (brand, color, type) per row.
    """
    rng = random if seed is None else random.Random(seed)

    # (question pool, answer column), in the order the records are emitted.
    qa_specs = (
        (brand_questions, 'brand'),
        (color_questions, 'Exterior color'),
        (type_questions, 'name'),
    )

    qa_dataset = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        for questions, answer_column in qa_specs:
            qa_dataset.append({
                "id": row['id'],
                "image": row['image'],
                "question": rng.choice(questions),
                "answer": row[answer_column],
            })
    return qa_dataset

###
# Seed the global RNG so the randomly picked question phrasings are identical on
# every run (same value as the `random_state` used for the split).
random.seed(14)

# NOTE: this rebinds `train`, `validation` and `test` from car-level frames to
# QA-level frames (columns: id, image, question, answer). Re-run the split cell
# before re-running this one; the QA frames no longer carry the columns that
# pipeline_create_qa_dataset reads.
train = pd.DataFrame(pipeline_create_qa_dataset(train))
validation = pd.DataFrame(pipeline_create_qa_dataset(validation))
test = pd.DataFrame(pipeline_create_qa_dataset(test))

  0%|          | 0/4526 [00:00<?, ?it/s]

  0%|          | 0/1509 [00:00<?, ?it/s]

  0%|          | 0/1509 [00:00<?, ?it/s]

# Make Dataset