<h3>Tratamento do dataset de Gatos</h3>

In [2]:
import pandas as pd

import ast

# Load the raw cat listings and discard the columns this notebook does not use.
unused_columns = ['Unnamed: 0', 'id', 'url', 'med_photos', 'size', 'age']
dataset_cats = pd.read_csv('../data/cats.csv').drop(columns=unused_columns)

# "photos" is stored as the text of a Python literal: parse it, then keep only
# the value under "large" of the first element.
parsed_photos = dataset_cats["photos"].apply(ast.literal_eval)
dataset_cats["photos"] = parsed_photos.map(lambda fotos: fotos[0]["large"])

# The column now holds a single link per row, so give it a matching name.
dataset_cats = dataset_cats.rename(columns={'photos': 'link_photo'})

<b>Salvando dataset_cats</b>

In [3]:
# Save the cleaned frame; with to_csv's defaults the row index is written as an extra first column.
dataset_cats.to_csv('../data/cats_clean.csv')

<h3>União dos datasets "limpos" acima.</h3>

In [4]:
import csv

# Map the first ';'-separated field of each line (the CEP) to the full list of
# fields of that line. If the same first field appears twice, the last line wins.
with open('../data/ceps.txt', 'r', newline='') as file:
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends. Blank lines are parsed as [] and are skipped so
    # that linha[0] cannot raise IndexError.
    dicionario_ceps = {
        linha[0]: linha
        for linha in csv.reader(file, delimiter=';')
        if linha
    }

In [5]:
import numpy as np
# Seed NumPy's global random generator so the random draws made by get_cep()
# follow the same sequence on every fresh run.
np.random.seed(77)

# Pool of dictionary keys that get_cep() draws from.
chaves_dicionario = np.array(list(dicionario_ceps.keys()))

# Placeholder column with one NaN per row; it is mapped over afterwards to
# produce one random CEP per row. np.nan is the canonical spelling (uppercase
# aliases such as np.NaN were removed in NumPy 2.0).
dataset_cats["ZipCode"] = np.nan

def get_cep():
    """Draw a random key from ``chaves_dicionario`` and return the first field
    of the ``dicionario_ceps`` record stored under it."""
    chave_sorteada = np.random.choice(chaves_dicionario)
    # np.random.choice hands back a NumPy scalar; convert it to a plain str
    # before using it as the dictionary key.
    registro = dicionario_ceps[str(chave_sorteada)]
    return registro[0]

def get_cidade(cep):
    """Return the second field of the ``dicionario_ceps`` record stored under ``cep``.

    Raises KeyError when ``cep`` is not a key of ``dicionario_ceps``.
    """
    registro = dicionario_ceps[cep]
    return registro[1]

# Draw one random CEP per row; the mapped values themselves are ignored, the
# map only serves to call get_cep() once for every row.
ceps_sorteados = dataset_cats["ZipCode"].map(lambda _: get_cep())
dataset_cats["ZipCode"] = ceps_sorteados

# Look up the "Borough" value that belongs to each assigned CEP.
dataset_cats["Borough"] = ceps_sorteados.map(get_cidade)

dataset_cats.to_csv("../data/cats_clean.csv")

<h3>Tratamento das fotos da base</h3>
<ul>
    <li>Obter a foto da URL</li>
    <li>Converter para base64</li>
</ul>

In [6]:
import requests
import base64

def get_base64_from_photo_url(url, timeout=30):
    """Download the resource at ``url`` and return its bytes as base64 text.

    Args:
        url: Address of the photo. A falsy value (``None``, ``""``) means
            "no photo" and yields ``None`` without any network access.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled server can block the call indefinitely.

    Returns:
        The response body encoded as a base64 ``str``, or ``None`` when there
        is no URL or the server answers with an HTTP error status.

    Raises:
        requests.RequestException: on connection problems or timeouts.
    """
    if not url:
        return None

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An error page is not an image; do not encode it as if it were one.
        return None

    return base64.b64encode(response.content).decode('utf-8')

In [7]:
# Rename "link_photo" to "Image", the column name that download_images reads from and writes to.
dataset_cats = dataset_cats.rename(columns={'link_photo': 'Image'})

In [8]:
import pandas as pd
import base64
from tqdm.notebook import tqdm
import asyncio
import aiohttp
import time

def is_base64(s):
    """Return True when ``s`` is valid base64 text, False otherwise.

    ``validate=True`` is essential here: by default ``base64.b64decode``
    silently discards every character outside the base64 alphabet, so an
    ordinary URL whose remaining characters happen to form a correctly padded
    string would be reported as base64. Input that is not str/bytes (for
    example ``None`` or NaN) also yields False.
    """
    try:
        base64.b64decode(s, validate=True)
    except Exception:
        # Any decoding problem (bad characters, bad padding, wrong type) means "not base64".
        return False
    return True


async def download_images(index_range, union_dataset, progress_bar):
    """Replace image URLs in ``union_dataset["Image"]`` with base64 image data.

    Rows are visited one after another over a single HTTP session. A row is
    downloaded only when its "Image" value is a non-empty string that is not
    already base64, so running the function again skips rows that were
    converted before. When a download fails the original value is kept and a
    warning is logged.

    Args:
        index_range: Iterable of positional (0-based) row numbers to process.
        union_dataset: DataFrame with an "Image" column; modified in place.
        progress_bar: Object with an ``update(n)`` method (e.g. a tqdm bar);
            advanced once for every visited row.
    """
    import logging

    logger = logging.getLogger(__name__)
    # Read and write through the same positional accessor. Mixing .iloc
    # (position) with .loc (label) only agrees on a default RangeIndex; with
    # any other index .loc could update a different row or append a new one.
    image_col = union_dataset.columns.get_loc("Image")
    timeout = aiohttp.ClientTimeout(total=120)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for index in index_range:
            url = union_dataset.iat[index, image_col]
            # Only non-empty strings can be URLs; missing values (None/NaN)
            # and rows that already hold base64 data need no download.
            if isinstance(url, str) and url and not is_base64(url):
                try:
                    async with session.get(url) as response:
                        # Fail on 4xx/5xx so an error page is never stored as an image.
                        response.raise_for_status()
                        imagem_bytes = await response.read()
                    union_dataset.iat[index, image_col] = base64.b64encode(imagem_bytes).decode('utf-8')
                except Exception as err:
                    # Best effort: keep the URL so a later run can retry, but leave a trace.
                    logger.warning("Could not download image for row %s (%s): %r", index, url, err)
            # Count every visited row; otherwise the bar can never reach its total.
            progress_bar.update(1)

async def async_process_dataset(union_dataset, batch_size=50):
    """Download the images of ``union_dataset`` in concurrent batches.

    The rows are split into consecutive positional ranges of at most
    ``batch_size`` rows; one ``download_images`` task is scheduled per range
    and all tasks share a single progress bar. Returns once every task is done.
    """
    total_rows = len(union_dataset)
    with tqdm(total=total_rows, desc="Downloading images") as progress_bar:
        # Schedule every batch up front, then wait for all of them together.
        pending = [
            asyncio.create_task(
                download_images(
                    range(start, min(start + batch_size, total_rows)),
                    union_dataset,
                    progress_bar,
                )
            )
            for start in range(0, total_rows, batch_size)
        ]
        await asyncio.gather(*pending)

In [9]:
# Run the asynchronous processing with a top-level await, which executes on the Jupyter Notebook kernel's asyncio event loop
await async_process_dataset(dataset_cats)

Downloading images:   0%|          | 0/67145 [00:00<?, ?it/s]

In [11]:
# Count the null values in the "Image" column.
# NOTE(review): download_images keeps the original URL in place when a download
# fails, so a count of 0 here does not show that every image was downloaded.
num_null = dataset_cats['Image'].isnull().sum()
print(f"O número de valores nulos na coluna 'Image' é: {num_null}")

O número de valores nulos na coluna 'Image' é: 0


In [10]:
# Write the frame over ../data/cats_clean.csv again, now including the processed "Image" column (to_csv's defaults also write the row index as the first column).
dataset_cats.to_csv("../data/cats_clean.csv")