# First Look at the Safebooru Dataset

### Import dataset from Google Drive

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Shell command: extract archive.zip from the mounted Drive into /content/safebooru/.
! unzip /content/drive/MyDrive/archive.zip -d /content/safebooru/

### Preview dataset in a table

In [None]:
import numpy as np
import pandas as pd

# Upper bound on the number of rows read from the CSV.
samples = 2700000

df_raw = pd.read_csv('/content/safebooru/all_data.csv', nrows = samples)
# Preview only the first few rows. Asking head() for `samples` rows would hand
# back the entire frame, which defeats the purpose of a quick look.
df_raw.head()

In [None]:
# Print pandas' summary of the loaded frame.
df_raw.info()

In [None]:
import matplotlib.pyplot as plt

View a table of random examples

In [None]:
# Draw 12 rows at random; no random_state is passed, so the selection is not
# pinned between runs.
examples = df_raw.sample(n=12) 
# Save the sampled rows to out.csv in the current working directory.
examples.to_csv('out.csv')

In [None]:
# Show the sampled rows as this cell's output.
examples

## Selecting the Relevant Samples

### Selecting Relevant Columns (sample_url and tags)

In [None]:
# Keep only the two columns used below: the image URL and the tag string.
features = ['sample_url', 'tags']
df_X = df_raw[features]
# Show the remaining column names as the cell output.
df_X.columns

In [None]:
# Preview the first few rows of the reduced frame; requesting `samples` rows
# here would return the whole frame instead of a preview.
df_X.head()

### Selecting appropriate tags

In [None]:
import re

# Flatten every row's space-separated tag string into a single list holding
# each individual tag occurrence (duplicates are kept).
space_pattern = re.compile("[ ]")
tag = [
    token
    for tag_string in df_X.tags
    for token in space_pattern.split(tag_string)
]

print(len(tag))
# preview first 10 tags
tag[:10]

#### Print out number of samples for each relevant tag

In [None]:
# The four tags whose frequencies are reported.
wanted_tags = ('1girl', '1boy', 'white_background', 'full_body')

# Every occurrence of a wanted tag, in the order the rows are visited.
chosen = [
    token
    for tag_string in df_X.tags
    for token in re.split("[ ]", tag_string)
    if token in wanted_tags
]

# Tally how many times each wanted tag occurred.
tag_dictionary = {}
for name in chosen:
    if name in tag_dictionary:
        tag_dictionary[name] += 1
    else:
        tag_dictionary[name] = 1

print(tag_dictionary)
    

#### Select male and female samples using tag intersection

In [None]:
# Tags that identify each class; a sample must carry every tag in its list.
ftags = ['1girl']
mtags = ['1boy']
# Tags every selected sample must carry, whichever class it belongs to.
common_tags = ['white_background', 'full_body']

chosenf = []
chosenm = []
# Walk both columns in lockstep instead of doing two label lookups per row,
# and use a separate loop name so the `tag` list built earlier is not clobbered.
for url, tag_string in zip(df_X['sample_url'], df_X['tags']):
    tokens = set(re.split("[ ]", tag_string))
    if not tokens.issuperset(common_tags):
        continue
    # A sample tagged with both classes is added to both lists.
    if tokens.issuperset(ftags):
        chosenf.append(url)
    if tokens.issuperset(mtags):
        chosenm.append(url)

print(len(chosenf), " ", len(chosenm))

In [None]:
# Chart comparing how many female and male samples were selected.
sizes = [len(chosenf), len(chosenm)]
labels = [
    "Female characters: " + str(sizes[0]),
    "Male characters: " + str(sizes[1]),
]
colors = ["#A28BE1", "#F3D277"]

plt.figure(figsize=(7, 7))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    wedgeprops={'width': 0.3},  # wedge width below the full radius leaves a hollow centre
    textprops={'fontsize': 14},
)
plt.show()

#### Extract images from selected samples and export as archive files

In [None]:
! mkdir /content/fimages
! mkdir /content/mimages

In [None]:
import urllib.request
from urllib.error import HTTPError

# Cap on the number of images fetched per class; same value as the loop bound
# this cell previously hard-coded.
max_images = 1758


def download_images(urls, target_dir, limit):
    """Download up to `limit` images from `urls` into `target_dir`.

    Each URL is prefixed with "http:" (the stored URLs are presumably
    protocol-relative -- confirm against the CSV) and saved as
    img<position>.jpg, where position is the URL's index in `urls`.

    URLs that answer with an HTTP error are skipped: 404s silently, other
    status codes with a printed notice, so one bad link does not abort the run.
    """
    # Slicing, rather than indexing up to `limit`, tolerates lists that are
    # shorter than the cap instead of raising IndexError.
    for position, url in enumerate(urls[:limit]):
        try:
            urllib.request.urlretrieve(
                "http:" + url, "{}/img{}.jpg".format(target_dir, position)
            )
        except HTTPError as err:
            if err.code != 404:
                print("Skipping {} (HTTP {})".format(url, err.code))


# Each class is downloaded independently, so a failed female image no longer
# prevents the male image with the same index from being fetched.
download_images(chosenf, "/content/fimages", max_images)
download_images(chosenm, "/content/mimages", max_images)


In [None]:
# Shell commands: recursively pack each image folder into its own zip archive.
!zip -r /content/fimg.zip /content/fimages
!zip -r /content/mimg.zip /content/mimages

In [None]:
from google.colab import files

In [None]:
# Hand the female-image archive to google.colab's files.download helper.
files.download("/content/fimg.zip")

In [None]:
# Hand the male-image archive to google.colab's files.download helper.
files.download("/content/mimg.zip")