# Download data

Our data lives in Google Drive; this notebook downloads it to the local machine.
- Get the service account key from 1Password and place it at the following path:
```
authentication/toxfox-785dcbf7e0e8.json
```
- A new folder 'data/raw' is created (next to the 'authentication' folder) with the following structure:
```
data/
    raw/
        product_0/
            Barcode/
            Inhaltsstoffe/
            formular_answers.yaml
        product_1/
        product_2/
        ...
```


In [7]:
import os
import gspread 
import pandas as pd
import io
import yaml

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload


In [8]:
# The service-account key is expected at
# <parent of the working directory>/authentication/<key_name>.
key_name = "toxfox-785dcbf7e0e8.json"
project_root = os.path.dirname(os.getcwd())
credentials_paths = os.path.join(project_root, 'authentication', key_name)

In [9]:
# Authenticate with the service account and build the two API clients used
# below: a gspread client (form answers) and a Drive v3 client (images).
# This notebook only reads from Sheets and Drive, so read-only scopes are
# requested instead of the full read/write Drive scope.
scopes = [
    'https://www.googleapis.com/auth/spreadsheets.readonly',
    'https://www.googleapis.com/auth/drive.readonly',
]

credentials = Credentials.from_service_account_file(
    credentials_paths,
    scopes=scopes,
)

# NOTE: this rebinds the name `gspread` from the imported module to the
# authorized client (a later cell calls `gspread.open_by_url`). Re-running
# this cell without re-running the import cell fails, because `gspread` is
# then no longer the module.
gspread = gspread.authorize(credentials)
gdrive = build('drive', 'v3', credentials=credentials)


In [10]:
# Load the form answers: fetch every value of the first worksheet, use the
# first row as column names and the remaining rows as data.
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1FNWNfpF9nGIEdwVllR6OWesE4aZnAHxeiuv7Zbb4rbI/edit#gid=836349214"
sheet = gspread.open_by_url(spreadsheet_url)
answer = sheet.get_worksheet(0)
all_answers = answer.get_all_values()
header_row = all_answers[0]
answer_rows = all_answers[1:]
dataframe = pd.DataFrame(data=answer_rows, columns=header_row)



In [11]:
# Raw downloads are stored in <parent of the working directory>/data/raw.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'raw')

# Create the data directory together with any missing parents.
# `exist_ok=True` keeps the cell safe to re-run and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(data_dir, exist_ok=True)


In [12]:

def download_file(save_path:str, file_name: str, file_id: str):
    """Fetch one Google Drive file and store it on disk.

    The content is requested through the module-level `gdrive` client,
    collected chunk by chunk in an in-memory buffer and, once the download
    has finished, written to `save_path/file_name`.

    Args:
        save_path: Directory the file is written into.
        file_name: Name of the file to create inside `save_path`.
        file_id: Identifier of the Drive file to download.
    """
    buffer = io.BytesIO()
    media_request = gdrive.files().get_media(fileId=file_id)
    downloader = MediaIoBaseDownload(buffer, media_request)

    # next_chunk() returns (progress, done); only the completion flag is needed.
    finished = False
    while not finished:
        _progress, finished = downloader.next_chunk()

    target_path = os.path.join(save_path, file_name)
    with open(target_path, 'wb') as out_file:
        out_file.write(buffer.getvalue())

In [13]:
# Download the form answers and all uploaded images, one folder per product:
#   <data_dir>/product_<index>/formular_answers.yaml
#   <data_dir>/product_<index>/<image column>/<image files>
columns = dataframe.columns
image_columns = ['Inhaltsstoffe', 'Barcode']  # columns holding the links to uploaded images
drop_columns = ['E-Mail-Adresse']  # left out of the saved answers

# Number of products to download; set to None to download every product.
# Kept at 2 to match the previously hard-coded `dataframe.index[0:2]`.
max_products = 2

# Loop-invariant: the answer columns are the same for every product.
answer_columns = columns[~columns.isin(image_columns + drop_columns)]
product_indices = dataframe.index[:max_products]

print(f"Downloading data from {len(product_indices)} of {len(dataframe)} products...")

for index in product_indices:
    # `index` is an index label, so select by label (.loc) rather than by position.
    # `formular_answers` stays a top-level name: later cells inspect the
    # answers of the last downloaded product.
    formular_answers = dataframe.loc[index, answer_columns]

    print("downloading product: ", index)
    save_dir = os.path.join(data_dir, f'product_{index}')
    os.makedirs(save_dir, exist_ok=True)

    # Save answers
    with open(os.path.join(save_dir, 'formular_answers.yaml'), 'w') as answers_file:
        yaml.dump(formular_answers.to_dict(), answers_file)

    for image_column in image_columns:
        images_path = os.path.join(save_dir, image_column)
        os.makedirs(images_path, exist_ok=True)

        # A cell holds zero or more comma-separated links. An empty cell
        # splits into [''], which must not be sent to Drive as a file id.
        url_string = dataframe.loc[index, image_column]
        urls = [url.strip() for url in url_string.split(',')]

        for url in urls:
            if not url:
                continue
            # The Drive file id is whatever follows the last 'id=' in the link.
            file_id = url.split('id=')[-1]
            image_metadata = gdrive.files().get(fileId=file_id).execute()
            image_name = image_metadata['name']
            download_file(save_path=images_path, file_name=image_name, file_id=file_id)
             


Downloading data from 58 products...
downloading product:  0
downloading product:  1


In [17]:
# Scratch check: serialize the answers of the last processed product with
# yaml.dump (no stream is given, so nothing is written to disk here).
text = yaml.dump(formular_answers) 

In [21]:
# Scratch check: write the answers object itself (not a plain dict) to a
# YAML file, with flow style disabled.
with open('test_df_to_yaml.yaml', mode='w') as yaml_file:
    documents = yaml.dump(formular_answers, stream=yaml_file, default_flow_style=False)

In [23]:
# Display the answers of the last processed product as a plain dict.
formular_answers.to_dict()

{'Zeitstempel': '23.05.2024 11:50:06',
 'Mit welchem Modell wurden die Bilder gemacht? ': 'Android',
 'Name von Produkt': 'Play It Sexy',
 'Marke': 'Playboy'}

In [25]:
# Scratch check: write the answers converted to a plain dict, the same form
# the download loop stores in formular_answers.yaml.
with open('test_df_to_yaml.yaml', mode='w') as yaml_file:
    answers_dict = formular_answers.to_dict()
    documents = yaml.dump(answers_dict, stream=yaml_file)

In [26]:
# Read the YAML file back to verify the round trip. The context manager
# closes the file handle, which a bare `open(...)` passed straight to
# `yaml.load` would leave open.
with open('test_df_to_yaml.yaml') as yaml_file:
    read = yaml.load(yaml_file, Loader=yaml.FullLoader)

In [30]:
# Spot-check one entry of the reloaded answers (note the trailing space in the key).
read['Mit welchem Modell wurden die Bilder gemacht? ']

'Android'