# Download data
-------------------------

# Imports

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import os
import logging
import re
import numpy as np
import pandas as pd
from pathlib import Path
import socket
import boto3
from botocore.exceptions import ClientError

import sys
sys.path.insert(0, '..')

from src.utils.utils import save_csv, load_yaml, save_yaml, load_pickle, save_pickle
from src.utils.utils import create_directory, download_file_from_url, unzip_file

In [5]:
# Load the entries of the nearest .env file (found by walking up the directory tree) as environment variables
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
# os.path.dirname('.env') is '', so this resolves to the parent of the current working directory
project_dir = os.path.join(os.path.dirname('.env'), os.pardir)

# Index os.environ directly: a missing entry then fails with a KeyError naming the
# variable instead of an opaque TypeError raised from os.path.join(project_dir, None)
DATA_DIR = os.path.join(project_dir, os.environ["DATA_DIR"])
CONFIG_DIR = os.path.join(project_dir, os.environ["CONFIG_DIR"])
EXTERNAL_DATA_DIR = os.path.join(project_dir, os.environ["EXTERNAL_DATA_DIR"])
RAW_DATA_DIR = os.path.join(project_dir, os.environ["RAW_DATA_DIR"])
PROCESSED_DATA_DIR = os.path.join(project_dir, os.environ["PROCESSED_DATA_DIR"])

In [6]:
# The trailing '' makes os.path.join end the path with a separator, which the cells
# below rely on when they append a file name with '+'. Unlike plain concatenation,
# this also works when RAW_DATA_DIR itself has no trailing separator.
RAW_ZIP_DATA_DIR = os.path.join(RAW_DATA_DIR, 'zip', '')
RAW_CSV_DATA_DIR = os.path.join(RAW_DATA_DIR, 'csv', '')

# Parse the secrets file once instead of once per key
aws_credentials = load_yaml(os.path.join(CONFIG_DIR, 'secret.yml'))
AWS_ACCESS_KEY_ID = aws_credentials['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = aws_credentials['AWS_SECRET_ACCESS_KEY']

# Part 1 - Load data

In [7]:
# The master file list is space-separated and has no header row, so names are supplied here
master_list_columns = ['something', 'id', 'url']
file_path = os.path.join(RAW_DATA_DIR, 'master_file_list.txt')
df = pd.read_csv(
    file_path,
    sep=' ',
    header=None,
    names=master_list_columns,
)
df

Unnamed: 0,something,id,url
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...
...,...,...,...
814731,108304,81dbf12b7dae6baa0cac2d2f9d5ef42c,http://data.gdeltproject.org/gdeltv2/202301131...
814732,5336143,22c9f0e26855dcbd48a0f3c7a3d91906,http://data.gdeltproject.org/gdeltv2/202301131...
814733,77342,0c14e3c325fc7a064e851e9fe0f05734,http://data.gdeltproject.org/gdeltv2/202301131...
814734,128699,669f07e39c97541f908c5b3aaad00d43,http://data.gdeltproject.org/gdeltv2/202301131...


### Split rows into hosts

In [8]:
def create_host_file(hosts_no, df):
    """Distribute the row positions of ``df`` among 'tp-hadoop-<n>' hosts.

    :param hosts_no: Iterable of host numbers used to build the host names
    :param df: Object whose ``shape[0]`` is the number of rows to distribute
    :return: Dict mapping each host name to a numpy array of row positions
    """
    host_names = []
    for host_no in hosts_no:
        host_names.append(f'tp-hadoop-{host_no}')

    n_rows = df.shape[0]
    # array_split accepts a row count that is not a multiple of the host count
    chunks = np.array_split(range(n_rows), len(host_names))
    return {name: chunk for name, chunk in zip(host_names, chunks)}


def split_data_per_host(df, hosts_file_path, hosts_no=None, verbose=False):
    """Return the host -> row positions mapping, creating and caching it on first use.

    If ``hosts_file_path`` does not exist, the mapping is built with
    ``create_host_file`` and pickled to that path. Otherwise the pickled
    mapping is loaded and ``df`` / ``hosts_no`` are not used.

    :param df: DataFrame whose rows are distributed among the hosts
    :param hosts_file_path: Path of the pickle file caching the mapping
    :param hosts_no: Host numbers; required only when the cache file is missing
    :param verbose: If True, log the number of rows assigned to each host
    :return: Dict mapping host names to their row positions
    :raises ValueError: If the cache file is missing and ``hosts_no`` is None
    """
    if os.path.isfile(hosts_file_path):
        logging.info("Loading hosts mapping from '%s'", hosts_file_path)
        # NOTE(review): presumably backed by pickle — only load files you trust
        hosts2rows = load_pickle(hosts_file_path)
    else:
        if hosts_no is None:
            raise ValueError("Select host numbers for 'tp-hadoop-XX' (Example: [43, 54, 55, 30])")
        hosts2rows = create_host_file(hosts_no, df)
        save_pickle(hosts2rows, pickle_path=hosts_file_path)

    if verbose:
        for host, rows in hosts2rows.items():
            logging.info("Host '%s': %d rows.", host, len(rows))

    return hosts2rows

In [15]:
# Host numbers XX of the 'tp-hadoop-XX' machines sharing the download work
hosts_no = [43, 54, 55, 30]
hosts_file_path = CONFIG_DIR + 'hosts.pkl'
hosts2rows = split_data_per_host(
    df=df,
    hosts_file_path=hosts_file_path,
    hosts_no=hosts_no,
)
hosts2rows.keys()

dict_keys(['tp-hadoop-43', 'tp-hadoop-54', 'tp-hadoop-55', 'tp-hadoop-30'])

In [43]:
def read_broken_urls(broken_urls_file):
    """Read the broken URLs, one per line, from a file located in RAW_DATA_DIR.

    :param broken_urls_file: File name relative to RAW_DATA_DIR
    :return: List of URLs with surrounding whitespace stripped; blank lines are skipped
    """
    # os.path.join works whether or not RAW_DATA_DIR ends with a separator
    with open(os.path.join(RAW_DATA_DIR, broken_urls_file), 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]


def retrieve_broken_urls_idx(df, broken_urls):
    """Return the row positions of ``df`` whose 'url' value is a broken URL.

    Positions (not index labels) are returned because the per-host row arrays
    are built from ``range(df.shape[0])`` and consumed with ``df.iloc``. For a
    default RangeIndex both coincide.

    :param df: DataFrame with a 'url' column
    :param broken_urls: Collection of URLs to look for
    :return: Numpy array of integer row positions, in ascending order
    """
    is_broken = df['url'].isin(broken_urls).to_numpy()
    return np.flatnonzero(is_broken)


def update_hosts2rows(hosts2rows, broken_urls_idx):
    """Remove the rows of broken URLs from every host's row array.

    ``hosts2rows`` is updated in place and also returned.

    :param hosts2rows: Dict mapping host names to arrays of row positions
    :param broken_urls_idx: Row positions to remove
    :return: Tuple ``(hosts2rows, cnt)`` where ``cnt`` is the number of
        (host, broken row) pairs that were removed
    """
    broken = np.unique(np.asarray(broken_urls_idx))
    cnt = 0
    # Only existing keys are reassigned, so iterating over items() stays valid
    for host_name, idxs in hosts2rows.items():
        idxs = np.asarray(idxs)
        # One vectorized membership test per host instead of one scan per broken row
        to_drop = np.isin(idxs, broken)
        if to_drop.any():
            # Count distinct broken rows so the total matches a row-by-row scan
            cnt += int(np.unique(idxs[to_drop]).size)
            hosts2rows[host_name] = idxs[~to_drop]

    print(f"{cnt} rows updated")
    return hosts2rows, cnt


broken_urls_file = 'broken_urls.txt'
broken_urls = read_broken_urls(broken_urls_file)
broken_urls_idx = retrieve_broken_urls_idx(df, broken_urls)
hosts2rows_up, cnt_rows_up = update_hosts2rows(hosts2rows, broken_urls_idx)
if cnt_rows_up > 0:
    # Persist the mapping returned by update_hosts2rows rather than relying on
    # the input dict having been modified as a side effect of the call
    save_pickle(hosts2rows_up, pickle_path=hosts_file_path)

0 rows updated


In [16]:
# Host whose share of the master list is processed in this run
# host_name = socket.gethostname()
host_name = 'tp-hadoop-43'
host_rows = hosts2rows[host_name]
# Copy so that later changes to df_host never touch df
df_host = df.iloc[host_rows].copy()
df_host

Unnamed: 0,something,id,url
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...
...,...,...,...
203679,9509730,6fbf31cdaf0f56a551b47639cc87f292,http://data.gdeltproject.org/gdeltv2/201701281...
203680,123516,1c25357d1672fd2196baa741de88f3c0,http://data.gdeltproject.org/gdeltv2/201701281...
203681,276332,b95c8cff24387a537af808dbe5fe6dea,http://data.gdeltproject.org/gdeltv2/201701281...
203682,10104191,cc20f52ec30f42d570d10f9cd7f2d4ed,http://data.gdeltproject.org/gdeltv2/201701281...


### Download data from web to local computer

In [17]:
def get_files_in_bucket(bucket_name="bucket-nosql"):
    """List the keys of all objects stored in an S3 bucket.

    :param bucket_name: Name of the bucket to list
    :return: List with the key of every object in the bucket
    """
    credentials = dict(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    s3_resource = boto3.resource(service_name='s3', region_name='eu-west-1', **credentials)
    bucket = s3_resource.Bucket(bucket_name)

    keys = []
    for bucket_object in bucket.objects.all():
        keys.append(bucket_object.key)
    return keys



def get_csv_category(csv_filename, pattern=r'\.(.*?)\.'):
    """Extract the category embedded in a file name.

    With the default pattern the category is the text between the first two
    dots, e.g. 'export' for '20150218230000.export.CSV'.

    :param csv_filename: File name to inspect
    :param pattern: Regular expression whose matches are candidate categories
    :return: The first match of ``pattern`` in ``csv_filename``
    :raises IndexError: If ``pattern`` does not match
    """
    matches = re.compile(pattern).findall(csv_filename)
    first_match = matches[0]
    return first_match



def download_data_from_web(urls_to_process, bucket_files, n_urls=None):
    """Download and unzip the files that are not yet in the S3 bucket.

    For each of the first ``n_urls`` URLs whose '<category>/<csv name>' key is
    absent from ``bucket_files``, the zip is downloaded into RAW_ZIP_DATA_DIR
    and extracted into RAW_CSV_DATA_DIR. Steps whose target file already
    exists locally are skipped with a message.

    :param urls_to_process: Sequence of remote zip URLs
    :param bucket_files: Object keys already present in the bucket
    :param n_urls: Number of leading URLs to process; None processes all of them
    """
    urls_to_process = urls_to_process[:n_urls]

    for row, url in enumerate(urls_to_process):
        remote = Path(url)
        zip_file_path = os.path.join(RAW_ZIP_DATA_DIR, remote.name)
        csv_file_path = os.path.join(RAW_CSV_DATA_DIR, remote.stem)
        aws_filename = get_csv_category(remote.stem) + "/" + remote.stem

        # Files already stored in the bucket need no local work
        if aws_filename in bucket_files:
            continue

        # Download data from remote url to data/raw/zip
        create_directory(RAW_ZIP_DATA_DIR)
        if os.path.isfile(zip_file_path):
            print(f"[{row+1}/{len(urls_to_process)}] File '{zip_file_path}' already exists.")
        else:
            download_file_from_url(remote_url=url, local_dir=RAW_ZIP_DATA_DIR, verbose=True)

        # Unzip file within data/raw/zip to data/raw/csv
        create_directory(RAW_CSV_DATA_DIR)
        if os.path.isfile(csv_file_path):
            print(f"[{row+1}/{len(urls_to_process)}] File '{csv_file_path}' already exists.")
        else:
            unzip_file(zip_file_path, RAW_CSV_DATA_DIR, verbose=True)

In [22]:
# Compare what is already in the bucket with the size of the master file list
bucket_name = "bucket-nosql"
bucket_files = get_files_in_bucket(bucket_name=bucket_name)
print(
    f"bucket_files: {len(bucket_files)}/{df.shape[0]}"
)
bucket_files

bucket_files: 439/814736


['export/20150218230000.export.CSV',
 'export/20150218231500.export.CSV',
 'export/20150218233000.export.CSV',
 'export/20150218234500.export.CSV',
 'export/20150219000000.export.CSV',
 'export/20150219001500.export.CSV',
 'export/20150219003000.export.CSV',
 'export/20150219004500.export.CSV',
 'export/20150219010000.export.CSV',
 'export/20150219011500.export.CSV',
 'export/20150219013000.export.CSV',
 'export/20150219014500.export.CSV',
 'export/20150219020000.export.CSV',
 'export/20150219021500.export.CSV',
 'export/20150219023000.export.CSV',
 'export/20150219024500.export.CSV',
 'export/20150219030000.export.CSV',
 'export/20150219031500.export.CSV',
 'export/20150219033000.export.CSV',
 'export/20150219034500.export.CSV',
 'export/20150219040000.export.CSV',
 'export/20150219041500.export.CSV',
 'export/20150219043000.export.CSV',
 'export/20150219044500.export.CSV',
 'export/20150219050000.export.CSV',
 'export/20150219051500.export.CSV',
 'export/20150219053000.export.CSV',
 

In [12]:
# Only the first n_urls URLs assigned to this host are processed
n_urls = 10
urls_to_process = df_host['url'].tolist()
download_data_from_web(
    urls_to_process=urls_to_process,
    bucket_files=bucket_files,
    n_urls=n_urls,
)

### Upload data from local computer to AWS S3 bucket

In [13]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then the base name of file_name is used
    :return: True if file was uploaded, else False
    """
    # Imported locally: boto3's managed transfer reports a failed upload with this
    # exception, which is not a ClientError and would otherwise escape the handler
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use the base name of file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client(
        service_name='s3', region_name='eu-west-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True



def upload_file_to_S3_bucket(csv_files, bucket_name):
    """Upload local csv files that are missing from the bucket, then delete them locally.

    A file's object key is '<category>/<filename>'. The local csv and its zip
    are removed only after a successful upload, so a failed upload never
    loses data. Files whose key is already in the bucket are left untouched.

    :param csv_files: File names located in RAW_CSV_DATA_DIR
    :param bucket_name: Bucket to upload to
    """
    bucket_files = get_files_in_bucket(bucket_name)
    for filename in csv_files:
        file_path_csv = RAW_CSV_DATA_DIR + filename
        file_path_zip = RAW_ZIP_DATA_DIR + filename + '.zip'
        object_name = f"{get_csv_category(filename)}/{filename}"

        if object_name in bucket_files:
            continue

        # Upload csv file from local computer to AWS S3 bucket
        if not upload_file(file_path_csv, bucket_name, object_name=object_name):
            logging.error("Upload of '%s' failed; keeping local files.", file_path_csv)
            continue

        os.remove(file_path_csv)
        # The zip may already have been cleaned up; its absence is not an error
        if os.path.isfile(file_path_zip):
            os.remove(file_path_zip)

In [14]:
# Upload every csv currently on disk, then list the bucket again to confirm
csv_files = os.listdir(RAW_CSV_DATA_DIR)
upload_file_to_S3_bucket(csv_files=csv_files, bucket_name=bucket_name)

bucket_files = get_files_in_bucket(bucket_name)
n_files_in_bucket = len(bucket_files)
print(f"{n_files_in_bucket} files into AWS S3 bucket")
bucket_files

10 files into AWS S3 bucket


['export/20150218230000.export.CSV',
 'export/20150218231500.export.CSV',
 'export/20150218233000.export.CSV',
 'export/20150218234500.export.CSV',
 'gkg/20150218230000.gkg.csv',
 'gkg/20150218231500.gkg.csv',
 'gkg/20150218233000.gkg.csv',
 'mentions/20150218230000.mentions.CSV',
 'mentions/20150218231500.mentions.CSV',
 'mentions/20150218233000.mentions.CSV']

In [15]:
# ssh lbrejon-21@tp-1a252-20.enst.fr
# ssh lbrejon-21@tp-1a252-21.enst.fr

In [16]:
# cd /home/users/lbrejon-21

In [17]:
# ssh lbrejon-21@ssh.enst.fr 
# ssh ubuntu@137.194.211.146
# ssh tp-hadoop-55