# Download data
-------------------------

# Imports

In [149]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [150]:
import os
import logging
import re
import numpy as np
import pandas as pd
from pathlib import Path
import socket
import boto3
from botocore.exceptions import ClientError

import sys
sys.path.insert(0, '..')

from src.utils.utils import save_csv, load_yaml, save_yaml, load_pickle, save_pickle
from src.utils.utils import create_directory, download_file_from_url, unzip_file

In [151]:
# Find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables.
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())

# os.path.dirname('.env') is the empty string, so this always resolves to the
# parent of the current working directory ('..'), independently of where the
# .env file was actually found.
project_dir = os.path.join(os.path.dirname('.env'), os.pardir)


def _project_path(env_var):
    """Return `project_dir` joined with the directory stored in `env_var`.

    :param env_var: name of the environment variable holding a relative path
    :return: the joined path
    :raises KeyError: if `env_var` is not set (the error names the variable,
        instead of the TypeError os.path.join raises when handed None)
    """
    return os.path.join(project_dir, os.environ[env_var])


DATA_DIR = _project_path("DATA_DIR")
CONFIG_DIR = _project_path("CONFIG_DIR")
EXTERNAL_DATA_DIR = _project_path("EXTERNAL_DATA_DIR")
RAW_DATA_DIR = _project_path("RAW_DATA_DIR")
PROCESSED_DATA_DIR = _project_path("PROCESSED_DATA_DIR")

In [152]:
# Sub-folders for downloaded archives and extracted CSVs. Plain string
# concatenation: this only yields a sub-folder path if RAW_DATA_DIR already
# ends with a path separator.
RAW_ZIP_DATA_DIR = RAW_DATA_DIR + 'zip/'
RAW_CSV_DATA_DIR = RAW_DATA_DIR + 'csv/'

# Parse the secrets file once and read both keys from the same mapping.
_aws_secrets = load_yaml(CONFIG_DIR+'aws_secret.yml')
AWS_ACCESS_KEY_ID = _aws_secrets['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = _aws_secrets['AWS_SECRET_ACCESS_KEY']

# Part 1 - Load data

In [153]:
# The master list is read as a space-separated file without a header row;
# the three fields are named here.
# NOTE(review): 'something' looks like a file size and 'id' like a checksum —
# confirm against the GDELT master file list description.
file_path = os.path.join(RAW_DATA_DIR, 'master_file_list.txt')
df = pd.read_csv(file_path, sep=' ', header=None, names=['something', 'id', 'url'])
df

Unnamed: 0,something,id,url
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...
...,...,...,...
814731,108304,81dbf12b7dae6baa0cac2d2f9d5ef42c,http://data.gdeltproject.org/gdeltv2/202301131...
814732,5336143,22c9f0e26855dcbd48a0f3c7a3d91906,http://data.gdeltproject.org/gdeltv2/202301131...
814733,77342,0c14e3c325fc7a064e851e9fe0f05734,http://data.gdeltproject.org/gdeltv2/202301131...
814734,128699,669f07e39c97541f908c5b3aaad00d43,http://data.gdeltproject.org/gdeltv2/202301131...


In [154]:
# The file name is the last path component of the URL. Its first four
# characters are used as the year and the next two as the month; both are
# kept as strings (e.g. '2015', '02').
url_file_names = df['url'].str.rsplit('/', n=1).str[-1]
df['year'] = url_file_names.str[:4]
df['month'] = url_file_names.str[4:6]

df

Unnamed: 0,something,id,url,year,month
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
...,...,...,...,...,...
814731,108304,81dbf12b7dae6baa0cac2d2f9d5ef42c,http://data.gdeltproject.org/gdeltv2/202301131...,2023,01
814732,5336143,22c9f0e26855dcbd48a0f3c7a3d91906,http://data.gdeltproject.org/gdeltv2/202301131...,2023,01
814733,77342,0c14e3c325fc7a064e851e9fe0f05734,http://data.gdeltproject.org/gdeltv2/202301131...,2023,01
814734,128699,669f07e39c97541f908c5b3aaad00d43,http://data.gdeltproject.org/gdeltv2/202301131...,2023,01


In [155]:
# Rows whose file name is stamped February 2015 ('year'/'month' are strings).
is_feb_2015 = df['year'].eq('2015') & df['month'].eq('02')
df_2015_feb = df.loc[is_feb_2015]
df_2015_feb

Unnamed: 0,something,id,url,year,month
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
...,...,...,...,...,...
2863,147389,0e0e6aa5b8fc8c1453605f03f5e59a72,http://data.gdeltproject.org/gdeltv2/201502282...,2015,02
2864,4949796,0482738270fd04f2d6fe0696fd4cf104,http://data.gdeltproject.org/gdeltv2/201502282...,2015,02
2865,74757,2445a82d47bf437bd50fa69b7c1f0dc2,http://data.gdeltproject.org/gdeltv2/201502282...,2015,02
2866,138860,c58b1ce6b01fe4e3b0a999f025d2e64e,http://data.gdeltproject.org/gdeltv2/201502282...,2015,02


In [156]:
# Keep the 2015 rows only and print which fraction of the full listing they are.
df_2015 = df.loc[df['year'].eq('2015')]
print(len(df_2015) / len(df))
df_2015

0.11098073486380865


Unnamed: 0,something,id,url,year,month
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
...,...,...,...,...,...
90415,178393,d1a167db36ed4b662812f4012f737277,http://data.gdeltproject.org/gdeltv2/201512312...,2015,12
90416,8474142,90c5e5d72e7732639578f2f3a46716c5,http://data.gdeltproject.org/gdeltv2/201512312...,2015,12
90417,67506,8da2e8a6c2adc56ee48b85efe6c3be7f,http://data.gdeltproject.org/gdeltv2/201512312...,2015,12
90418,174329,b75bae9dc91674ff20e56e1652c0afa7,http://data.gdeltproject.org/gdeltv2/201512312...,2015,12


In [157]:
# Hand-entered counts, one per host number (43, 54, 55, 30).
# NOTE(review): presumably the number of files each 'tp-hadoop-NN' host has
# already uploaded — confirm where these figures come from.
n_files_h43 = 2999
n_files_h54 = 2861
n_files_h55 = 2502
n_files_h30 = 2010

# Total of the four counts, also shown as a percentage of the 2015 rows.
nb_uploaded_files = n_files_h43 + n_files_h54 + n_files_h55 + n_files_h30
print(f"nb_uploaded_files: {nb_uploaded_files} ({round(100 * nb_uploaded_files / df_2015.shape[0], 2)}%)")

nb_uploaded_files: 10372 (11.47%)


### Split rows into hosts

In [158]:
def create_host_file(hosts_no, df):
    """Distribute the row positions of `df` over hosts in contiguous chunks.

    :param hosts_no: iterable of host numbers; each one yields the host name
        'tp-hadoop-<number>'
    :param df: object exposing `.shape[0]`; positions 0 .. shape[0]-1 are split
    :return: dict mapping host name -> numpy array of row positions, with the
        hosts in the order given by `hosts_no`
    """
    n_rows = df.shape[0]

    hosts = []
    for host_no in hosts_no:
        hosts.append(f'tp-hadoop-{host_no}')

    # np.array_split tolerates a row count that is not a multiple of the
    # number of hosts: chunk sizes then differ by at most one.
    chunks = np.array_split(range(n_rows), len(hosts))
    return {host: rows for host, rows in zip(hosts, chunks)}


def split_data_per_host(df, hosts_file_path, hosts_no=None, verbose=False):
    """Return the host -> rows mapping, building and caching it on first use.

    When `hosts_file_path` does not exist, the mapping is created with
    `create_host_file(hosts_no, df)` and stored at that path with
    `save_pickle`. When the file exists it is loaded with `load_pickle` and
    returned unchanged: `df` and `hosts_no` are NOT consulted in that case, so
    the file has to be deleted to force a new split.

    :param df: data whose rows are split across hosts (only used when the
        cache file is missing)
    :param hosts_file_path: path of the cached mapping
    :param hosts_no: host numbers, required when the cache file is missing
    :param verbose: if True, log the number of rows of every host
    :return: dict mapping host name -> rows assigned to that host
    :raises ValueError: if the cache file is missing and `hosts_no` is None
    """
    if os.path.isfile(hosts_file_path):
        # NOTE(review): load_pickle presumably unpickles the file; only point
        # this at files you trust.
        hosts2rows = load_pickle(hosts_file_path)
        logging.info("Loading pkl file")
    else:
        if hosts_no is None:
            raise ValueError("Select host numero for 'tp-hadoop-XX' (Example: [43, 54, 55, 30])")
        hosts2rows = create_host_file(hosts_no, df)
        save_pickle(hosts2rows, pickle_path=hosts_file_path)

    if verbose:
        for host, rows in hosts2rows.items():
            logging.info(f"Host '{host}': {len(rows)} rows.")

    return hosts2rows

In [159]:
# Build the split of the 2015 rows across hosts, or reload it if the pickle at
# CONFIG_DIR + 'hosts.pkl' already exists.
hosts_file_path = CONFIG_DIR + 'hosts.pkl'
# Numbers XX of the 'tp-hadoop-XX' hosts.
hosts_no = [43, 54, 55, 30]
hosts2rows = split_data_per_host(df_2015, hosts_file_path, hosts_no=hosts_no)
hosts2rows.keys()

dict_keys(['tp-hadoop-43', 'tp-hadoop-54', 'tp-hadoop-55', 'tp-hadoop-30'])

In [160]:
def read_broken_urls(broken_urls_file):
    """Read a text file from RAW_DATA_DIR and return its lines, stripped.

    :param broken_urls_file: file name, appended directly to RAW_DATA_DIR
    :return: list with one whitespace-stripped string per line of the file
    """
    path = RAW_DATA_DIR + broken_urls_file
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]


def retrieve_broken_urls_idx(df, broken_urls):
    """Return the index labels of the rows whose 'url' is listed as broken.

    :param df: DataFrame with a 'url' column
    :param broken_urls: collection of URL strings to look for
    :return: numpy array with the index labels (not positions) of the matches
    """
    is_broken = df['url'].isin(broken_urls)
    return df.loc[is_broken].index.values


def update_hosts2rows(hosts2rows, broken_urls_idx):
    """Remove the broken row indices from every host's rows, in place.

    One vectorised membership mask is computed per host instead of scanning
    every host once per broken index.

    :param hosts2rows: dict mapping host name -> array of row indices; the
        dict is modified in place (hosts without a match are left untouched)
    :param broken_urls_idx: iterable of row indices to remove
    :return: tuple `(hosts2rows, cnt)` where `hosts2rows` is the same dict
        object that was passed in and `cnt` is the number of removed entries
    """
    broken = np.asarray(list(broken_urls_idx))

    cnt = 0
    for host_name, idxs in hosts2rows.items():
        idxs = np.asarray(idxs)
        is_broken = np.isin(idxs, broken)
        n_broken = int(is_broken.sum())
        if n_broken:
            # Re-binding an existing key does not change the dict's size, so
            # it is safe while iterating over items().
            hosts2rows[host_name] = idxs[~is_broken]
            cnt += n_broken

    print(f"{cnt} rows updated")
    return hosts2rows, cnt


# Drop the URLs listed in 'broken_urls.txt' from the per-host split and, if
# anything was removed, overwrite the cached split.
broken_urls_file = 'broken_urls.txt'
broken_urls = read_broken_urls(broken_urls_file)
# NOTE(review): these are index labels of `df`, whereas hosts2rows was built
# from row positions of df_2015. The two only agree while the 2015 rows are
# the leading rows of `df` and `df` keeps its default 0..n-1 index — confirm.
broken_urls_idx = retrieve_broken_urls_idx(df, broken_urls)
hosts2rows_up, cnt_rows_up = update_hosts2rows(hosts2rows, broken_urls_idx)
if cnt_rows_up > 0:
    # update_hosts2rows returns the dict it was given, so `hosts2rows` here is
    # the same (already updated) object as `hosts2rows_up`.
    save_pickle(hosts2rows, pickle_path=hosts_file_path)

1 rows updated


In [161]:
# Host whose share of rows is extracted below (hard-coded; the commented line
# would use the name of the machine running the notebook instead).
host_name = 'tp-hadoop-43'
# host_name = socket.gethostname()
# Positional (iloc) selection of that host's rows from the full listing; the
# copy makes df_host independent of `df`.
df_host = df.iloc[hosts2rows[host_name]].copy()
df_host

Unnamed: 0,something,id,url,year,month
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...,2015,02
...,...,...,...,...,...
22600,289499,05861e3bde69be168c2ca9207b870ddf,http://data.gdeltproject.org/gdeltv2/201505081...,2015,05
22601,11126168,23644f8a1591dfaa232a86c36bc231d2,http://data.gdeltproject.org/gdeltv2/201505081...,2015,05
22602,145017,d2eeba0cdab8470011077823d4fc4397,http://data.gdeltproject.org/gdeltv2/201505081...,2015,05
22603,260653,850998733834198d4fa9226578123aee,http://data.gdeltproject.org/gdeltv2/201505081...,2015,05


In [162]:
host_name
# hosts2rows[host_name]

'tp-hadoop-43'

### Download data from web to local computer

In [163]:
def get_files_in_bucket(bucket_name, filter=None):
    """List the object keys stored in an S3 bucket.

    :param bucket_name: name of the bucket to list
    :param filter: optional value passed as `Prefix` to restrict the listing;
        when falsy, every object of the bucket is listed
    :return: list of object keys
    """
    bucket = boto3.resource(
        service_name='s3', region_name='eu-west-3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    ).Bucket(bucket_name)

    # Pick the object collection first, then extract the keys in one place.
    objects = bucket.objects.filter(Prefix=filter) if filter else bucket.objects.all()
    return [s3_object.key for s3_object in objects]



def get_csv_category(csv_filename, pattern=r'\.(.*?)\.'):
    """Return the first match of `pattern` in `csv_filename`.

    With the default pattern this is the text between the first two dots,
    e.g. 'export' for '20150218230000.export.CSV'.

    :param csv_filename: file name to inspect
    :param pattern: regular expression handed to `re.findall`
    :return: first element of the `re.findall` result
    :raises IndexError: if `pattern` does not match at all
    """
    return re.findall(pattern, csv_filename)[0]



# def download_data_from_web(urls_to_process, bucket_files, n_urls=None):
#     urls_to_process = urls_to_process[:n_urls]

#     for row, url in enumerate(urls_to_process):
#         zip_file_path = os.path.join(RAW_ZIP_DATA_DIR, Path(url).name)
#         csv_file_path = os.path.join(RAW_CSV_DATA_DIR, Path(url).stem)
#         aws_filename = get_csv_category(Path(url).stem) + "/" + Path(url).stem
        
#         if aws_filename not in bucket_files:
            
#             # Download data from remote url to data/raw/zip
#             create_directory(RAW_ZIP_DATA_DIR)
#             if not os.path.isfile(zip_file_path):
#                 download_file_from_url(remote_url=url, local_dir=RAW_ZIP_DATA_DIR, verbose=True)
#             else:
#                 print(f"[{row+1}/{len(urls_to_process)}] File '{zip_file_path}' already exists.")

#             # Unzip file within data/raw/url to data/raw/csv
#             create_directory(RAW_CSV_DATA_DIR)
#             if not os.path.isfile(csv_file_path):
#                 unzip_file(zip_file_path, RAW_CSV_DATA_DIR, verbose=True)
#             else:
#                 print(f"[{row+1}/{len(urls_to_process)}] File '{csv_file_path}' already exists.")

In [164]:
# # category = 'gkg'
# # raw_csv_dir = "D:/NoSQL/raw/csv/"
# # processed_csv_dir = "D:/NoSQL/raw/csv/"
# # create_directory(raw_csv_dir)
# # create_directory(processed_csv_dir)


# bucket_name = 'gdelt-nosql'
# files_in_bucket = get_files_in_bucket(bucket_name)
# print(f"{len(files_in_bucket)} files within '{bucket_name}' bucket")

# # raw_csv_dir_category = raw_csv_dir + category + '/'

# # category2cols = {
# #     'export': [0, 1, 2, 5, 15, 53],
# #     'mentions': [0, 1, 2],
# #     'gkg': [0, 1, 3, 7, 9, 11, 15]
# # }

# # for file in [f for f in os.listdir(raw_csv_dir_category)]:
# #     csv_file = raw_csv_dir_category + file
# #     df_csv = pd.read_csv(csv_file, sep='\t', header=None, on_bad_lines='skip')[category2cols[category]]
# #     df_csv.to_csv(csv_file.replace('raw', 'processed'))
# #     break
# # df_csv

In [173]:
# Number of 2015 rows in the listing: the reference total for the progress below.
n_files = df_2015.shape[0]

# Count the keys currently stored in the bucket and print that count, both
# on its own and as a percentage of n_files.
bucket_name = 'gdelt-nosql'
files_in_bucket = get_files_in_bucket(bucket_name)
n_files_in_bucket = len(files_in_bucket)
print(f"{n_files_in_bucket} files within '{bucket_name}' bucket")
print(f"{n_files_in_bucket}/{n_files} ({round(100*n_files_in_bucket/n_files, 2)}%) files")

4000 files within 'gdelt-nosql' bucket
4000/90420 (4.42%) files


In [166]:
# NOTE(review): `stop` is not defined anywhere in this notebook, so this line
# raises NameError. Presumably a deliberate way to halt "Run All" before the
# cells below — confirm, and consider an explicit `raise` with a message.
stop()

NameError: name 'stop' is not defined

In [None]:
# def count_files(nb_aws_files, host_name):
#     return 100 * nb_aws_files/len(hosts2rows[host_name])

In [None]:
# bucket_name = "bucket-nosql"
# bucket_files = get_files_in_bucket(bucket_name)

# df_aws = pd.DataFrame(hosts2rows.values(), index=hosts2rows.keys()).T
# # df_aws = df_2015[['url']].copy()
# # df_aws['aws_file'] = df_aws['url'].apply(lambda x: get_csv_category(Path(x).stem) + "/" + Path(x).stem)

# # rows2hosts = {df_aws.at[row, 'aws_file']:host for host,rows in hosts2rows.items() for row in rows}
# # df_aws['host'] = df_aws['aws_file'].apply(lambda x: rows2hosts[x])

# # df_aws['in_s3'] = df_aws['aws_file'].apply(lambda x: x in bucket_files)
# # df_aws = df_aws[df_aws['aws_file'].isin(bucket_files)]

# print(f"bucket_files: {df_aws.shape[0]}/{df.shape[0]} ({round(100 * df_aws.shape[0]/df.shape[0], 2)}%)")
# df_aws

In [None]:
# bucket_name = "bucket-nosql"
# bucket_files = get_files_in_bucket(bucket_name)

# df_aws = pd.DataFrame(hosts2rows.values(), index=hosts2rows.keys()).T
# df_aws = df_aws.unstack().to_frame('index')
# df_aws = df_aws.dropna()
# df_aws['aws_file'] = df_aws['index'].apply(lambda x: df.at[x, 'url'])
# df_aws['aws_file'] = df_aws['aws_file'].apply(lambda x: get_csv_category(Path(x).stem) + "/" + Path(x).stem)
# df_aws = df_aws[df_aws['aws_file'].isin(bucket_files)]
# print(f"bucket_files: {df_aws.shape[0]}/{df.shape[0]} ({round(100 * df_aws.shape[0]/df.shape[0], 2)}%)")
# df_aws

In [None]:
# NOTE(review): on a fresh kernel this cell fails: `df_aws` and `count_files`
# are only assigned in a later cell (their earlier definitions are commented
# out).
# Per-host count of rows of df_aws, as a percentage of the host's assigned
# rows and of all 2015 rows.
df_aws_gb = df_aws.copy()
# The host name is taken from the first level of df_aws's index.
df_aws_gb['host'] = df_aws_gb.index.get_level_values(0)
df_aws_gb = df_aws_gb.rename(columns={'aws_file':'nb_aws_file'})
df_aws_gb = df_aws_gb.groupby('host').count()
df_aws_gb['host'] = df_aws_gb.index
df_aws_gb = df_aws_gb.reset_index(drop=True)
df_aws_gb['percentage_host'] = df_aws_gb.apply(lambda x: count_files(nb_aws_files=x.nb_aws_file, host_name=x.host), axis=1)
df_aws_gb['percentage_global'] = df_aws_gb.apply(lambda x: 100*x.nb_aws_file/df_2015.shape[0], axis=1)

df_aws_gb

In [None]:
# NOTE(review): these statements are repeated verbatim at the end of a later
# cell, which is also where `df_aws` (with its 'host' column) and
# `count_files` are defined; run on its own, this cell depends on that state.
df_aws_gb = df_aws.groupby(by=['host']).count()
df_aws_gb = df_aws_gb.rename(columns={'aws_file':'nb_aws_file'})
df_aws_gb['host'] = df_aws_gb.index
df_aws_gb = df_aws_gb.reset_index(drop=True)
df_aws_gb['percentage'] = df_aws_gb.apply(lambda x: count_files(nb_aws_files=x.nb_aws_file, host_name=x.host), axis=1)
df_aws_gb

In [None]:
# NOTE(review): this bucket name differs from the 'gdelt-nosql' bucket used in
# the earlier progress cell — confirm which bucket is intended.
bucket_name = "bucket-nosql"
bucket_files = get_files_in_bucket(bucket_name)


# Expected S3 key of every 2015 file: '<category>/<file name without .zip>'.
df2 = df_2015[['url']].copy()
df2['aws_file'] = df2['url'].apply(lambda x: get_csv_category(Path(x).stem) + "/" + Path(x).stem)
# Reverse lookup: S3 key -> host. `.at` looks rows up by index label, using
# the row numbers stored in hosts2rows.
rows2hosts = {df2.at[row, 'aws_file']:host for host,rows in hosts2rows.items() for row in rows}
# Not the last expression of the cell, so this value is not displayed.
len(rows2hosts)

# One row per key found in the bucket, tagged with its host. A key that is
# missing from rows2hosts raises KeyError here.
df_aws = pd.DataFrame({'aws_file':bucket_files})    
df_aws['host'] = df_aws['aws_file'].apply(lambda x: rows2hosts[x])
# The percentage is relative to the full listing `df`, not to df_2015.
print(f"bucket_files: {df_aws.shape[0]}/{df.shape[0]} ({round(100 * df_aws.shape[0]/df.shape[0], 2)}%)")

def count_files(nb_aws_files, host_name):
    """Return `nb_aws_files` as a percentage of the rows assigned to a host.

    Reads the notebook-level `hosts2rows` mapping.

    :param nb_aws_files: number of files counted for the host
    :param host_name: key of the host in `hosts2rows`
    :return: 100 * nb_aws_files / (number of rows assigned to the host)
    """
    n_assigned = len(hosts2rows[host_name])
    return 100 * nb_aws_files/n_assigned

# Number of bucket files per host, then the share of each host's assigned
# rows that this number represents.
df_aws_gb = (
    df_aws.groupby(by=['host']).count()
    .rename(columns={'aws_file':'nb_aws_file'})
    .assign(host=lambda counts: counts.index)
    .reset_index(drop=True)
)
df_aws_gb['percentage'] = df_aws_gb.apply(lambda x: count_files(nb_aws_files=x.nb_aws_file, host_name=x.host), axis=1)
df_aws_gb

In [None]:
# Work on a copy: `df_tmp = df` would only alias the frame, so adding the
# 'date' column and re-indexing would silently modify `df` as well.
df_tmp = df.copy()
# The stamp is the part of the file name before its first dot.
# Assumes it is always a 14-digit YYYYMMDDHHMMSS value — to_datetime raises
# on anything else.
url_stamps = df_tmp['url'].str.rsplit('/', n=1).str[-1].str.split('.').str[0]
df_tmp['date'] = pd.to_datetime(url_stamps, format='%Y%m%d%H%M%S')
df_tmp = df_tmp.set_index('date')
df_tmp

In [None]:
# Share of the full listing that falls in each of the nine years starting at
# 2015, plus the running sum of those shares.
per_total = 0
years = list(range(2015, 2015 + 9))
for year in years:
    rows_in_year = df_tmp.loc[str(year)]
    per = len(rows_in_year) / len(df)
    per_total += per
    print(f"year {year}: {round(per*100, 2)} %")

per_total

In [None]:
# Keys of the "bucket-nosql" bucket listed with the "export/" prefix, wrapped
# in a one-column frame.
export_files = get_files_in_bucket("bucket-nosql", filter="export/")
print(len(export_files))
df_export = pd.DataFrame({'filename':export_files})
df_export

In [None]:
# Index the export keys by the timestamp encoded in their file name.
df_export_tmp = df_export.copy()
# The stamp is the part of the file name before its first dot.
# Assumes it is always a 14-digit YYYYMMDDHHMMSS value — to_datetime raises
# on anything else.
export_stamps = df_export_tmp['filename'].str.rsplit('/', n=1).str[-1].str.split('.').str[0]
# An explicit format avoids relying on inference of a hand-built date string.
df_export_tmp['date'] = pd.to_datetime(export_stamps, format='%Y%m%d%H%M%S')
df_export_tmp = df_export_tmp.set_index('date')
df_export_tmp

In [None]:
df_export_tmp.loc['2015']

In [None]:
df_export_tmp.loc['2017']

In [None]:
df_export_tmp.loc['2019']

In [None]:
df_export_tmp.loc['2021']

In [None]:
df_aws_gb['percentage'].mean()

In [None]:
# n_urls = 10
# urls_to_process = df_host['url'].tolist()
# download_data_from_web(urls_to_process, bucket_files, n_urls=n_urls)

### Upload data from local computer to AWS S3 bucket

In [None]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then the base name of
        file_name is used
    :return: True if file was uploaded, else False
    """
    # boto3 wraps client errors raised during a managed transfer in
    # S3UploadFailedError, so it has to be caught next to ClientError.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use the base name of file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # NOTE(review): get_files_in_bucket connects with region 'eu-west-3' while
    # this client uses 'eu-west-1' — confirm which region is intended.
    s3_client = boto3.client(
        service_name='s3', region_name='eu-west-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )

    # Upload the file; failures are logged and reported through the return value
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True



def upload_file_to_S3_bucket(csv_files, bucket_name):
    """Upload local CSV files missing from the bucket, then delete them locally.

    For every name in `csv_files` the object key is
    '<category>/<filename>', with the category given by `get_csv_category`.
    Files whose key is already in the bucket are skipped. After a successful
    upload the local CSV and its '.zip' archive (if present) are removed; when
    the upload fails both local files are kept so nothing is lost.

    :param csv_files: iterable of CSV file names located in RAW_CSV_DATA_DIR
    :param bucket_name: name of the destination S3 bucket
    :return: None
    """
    # A set gives constant-time membership tests for the keys already uploaded.
    bucket_files = set(get_files_in_bucket(bucket_name))

    for filename in csv_files:
        file_path_csv = RAW_CSV_DATA_DIR + filename
        file_path_zip = RAW_ZIP_DATA_DIR + filename + '.zip'
        object_name = f"{get_csv_category(filename)}/{filename}"

        if object_name in bucket_files:
            continue

        # Upload csv file from local computer to AWS S3 bucket
        uploaded = upload_file(file_path_csv, bucket_name, object_name=object_name)
        if not uploaded:
            # upload_file reports failure through its return value; keep the
            # local copies so the upload can be retried.
            logging.warning(f"Upload of '{file_path_csv}' failed; local files kept.")
            continue

        os.remove(file_path_csv)
        # The archive may already be gone; its absence must not abort the batch.
        if os.path.isfile(file_path_zip):
            os.remove(file_path_zip)

In [None]:
# csv_files = [f for f in os.listdir(RAW_CSV_DATA_DIR)]
# csv_files
# upload_file_to_S3_bucket(csv_files, bucket_name)
            
# bucket_files = get_files_in_bucket(bucket_name)
# n_files_in_bucket = len(bucket_files)
# print(f"{n_files_in_bucket} files into AWS S3 bucket")
# bucket_files

In [None]:
# ssh lbrejon-21@tp-1a252-20.enst.fr
# ssh lbrejon-21@tp-1a252-21.enst.fr

In [None]:
# cd /home/users/lbrejon-21

In [None]:
# ssh lbrejon-21@ssh.enst.fr 
# ssh ubuntu@137.194.211.146
# ssh tp-hadoop-55