In [90]:
from arango import ArangoClient
import pandas as pd
import os

In [46]:
# Initialize the client for ArangoDB.
client = ArangoClient(hosts='http://localhost:8529')

In [47]:
client.version

'5.4.0'

In [48]:
client.hosts

['http://localhost:8529']

In [49]:
# Create "test" database.
sys_db = client.db(username='root', password='HHeXW)YE3rm8cnPw')
if sys_db.has_database('test'):
    sys_db.delete_database('test')
sys_db.create_database('test')

True

In [50]:
# Connect to "test" database as root user.
db = client.db('test', username='root', password='HHeXW)YE3rm8cnPw')

In [51]:
# Create collection "students"
if db.has_collection('students'):
    students = db.collection('students')
else:
    students = db.create_collection('students')

In [53]:
# Add a hash index to the collection.
students.add_hash_index(fields=['name'], unique=False)

{'id': '55306279',
 'fields': ['name'],
 'type': 'hash',
 'name': 'idx_1661134881335279616',
 'deduplicate': True,
 'sparse': False,
 'unique': False,
 'selectivity': 1,
 'new': True}

In [54]:
# Truncate the collection.
students.truncate()

True

In [55]:
# Insert new documents into the collection.
students.insert({'name': 'jane', 'age': 19})
students.insert({'name': 'josh', 'age': 18})
students.insert({'name': 'jake', 'age': 21})

{'_id': 'students/55307098', '_key': '55307098', '_rev': '_aLgUQda---'}

In [59]:
# Execute an AQL query. This returns a result cursor.
cursor = db.aql.execute('FOR doc IN students RETURN doc')

In [60]:
# Iterate through the cursor to retrieve the documents.
student_recs = [document for document in cursor]
student_recs

[{'_key': '55307094',
  '_id': 'students/55307094',
  '_rev': '_aLgUQdC---',
  'name': 'jane',
  'age': 19},
 {'_key': '55307096',
  '_id': 'students/55307096',
  '_rev': '_aLgUQdO---',
  'name': 'josh',
  'age': 18},
 {'_key': '55307098',
  '_id': 'students/55307098',
  '_rev': '_aLgUQda---',
  'name': 'jake',
  'age': 21}]

In [107]:
report = pd.read_csv("/Users/milko/Local/Data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/03-13-2020.csv")
report.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,China,2020-03-13T11:09:03,67786,3062,51553,30.9756,112.2707
1,Guangdong,China,2020-03-13T11:09:03,1356,8,1296,23.3417,113.4244
2,Henan,China,2020-03-11T08:13:09,1273,22,1249,33.882,113.614
3,Zhejiang,China,2020-03-12T01:33:02,1215,1,1197,29.1832,120.0934
4,Hunan,China,2020-03-13T11:09:03,1018,4,1005,27.6104,111.7088


In [108]:
report.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  113 non-null    object 
 1   Country/Region  230 non-null    object 
 2   Last Update     230 non-null    object 
 3   Confirmed       230 non-null    int64  
 4   Deaths          230 non-null    int64  
 5   Recovered       230 non-null    int64  
 6   Latitude        230 non-null    float64
 7   Longitude       230 non-null    float64
dtypes: float64(2), int64(3), object(3)
memory usage: 14.5+ KB


In [109]:
# Remove unused columns.
for column in ['Last Update', 'Latitude', 'Longitude']:
    if column in report.columns:
        report.drop(column, axis=1, inplace=True)
report

Unnamed: 0,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,Hubei,China,67786,3062,51553
1,Guangdong,China,1356,8,1296
2,Henan,China,1273,22,1249
3,Zhejiang,China,1215,1,1197
4,Hunan,China,1018,4,1005
...,...,...,...,...,...
225,,Afghanistan,7,0,0
226,,Monaco,2,0,0
227,,Liechtenstein,1,0,0
228,,Guyana,1,1,0


In [110]:
# Rename columns.
report.rename(
    columns={
        'Country/Region': 'Country'
    }, inplace=True)
report.head()

Unnamed: 0,Province/State,Country,Confirmed,Deaths,Recovered
0,Hubei,China,67786,3062,51553
1,Guangdong,China,1356,8,1296
2,Henan,China,1273,22,1249
3,Zhejiang,China,1215,1,1197
4,Hunan,China,1018,4,1005


In [111]:
# Group by country.
report = report.groupby('Country').sum()
report.head()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,7,0,0
Albania,33,1,0
Algeria,26,2,8
Andorra,1,0,0
Antigua and Barbuda,1,0,0


In [112]:
# Add date.
month, day, year = os.path.basename('03-13-2020')[:10].split('-')
date = '-'.join([year, month, day])
report['Date']  = date
report.head()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Date
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,7,0,0,2020-03-13
Albania,33,1,0,2020-03-13
Algeria,26,2,8,2020-03-13
Andorra,1,0,0,2020-03-13
Antigua and Barbuda,1,0,0,2020-03-13


In [113]:
# Add rate columns.
report['Drate'] = (report['Deaths'] / report['Confirmed']) * 100
report['Rrate'] = (report['Recovered'] / report['Confirmed']) * 100
report.head()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Date,Drate,Rrate
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,7,0,0,2020-03-13,0.0,0.0
Albania,33,1,0,2020-03-13,3.030303,0.0
Algeria,26,2,8,2020-03-13,7.692308,30.769231
Andorra,1,0,0,2020-03-13,0.0,0.0
Antigua and Barbuda,1,0,0,2020-03-13,0.0,0.0


In [114]:
# Convert index to column.
report.reset_index(drop=False, inplace=True)
report.head()

Unnamed: 0,Country,Confirmed,Deaths,Recovered,Date,Drate,Rrate
0,Afghanistan,7,0,0,2020-03-13,0.0,0.0
1,Albania,33,1,0,2020-03-13,3.030303,0.0
2,Algeria,26,2,8,2020-03-13,7.692308,30.769231
3,Andorra,1,0,0,2020-03-13,0.0,0.0
4,Antigua and Barbuda,1,0,0,2020-03-13,0.0,0.0


In [13]:
# Get connection to ArangoDB collection.
client = ArangoClient(hosts='http://localhost:8529')
db = client.db('COVID-19', username='root', password='HHeXW)YE3rm8cnPw')

In [14]:
# Read records
cursor = db.aql.execute(
    'FOR doc IN daily FILTER doc.Country == @country RETURN doc',
    bind_vars={'country': 'Italy'}
)    
print(cursor.count())

None


In [131]:
df = pd.DataFrame.from_records(
        [doc for doc in cursor],
        index=['Date'],
        exclude=['_key', '_id', '_rev', 'Country']
    )
df.head()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Drate,Rrate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-13,453,12,3,2.649007,0.662252
2020-03-13,655,17,45,2.59542,6.870229
2020-03-13,3,0,0,0.0,0.0
2020-03-13,3,0,0,0.0,0.0
2020-03-13,10149,631,724,6.217361,7.133708


In [1]:
import os
import glob
import pandas as pd
import numpy as np
from arango import ArangoClient

def COVID19_loadDB(path, ip='http://localhost:8529', dbname='COVID-19', col_name='daily'):
    '''
    Load database with all reports in provided directory.
    
    INPUT
        path: Directory containing reports.
        dbname: Database name.
    '''
    
    # Get client and system database.
    client = ArangoClient(hosts=ip)
    sys_db = client.db(username='root', password='HHeXW)YE3rm8cnPw')
    
    # Create/open database.
    if not sys_db.has_database(dbname):
        sys_db.create_database(dbname)
    db = client.db(dbname, username='root', password='HHeXW)YE3rm8cnPw')
    
    # Create/truncate collection.
    if db.has_collection(col_name):
        collection = db.collection(col_name)
        collection.truncate()
    else:
        collection = db.create_collection(col_name)
    
    # Iterate reports.
    for file in glob.glob(path + "*.csv"):
        
        # Extract date.
        month, day, year = os.path.basename(file)[:10].split('-')
        date = '-'.join([year, month, day])
        print(date)
        
        # Load report.
        report = pd.read_csv(file)
        
        # Remove unused columns.
        for column in ['Last Update', 'Latitude', 'Longitude']:
            if column in report.columns:
                report.drop(column, axis=1, inplace=True)
        
        # Rename columns.
        report.rename(
            columns={
                'Country/Region': 'Country'
            }, inplace=True)
        
        # Group by country.
        report = report.groupby('Country').sum()
        
        # Add date.
        report['Date']  = date
        
        # Add rate columns.
        report['Drate'] = (report['Deaths'] / report['Confirmed']) * 100
        report['Rrate'] = (report['Recovered'] / report['Confirmed']) * 100
        
        # Replace NaN with zero.
        report = report.fillna(0)
        
        # Convert index to column.
        report.reset_index(drop=False, inplace=True)
        
        # Write to database.
        collection.insert_many(report.to_dict(orient='records'), silent=True)
        
def COVID19_Country2CSV(country, ip='http://localhost:8529', dbname='COVID-19', col_name='daily'):
    '''
    Save a CSV file with provided country data.

    INPUT
        country: Name of country.
    '''
    
    # Get connection to ArangoDB collection.
    client = ArangoClient(hosts=ip)
    db = client.db(dbname, username='root', password='HHeXW)YE3rm8cnPw')

    # Get data from database.
    cursor = db.aql.execute(
        "FOR doc IN daily FILTER doc.Country == @country RETURN doc",
        bind_vars={'country': country}
    )    

    # Get DataFrame.
    print(type(cursor))
    print(type(cursor.count()))
    print([doc for doc in cursor])
#     df = pd.DataFrame.from_records([doc for doc in cursor])
#     return df
#     df = pd.DataFrame.from_records(
#             [doc for doc in cursor],
#             index=['Date'],
#             exclude=['_key', '_id', '_rev', 'Country']
#         )

#     # Sort data frame by date in ascending order.
#     df.sort_index(inplace=True)

#     # Add delta values.
#     df = pd.concat(
#         [
#             df,
#             df[['Confirmed', 'Recovered', 'Deaths']]
#                 .diff(axis=0)
#                 .rename(columns={
#                     'Confirmed': 'NewConfirmed',
#                     'Recovered': 'NewRecovered',
#                     'Deaths':    'NewDeaths'
#                 })
#         ],
#         axis=1
#     )

#     # Normalise NaN values.
#     df.fillna(0, inplace=True)

#     # Export as csv.
#     df.to_csv(f"{country}.csv")

def COVID19_Country2DF(country, ip='http://localhost:8529', dbname='COVID-19', col_name='daily'):
    '''
    Return a DataFrame with provided country data.

    INPUT
        country: Name of country.
    
    OUTPUT
        DataFrame: Country data with delta values.
    '''
    
    # Get connection to ArangoDB collection.
    client = ArangoClient(hosts=ip)
    db = client.db(dbname, username='root', password='HHeXW)YE3rm8cnPw')

    # Get data from database.
    cursor = db.aql.execute(
        'FOR doc IN daily FILTER doc.Country == @country RETURN doc',
        bind_vars={'country': country}
    )    

    # Get DataFrame.
    df = pd.DataFrame.from_records(
            [doc for doc in cursor],
            index=['Date'],
            exclude=['_key', '_id', '_rev', 'Country']
        )

    # Sort data frame by date in ascending order.
    df.sort_index(inplace=True)

    # Add delta values.
    df = pd.concat(
        [
            df,
            df[['Confirmed', 'Recovered', 'Deaths']]
                .diff(axis=0)
                .rename(columns={
                    'Confirmed': 'NewConfirmed',
                    'Recovered': 'NewRecovered',
                    'Deaths':    'NewDeaths'
                })
        ],
        axis=1
    )        

    # Normalise NaN values.
    df.fillna(0, inplace=True)

    # return.
    return df


In [2]:
x = COVID19_Country2CSV('Italy.csv')
x

<class 'arango.cursor.Cursor'>
<class 'NoneType'>
[]


In [8]:
country = 'Italy'
ip = 'http://localhost:8529'
dbname='COVID-19'
col_name = 'daily'

In [9]:
# Get connection to ArangoDB collection.
client = ArangoClient(hosts=ip)
db = client.db(dbname, username='root', password='HHeXW)YE3rm8cnPw')

In [10]:
# Get data from database.
cursor = db.aql.execute(
    'FOR doc IN daily FILTER doc.Country == @country RETURN doc',
    bind_vars={'country': country}
)    

In [11]:
# Get data frame.
df = pd.DataFrame.from_records(
            [doc for doc in cursor],
            index=['Date'],
            exclude=['_key', '_id', '_rev', 'Country']
        )
df.head()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Drate,Rrate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-26,453,12,3,2.649007,0.662252
2020-02-27,655,17,45,2.59542,6.870229
2020-02-18,3,0,0,0.0,0.0
2020-02-19,3,0,0,0.0,0.0
2020-03-10,10149,631,724,6.217361,7.133708


In [12]:
x = df.iteritems()

AttributeError: 'generator' object has no attribute 'next'

In [143]:
# Sort data frame by date in ascending order.
df.sort_index(inplace=True)

In [144]:
df.head()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Drate,Rrate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-13,453,12,3,2.649007,0.662252
2020-03-13,655,17,45,2.59542,6.870229
2020-03-13,3,0,0,0.0,0.0
2020-03-13,3,0,0,0.0,0.0
2020-03-13,10149,631,724,6.217361,7.133708
