# Dataframe_read_csv

## read_csv
 - [Simple](#Simple) 
 - [Manage type](#Manage-type) 
 - [Manage date conversion](#Manage-date-conversion)
 - [Simple lazy load with dask](#Simple-lazy-load-with-dask)
 - [Lazy load with processing and condition](#Lazy-load-with-processing-and-condition)
 - [Lazy load multi files](#Lazy-load-multi-files)

In [76]:
import os
import pandas as pd
import numpy as np
import datetime
import dask
import dask.dataframe as dd

## Simple

In [174]:
# Simplest load: only the separator and the index column are given,
# so pandas infers the type of every other column on its own.
path_file = "data/users_2018_11.csv"
users = pd.read_csv(path_file, sep=';', index_col=0)

# Show the frame itself, then its index, its column labels and the inferred dtypes.
display(users, users.index, users.columns, users.dtypes)

Unnamed: 0_level_0,name,email,age,size,timestamp,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Nolan,nolan@example.com,27.0,1.75,1544090000.0,2018-11-20 10:15:12
1,Evan,evan@example.com,34.0,1.65,,
2,Tess,tess@example.com,26.0,,1544089000.0,2018-11-19 10:10:42
3,Linda,linda@example.com,,1.83,1544068000.0,2018-11-8 18:12:48


Int64Index([0, 1, 2, 3], dtype='int64', name='id')

Index(['name', 'email', 'age', 'size', 'timestamp', 'date'], dtype='object')

name          object
email         object
age          float64
size         float64
timestamp    float64
date          object
dtype: object

## Manage type

In [175]:
# Manage types: hand read_csv an explicit column -> dtype mapping
# instead of relying on inference.
path_file = "data/users_2018_11.csv"

dtype = {
    # Text columns
    'name': 'str',
    'email': 'str',
    'timestamp': 'str',   # epoch seconds, kept as raw text in this cell
    'date': 'str',        # date-time text, kept as raw text in this cell
    # Numeric columns
    'age': np.float64,    # an integer in spirit; float so missing values are allowed
    'size': np.float64,
}

users = pd.read_csv(path_file, sep=';', index_col=0, dtype=dtype)

# Show the frame and confirm the requested dtypes were applied.
display(users, users.dtypes)

Unnamed: 0_level_0,name,email,age,size,timestamp,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Nolan,nolan@example.com,27.0,1.75,1544090329.0,2018-11-20 10:15:12
1,Evan,evan@example.com,34.0,1.65,,
2,Tess,tess@example.com,26.0,,1544089309.0,2018-11-19 10:10:42
3,Linda,linda@example.com,,1.83,1544067709.0,2018-11-8 18:12:48


name          object
email         object
age          float64
size         float64
timestamp     object
date          object
dtype: object

## Manage date conversion

In [176]:
# Manage date conversion: load the raw columns first, then convert the two
# time columns to datetime64 in a post-processing step.
path_file = "data/users_2018_11.csv"

dtype = {
    'name': 'str',
    'email': 'str',
    'age': np.float64,        # an integer in spirit; float so missing values are allowed
    'size': np.float64,
    'timestamp': np.float64,  # epoch seconds; must be numeric for to_datetime(unit='s')
    'date': 'str',            # date-time text, converted to datetime below
}

users = pd.read_csv(
    path_file,
    sep=';',
    dtype=dtype,
    index_col=0)

# Post-processing (rather than converting inside read_csv) handles missing
# values gracefully: they simply become NaT.
# `unit='s'` is meant for numeric input - passing strings together with `unit`
# is deprecated in recent pandas - hence 'timestamp' is read as float64 above.
users['timestamp'] = pd.to_datetime(users['timestamp'], unit='s')
users["date"] = pd.to_datetime(users["date"], format='%Y-%m-%d %H:%M:%S')

display(users)
display(users.dtypes)

Unnamed: 0_level_0,name,email,age,size,timestamp,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Nolan,nolan@example.com,27.0,1.75,2018-12-06 09:58:49,2018-11-20 10:15:12
1,Evan,evan@example.com,34.0,1.65,NaT,NaT
2,Tess,tess@example.com,26.0,,2018-12-06 09:41:49,2018-11-19 10:10:42
3,Linda,linda@example.com,,1.83,2018-12-06 03:41:49,2018-11-08 18:12:48


name                 object
email                object
age                 float64
size                float64
timestamp    datetime64[ns]
date         datetime64[ns]
dtype: object

## Simple lazy load with dask 

In [177]:
# Simple lazy load: dask only builds a task graph here; nothing is read
# until .compute() is called.
path_file = "data/users_2018_11.csv"

dtype = {
    'name': 'str',
    'email': 'str',
    'age': np.float64,        # an integer in spirit; float so missing values are allowed
    'size': np.float64,
    'timestamp': np.float64,  # epoch seconds; must be numeric for to_datetime(unit='s')
    'date': 'str',            # date-time text, converted to datetime below
}

dd_users = dd.read_csv(
    path_file,
    sep=';',
    dtype=dtype)

# Compute: materialise the lazy dask frame as an in-memory pandas DataFrame.
users = dd_users.compute()

# Post-processing on the pandas side; missing values simply become NaT.
# `unit='s'` is meant for numeric input - passing strings together with `unit`
# is deprecated in recent pandas - hence 'timestamp' is read as float64 above.
users['timestamp'] = pd.to_datetime(users['timestamp'], unit='s')
users["date"] = pd.to_datetime(users["date"], format='%Y-%m-%d %H:%M:%S')

display(users)
display(users.dtypes)


Unnamed: 0,id,name,email,age,size,timestamp,date
0,0,Nolan,nolan@example.com,27.0,1.75,2018-12-06 09:58:49,2018-11-20 10:15:12
1,1,Evan,evan@example.com,34.0,1.65,NaT,NaT
2,2,Tess,tess@example.com,26.0,,2018-12-06 09:41:49,2018-11-19 10:10:42
3,3,Linda,linda@example.com,,1.83,2018-12-06 03:41:49,2018-11-08 18:12:48


id                    int64
name                 object
email                object
age                 float64
size                float64
timestamp    datetime64[ns]
date         datetime64[ns]
dtype: object

## Lazy load with processing and condition

In [209]:
# Lazy load with processing and condition: the conversions and the filters
# are all added to the dask graph and only executed by the final .compute().
path_file = "data/users_2018_11.csv"

dtype = {
    'name': 'str',
    'email': 'str',
    'age': np.float64,        # an integer in spirit; float so missing values are allowed
    'size': np.float64,
    'timestamp': np.float64,  # epoch seconds; must be numeric for to_datetime(unit='s')
    'date': 'str',            # date-time text, converted to datetime below
}

dd_users = dd.read_csv(
    path_file,
    sep=';',
    dtype=dtype)

# Lazy processing
# `meta` declares the (name, dtype) of each converted series up front.
meta_timestamp = ('timestamp', 'datetime64[ns]')
meta_date = ('date', 'datetime64[ns]')

# Columns are replaced with item assignment (dd_users['col'] = ...), which is
# explicit and cannot be mistaken for setting a plain Python attribute.
dd_users['timestamp'] = dd_users['timestamp'].map_partitions(
    pd.to_datetime,
    unit='s',
    meta=meta_timestamp)

dd_users['date'] = dd_users['date'].map_partitions(
    pd.to_datetime,
    format='%Y-%m-%d %H:%M:%S',  # with fractional seconds use '%Y-%m-%d %H:%M:%S.%f'
    meta=meta_date)

# Lazy conditions
# - Drop the rows whose 'date' is missing
dd_users = dd_users.dropna(subset=['date'])

# - Keep the rows with date_after <= date < date_before
date_after = np.datetime64('2018-11-09 12:15:46')
date_before = np.datetime64('2018-11-19 12:15:46')
dd_users = dd_users[
    (dd_users['date'] >= date_after) & (dd_users['date'] < date_before)
]

# Compute: run the whole graph and get a pandas DataFrame back.
users = dd_users.compute()

display(users)
display(users.dtypes)

Unnamed: 0,id,name,email,age,size,timestamp,date
2,2,Tess,tess@example.com,26.0,,2018-12-06 09:41:49,2018-11-19 10:10:42


id                    int64
name                 object
email                object
age                 float64
size                float64
timestamp    datetime64[ns]
date         datetime64[ns]
dtype: object

## Lazy load multi files

In [211]:
# Multi-file lazy load: the trick is the glob pattern in the path, which
# makes dask read every matching CSV file into one lazy frame.
path_file = "data/users_2018_*.csv"

dtype = {
    # Text columns
    'name': 'str',
    'email': 'str',
    'timestamp': 'str',   # epoch seconds, kept as raw text in this cell
    'date': 'str',        # date-time text, kept as raw text in this cell
    # Numeric columns
    'age': np.float64,    # an integer in spirit; float so missing values are allowed
    'size': np.float64,
}

dd_users = dd.read_csv(path_file, sep=';', dtype=dtype)

# Index caveat: the index of the result is not monotonically increasing from 0;
# it restarts at 0 for each partition (i.e. each file), because dask cannot
# statically know the full length of the index of every partition.
# Calling dd_users.reset_index() would not change that (see the dask
# documentation of reset_index).
users = dd_users.compute()

display(users, users.dtypes)

Unnamed: 0,id,name,email,age,size,timestamp,date
0,0,Ted,ted@example.com,27.0,1.75,1544090329.0,2018-10-20 10:15:12
1,1,Mary,mary@example.com,34.0,1.65,,
2,2,Alex,alex@example.com,26.0,,1544059309.0,2018-10-19 10:10:42
3,3,Tod,Tod@example.com,,1.83,1544067709.0,2018-10-8 18:12:48
0,0,Nolan,nolan@example.com,27.0,1.75,1544090329.0,2018-11-20 10:15:12
1,1,Evan,evan@example.com,34.0,1.65,,
2,2,Tess,tess@example.com,26.0,,1544089309.0,2018-11-19 10:10:42
3,3,Linda,linda@example.com,,1.83,1544067709.0,2018-11-8 18:12:48


id             int64
name          object
email         object
age          float64
size         float64
timestamp     object
date          object
dtype: object