# Recolher e carregar dados com o pandas

![Recolha de dados em investigação quantitativa](http://joaomrcarvalho.github.io/images/Data-collection-in-quantitative-research-4-1024x538.jpg)

In [None]:
import pandas as pd
import numpy as np

## Formatos tradicionais

### Carregar dados .csv

In [None]:
# Small demo frame with deliberately messy values: a blank and a '-' in x2,
# and a mix of ints, a numeric string, NaN and a float in x3.
df = pd.DataFrame(
    dict(
        id=[0, 1, 2, 3, 4, 5],
        x1=['A', 'B', 'C', 'D', 'E', 'F'],
        x2=['a', ' ', 'c', '-', 'e', 'f'],
        x3=[1, 2, '3', np.nan, 5, 6.0],
    )
)

print(df)

# First round trip with the defaults: the row index is written to the file
# and comes back as an extra column when the file is read again.
df.to_csv('file.csv')
df = pd.read_csv('file.csv')

# Second write, this time ';'-separated; the row index is written once more.
df.to_csv('file.csv', sep=';')

In [None]:
# Read file.csv with the default separator. The file was last written with
# sep=';', so the default separator presumably does not split the fields.
df = pd.read_csv('file.csv')
print(df)
# not what we would expect

In [None]:
# Read the file again, this time declaring ';' as the field separator.
df_1 = pd.read_csv('file.csv',sep=';')
print(df_1)

In [None]:
# Discard the two stray 'Unnamed' columns, modifying df_1 in place.
df_1.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

print(df_1)

In [None]:
# Overwrite file.csv as comma-separated text, without writing the row index.
df_1.to_csv('file.csv', sep=',', index=False)

In [None]:
# Read file.csv using the 'id' column as the row index; header=0 takes the
# column names from the first row.
df = pd.read_csv('file.csv', index_col='id', header=0)
print(df)

In [None]:
# Read file.csv with 'id' as the row index, additionally treating '-' and a
# single space as missing values.
df = pd.read_csv('file.csv', index_col='id', header=0, na_values=['-',' '])

print(df)

### Carregar dados excel

In [None]:
# Load the sample workbook from its URL, using 'id' as the row index and
# treating '-' and a single space as missing values.
df = pd.read_excel('https://joaomrcarvalho.github.io/datasets/dm/file.xlsx', index_col='id', header=0, na_values=['-',' '])

# Save a local copy as 'file.xlsx' in the working directory.
df.to_excel('file.xlsx')

print(df)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
# Explicit per-column types to apply while parsing the sheet.
# NOTE(review): 'x3': int may fail if the column still holds missing values
# once na_values has been applied — confirm against the file.
dtype = {'x1':str, 'x2':object, 'x3':int} 

# Read the local workbook with 'id' as the row index, the missing-value
# markers, and the column types declared above.
df = pd.read_excel('file.xlsx', index_col='id', 
                   header=0, na_values=['-',' '], dtype=dtype)

# Show the resulting column types.
df.info()

print(df)

In [None]:
# Replace the header row (header=0) with custom column names.
# `names` has to cover every column of the sheet, including the one used as
# the row index: index_col='id' is looked up among these names, so leaving
# 'id' out makes the index column impossible to resolve.
# NOTE(review): assumes the sheet has four columns (id plus three data
# columns) — confirm against file.xlsx.
df = pd.read_excel('file.xlsx', index_col='id', header=0,
                   na_values=['-',' '],
                   names=['id','column_1','column_2','column_3'])

print(df)

### Carregar dados json

In [None]:
# Load the first sample JSON document from its URL and preview it.
df = pd.read_json("https://joaomrcarvalho.github.io/datasets/dm/file_json_1.json")

print(df.head())

print('--------------------------------------------------------------------------------------------------------')

# Load the second sample document; `df` now refers to this one.
df = pd.read_json("https://joaomrcarvalho.github.io/datasets/dm/file_json_2.json")

print(df.head())

print('--------------------------------------------------------------------------------------------------------')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
# Write the frame as a plain JSON file ...
df.to_json("file_json_2.json")

# ... and again as a gzip-compressed file.
df.to_json("file_json_2.gz", compression='gzip')


# Read the compressed copy back; no compression argument is passed here.
df = pd.read_json("file_json_2.gz")

print(df)

In [None]:
# Reload the plain JSON copy, then write it out under three different
# `orient` settings. The comment above each call sketches the layout that
# orient is meant to produce.
df = pd.read_json("file_json_2.json")


# orient='index': one entry per row label, each holding that row's fields.
#{
#    "0":{"category":0,"datetime":1420070400000,"integer":5},
#    "1":{"category":0,"datetime":1420070401000,"integer":5}
#}
df.to_json('file_1.json',orient='index')



# orient='records': a list with one object per row.
#[
#    {"category":0,"datetime":1420070400000,"integer":5},
#    {"category":0,"datetime":1420070401000,"integer":5}
#]
df.to_json('file_2.json',orient='records')

# orient='columns': one entry per column, each mapping row label to value.
#{
# "category":{"0":0,"1":0},
# "datetime":{"0":1420070400000,"1":1420070401000},
# "integer":{"0":5,"1":5}
#}
df.to_json('file_3.json',orient='columns')

In [None]:
# Read the same JSON file under three different text encodings, previewing
# each result. A separator line is printed after the first two previews only.
for _codec in ('ascii', 'latin-1'):
    df = pd.read_json("file_json_2.json", encoding=_codec)
    print(df.head())
    print("--------------------------------------")

df = pd.read_json("file_json_2.json", encoding='utf-8')
print(df.head())

## Formatos para Big Data

### Parquet

In [None]:
# Uncomment to install pyarrow (imported further down as pyarrow.parquet).
# !pip install pyarrow

# Download the sample dataset into the working directory (shell escape).
!wget https://joaomrcarvalho.github.io/datasets/dm/parquet/adult.data
    
# Parse the downloaded file as CSV.
df = pd.read_csv('adult.data')

# Write the frame to a Snappy-compressed Parquet file; engine='auto' leaves
# the choice of Parquet backend to pandas.
df.to_parquet('adult.snappy.parquet', engine='auto', compression='snappy')

In [None]:
# List the working directory in long format (shell escape), e.g. to inspect
# the files written so far.
!ls -l

In [None]:
import pyarrow.parquet as pq

# Read only three columns from the Parquet file and convert the result to a
# pandas object.
data = pq.read_pandas('adult.snappy.parquet', columns=['age','education','capital-gain']).to_pandas()

print(data.head())

In [None]:
# Read the same three columns again, this time keeping the object returned
# by read_pandas as-is (no .to_pandas() conversion).
data = pq.read_pandas('adult.snappy.parquet',columns=['age','education','capital-gain'])

# Write it out as a dataset under 'parquet_dataset', partitioned by 'age'.
pq.write_to_dataset(data, root_path='parquet_dataset', partition_cols=['age'])

## Dados web

### HTML

In [None]:
# Parse the tables of the sample page. The result is indexed by position
# below, one entry per table.
data = pd.read_html('https://joaomrcarvalho.github.io/datasets/dm/web_data/simple_tables.html')

# The whole result first ...
print(data)

print('-------------------------------------------------------------------------')

# ... then the first table on its own ...
print(data[0])

print('-------------------------------------------------------------------------')

# ... and the second one.
print(data[1])

In [None]:
# Same page, now with header=0 so that row 0 supplies the column names.
data = pd.read_html('https://joaomrcarvalho.github.io/datasets/dm/web_data/simple_tables.html',header=0)

print(data[1])

In [None]:
# Same page and header setting, explicitly selecting the 'bs4' parsing flavor.
data = pd.read_html('https://joaomrcarvalho.github.io/datasets/dm/web_data/simple_tables.html',header=0,flavor='bs4')

print(data[1])

In [None]:
# Restrict the result to tables matching the text "x2".
data = pd.read_html('https://joaomrcarvalho.github.io/datasets/dm/web_data/simple_tables.html', match="x2", header=0, flavor='bs4')

print(data)

**Exemplo 1 (Wikipedia):**

In [None]:
# Parse the tables of the Wikipedia article on Python.
data = pd.read_html('https://en.wikipedia.org/wiki/Python_(programming_language)', header=0, flavor='bs4')

# pick the "Summary of Python 3's built-in types" table from the page
# NOTE(review): index 1 depends on the page's current layout — confirm it
# still points at that table.
print(data[1])

**Exemplo 2 (Moedas):**

In [None]:
# Parse the tables of the FNB forex rates page.
data = pd.read_html('https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList', header=0, flavor='bs4')

# Show the second table of the result.
# NOTE(review): which table holds the rates depends on the page layout — confirm.
print(data[1])

**Sugestão de exercício**: Escolher outra página da web, carregar os dados da mesma e salvar em formato csv, json ou parquet.

### Aceder a APIs 

In [None]:
import requests
import json

# Fetch before opening the output file, so that a failed request never leaves
# a truncated or empty api_example.json behind.
# NOTE(review): confirm this endpoint is still being served.
r = requests.get('https://api.coinmarketcap.com/v1/ticker/?limit=50', timeout=10)

# Fail loudly on an HTTP error instead of saving an error payload.
r.raise_for_status()
js = r.json()

# Store the decoded payload as a local JSON file.
with open('api_example.json', 'w') as file:
    json.dump(js, file)

# Load the saved file as a list of records.
data = pd.read_json('api_example.json', orient='records')

print(data.head())

In [None]:
import requests
import json

# Query the API with a timeout so that a stalled connection cannot hang the
# notebook.
# NOTE(review): confirm this endpoint is still being served.
r = requests.get('https://api.coinmarketcap.com/v1/ticker/?limit=50', timeout=10)

# Fail loudly on an HTTP error instead of building a frame from an error
# payload.
r.raise_for_status()

js = r.json()

# Build the frame directly from the decoded payload, with no intermediate
# file.
data = pd.DataFrame(js)

print(data.head())

In [None]:
# Let pandas fetch and parse the API response itself, passing the URL
# straight to read_json with the 'records' orientation.
data = pd.read_json('https://api.coinmarketcap.com/v1/ticker/?limit=50',orient='records')

print(data.head())