In [1]:
import os

import numpy as np
import pandas as pd

## Basic File I/O using numpy
- There are two broad I/O categories:
    1. When the data to be read/written is binary
    2. When the data to be read/written is text

In [2]:
## Reading data from text files
# No skip_header is given, so this assumes the file has no header row; a text
# header line would not parse as float (pass skip_header=1 in that case).
data = np.genfromtxt("./data/numpy_data.csv",delimiter=",")

In [3]:
# Mixed-type columns: dtype=None lets genfromtxt pick a type for each column,
# which yields a structured array (one named field per column, see the output).
# skip_header=1 drops the header line before parsing.
np.genfromtxt(
    './data/np_data_mixed.csv',
    delimiter=",",
    skip_header=1,
    dtype=None,
    encoding=None,
)

array([('M', 24., 180., 80.), ('M', 23., 170., 60.),
       ('M', 24.,  nan, nan), ..., ('M', 27., 176., 59.),
       ('M', 30., 185., 96.), ('M', 34., 185., 96.)],
      dtype=[('f0', '<U1'), ('f1', '<f8'), ('f2', '<f8'), ('f3', '<f8')])

It's more popular to use numpy to store data, as it can compress the data into a more compact form.

In [4]:
# Load the large CSV with pandas (imported in the top import cell).
# header=None tells pandas not to treat the first line as column names,
# so the columns are labelled with integers instead.
big_file = pd.read_csv('./data/big_np_data.csv', header=None)

In [5]:
# Preview the first five rows of the frame.
big_file.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [6]:
# Pull the DataFrame contents out as a plain numpy array; this is the array
# that gets written to the .npz archives below.
npz = big_file.values

In [7]:
# Write the array to an uncompressed .npz archive. savez accepts a path and
# handles opening/closing itself; a lone positional array is stored under the
# key 'arr_0', which is spelled out here.
np.savez("./data/uncompressed.npz", arr_0=npz)

In [8]:
# Same archive layout as savez, but the stored array is compressed.
# savez_compressed accepts a path and handles opening/closing itself; the
# array is stored under the key 'arr_0'.
np.savez_compressed("./data/compressed.npz", arr_0=npz)

In [9]:
# Size in bytes of the uncompressed archive (os is imported in the top import cell).
os.path.getsize('./data/uncompressed.npz')

25484422

In [10]:
# Size in bytes of the compressed archive.
os.path.getsize('./data/compressed.npz')

5429286

In [11]:
# Size in bytes of the original CSV, for comparison with the two archives.
os.path.getsize('./data/big_np_data.csv')

36543581

As you can see, the files saved in `npz` format are much smaller than the original CSV: about 25 MB uncompressed and 5 MB compressed, versus roughly 37 MB for the CSV.

#### Reading binary data using numpy

In [12]:
# Open the compressed archive. allow_pickle=True is needed because the stored
# array has object dtype, which numpy serialises with pickle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
dat = np.load('./data/compressed.npz',allow_pickle=True)

In [13]:
# List the names of the arrays stored in the archive.
dat.files

['arr_0']

In [14]:
# Retrieve the stored array by its name.
dat['arr_0']

array([[1, 'A Dijiang', 'M', ..., 'Basketball',
        "Basketball Men's Basketball", nan],
       [2, 'A Lamusi', 'M', ..., 'Judo', "Judo Men's Extra-Lightweight",
        nan],
       [3, 'Gunnar Nielsen Aaby', 'M', ..., 'Football',
        "Football Men's Football", nan],
       ...,
       [135570, 'Piotr ya', 'M', ..., 'Ski Jumping',
        "Ski Jumping Men's Large Hill, Team", nan],
       [135571, 'Tomasz Ireneusz ya', 'M', ..., 'Bobsleigh',
        "Bobsleigh Men's Four", nan],
       [135571, 'Tomasz Ireneusz ya', 'M', ..., 'Bobsleigh',
        "Bobsleigh Men's Four", nan]], dtype=object)

#### Class Exercise
Use this [url](http://api.worldbank.org/v2/countries/IND/indicators/NY.GDP.MKTP.CD?per_page=5000&format=json) to fetch data on India's GDP. Store this data in a compressed npz format. You need to save the year as well as the GDP figures.

## Manipulating Numpy Data

In [15]:
# Confirm what kind of object genfromtxt returned.
type(data)

numpy.ndarray

In [16]:
# Display the array; numpy abbreviates large arrays with "..." in the output.
data

array([[ 24., 180.,  80.],
       [ 23., 170.,  60.],
       [ 24.,  nan,  nan],
       ...,
       [ 27., 176.,  59.],
       [ 30., 185.,  96.],
       [ 34., 185.,  96.]])

In [17]:
## Indexing
# shape is (number of rows, number of columns)
data.shape

(271116, 3)

In [18]:
## all rows, column 0 (the result is 1-D)
data[:,0]

array([24., 23., 24., ..., 27., 30., 34.])

In [19]:
## all rows; columns 0 and 1 (the stop index 2 is exclusive)
data[:, :2]

array([[ 24., 180.],
       [ 23., 170.],
       [ 24.,  nan],
       ...,
       [ 27., 176.],
       [ 30., 185.],
       [ 34., 185.]])

In [20]:
## `...` (Ellipsis) stands for all leading axes, so on this 2-D array
## this selects all rows of column 0, the same as data[:,0]
data[...,0]

array([24., 23., 24., ..., 27., 30., 34.])

In [21]:
## all rows (via Ellipsis); columns 0 and 1 (the stop index 2 is exclusive)
data[..., :2]

array([[ 24., 180.],
       [ 23., 170.],
       [ 24.,  nan],
       ...,
       [ 27., 176.],
       [ 30., 185.],
       [ 34., 185.]])

In [22]:
## Boolean indexing
# Build a boolean mask: True wherever column 1 is NaN.
np.isnan(data[:,1])

array([False, False,  True, ..., False, False, False])

In [23]:
# Show the array again for reference before filtering column 1 with a mask.
data

array([[ 24., 180.,  80.],
       [ 23., 170.,  60.],
       [ 24.,  nan,  nan],
       ...,
       [ 27., 176.,  59.],
       [ 30., 185.,  96.],
       [ 34., 185.,  96.]])

In [24]:
# Column-1 entries that are NaN: the boolean mask picks the rows and the
# integer picks the column, in a single indexing step.
data[np.isnan(data[:, 1]), 1]

array([nan, nan, nan, ..., nan, nan, nan])

In [25]:
# Column-1 entries that are not NaN: ~ inverts the mask, which picks the rows
# while the integer picks the column.
data[~np.isnan(data[:, 1]), 1]

array([180., 170., 185., ..., 176., 185., 185.])