# How to load data

# 0. Example dataset
## Pima Indians Dataset
- A population of women who were at least 21 years old, of Pima Indian heritage and living near Phoenix, Arizona, was tested for diabetes according to World Health Organization criteria.
- The data were collected by the US National Institute of Diabetes and Digestive and Kidney Diseases.
- https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

# 1. Read the data file from local disk

## Using pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the CSV with pandas.read_csv(); pandas can read many file formats.
#
# If the first row of a file holds the column names, pass header=0 (this is
# also what the default header='infer' does when `names` is not given).
# The first row of this file is already data, so header=None is passed and
# pandas labels the columns with integers instead.
data = pd.read_csv(
    './data/pima-indians-diabetes.data',
    header=None,
)

In [3]:
# Take a quick look at the first rows of the data.
# Because the file was read with pandas, the loaded object is a DataFrame.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Check the number of data points (rows) and the number of variables (columns).
data.shape

(768, 9)

In [7]:
# Assign variable (column) names to a DataFrame that has already been loaded:
# the list must contain one name per column, in column order.
variables = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
data.columns = variables

In [8]:
# Preview the first rows again to confirm the new column names were applied.
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [37]:
# Alternatively, hand the column names to pandas.read_csv() up front via
# `names`, so the DataFrame is labelled as soon as it is loaded.
data2 = pd.read_csv(
    './data/pima-indians-diabetes.data',
    names=variables,
)
data2.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Using numpy

In [14]:
# Load the file with numpy.loadtxt, telling it which delimiter separates
# the values on each line.
#
# The file is opened in a `with` block so that it is always closed, even if
# np.loadtxt raises while parsing (a manual close() would be skipped then).
with open('./data/pima-indians-diabetes.data', 'r') as raw_data:
    data = np.loadtxt(raw_data, delimiter=',')

In [11]:
# Data read with numpy is an array (not a DataFrame).
data

array([[   6.   ,  148.   ,   72.   , ...,    0.627,   50.   ,    1.   ],
       [   1.   ,   85.   ,   66.   , ...,    0.351,   31.   ,    0.   ],
       [   8.   ,  183.   ,   64.   , ...,    0.672,   32.   ,    1.   ],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,    0.245,   30.   ,    0.   ],
       [   1.   ,  126.   ,   60.   , ...,    0.349,   47.   ,    1.   ],
       [   1.   ,   93.   ,   70.   , ...,    0.315,   23.   ,    0.   ]])

In [15]:
# Dimensions of the array as (rows, columns).
data.shape

(768, 9)

# 2. Read the data file from a URL

## Using pandas 

In [43]:
# pandas can read data directly from a URL just as easily as from a local path.
url = "https://goo.gl/vhm1eU"
# Reuse the `variables` list of column names defined earlier in this notebook
# (the previous `variab` was an undefined name and raised NameError).
data = pd.read_csv(url, names=variables)

In [44]:
# Preview the first rows of the DataFrame loaded from the URL.
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
# Dimensions of the DataFrame as (rows, columns).
data.shape

(768, 9)

## Using numpy 

In [34]:
# The data behind a URL can be read through urllib.request.urlopen.
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so the attribute lookup only works if some other
# package happened to import it first.
import urllib.request

# The response object is a context manager; the `with` block closes the
# connection even if np.loadtxt raises while parsing.
with urllib.request.urlopen(url) as raw_data:
    data = np.loadtxt(raw_data, delimiter=',')

In [35]:
# Display the array loaded from the URL.
data

array([[   6.   ,  148.   ,   72.   , ...,    0.627,   50.   ,    1.   ],
       [   1.   ,   85.   ,   66.   , ...,    0.351,   31.   ,    0.   ],
       [   8.   ,  183.   ,   64.   , ...,    0.672,   32.   ,    1.   ],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,    0.245,   30.   ,    0.   ],
       [   1.   ,  126.   ,   60.   , ...,    0.349,   47.   ,    1.   ],
       [   1.   ,   93.   ,   70.   , ...,    0.315,   23.   ,    0.   ]])

In [36]:
# Dimensions of the array as (rows, columns).
data.shape

(768, 9)