# Read Data

## Pandas Introduction

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain list; the NaN entry makes the whole Series float64.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])

In [3]:
# Display the Series: default integer labels on the left, values on the right.
s

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

In [4]:
# Six consecutive days starting 2016-01-01; date_range defaults to daily frequency.
dates = pd.date_range(start='20160101', periods=6)
dates

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of standard-normal draws: one row per entry of `dates`, columns a-d.
# NOTE: the generator is not seeded, so the values change on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,a,b,c,d
2016-01-01,-0.011795,0.503747,-1.450865,1.032197
2016-01-02,1.435253,-0.513001,0.511526,0.891149
2016-01-03,-0.530293,-1.113945,0.89421,-1.395363
2016-01-04,1.320742,1.295067,-0.137235,0.331395
2016-01-05,1.852711,-1.19573,0.909691,-0.32035
2016-01-06,0.019564,0.011369,-0.190274,-0.545362


In [6]:
# Rebind `df` to a 3x4 frame holding 0..11 in row-major order.
# No labels are supplied, so rows and columns get default integer labels.
df = pd.DataFrame(data=np.arange(12).reshape((3, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [7]:
# Build a frame from a dict: each key becomes a column. Scalar values
# ('A', 'B', 'F') are repeated for every row; the array-like values
# ('C', 'D', 'E') each have four entries, giving four rows.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(data=1, index=list(range(4))),
        'D': np.array([3] * 4),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1,3,test,foo
1,1.0,2013-01-02,1,3,train,foo
2,1.0,2013-01-02,1,3,test,foo
3,1.0,2013-01-02,1,3,train,foo


In [8]:
# One dtype per column, returned as a Series indexed by column name.
df2.dtypes

A           float64
B    datetime64[ns]
C             int64
D             int64
E          category
F            object
dtype: object

In [9]:
# Row labels of df2.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

## Read Data

|Data type|Description|Pandas reader|
|----|----|----|
|csv, tsv, txt|Delimited plain-text file|`pd.read_csv`|
|excel|Microsoft Excel workbook (xls or xlsx)|`pd.read_excel`|
|mysql|SQL database table or query|`pd.read_sql`|

### Read CSV

In [10]:
# Ratings CSV; the path is relative to the notebook's working directory.
path = 'datas/ml-latest-small/ratings.csv'

In [11]:
# Load with read_csv defaults: comma-separated, first line used as column names.
data1 = pd.read_csv(path)

In [12]:
# Preview the first five rows
data1.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


###### View the size

In [13]:
# (number of rows, number of columns)
data1.shape

(100836, 4)

###### View the column names and the row index

In [14]:
# Column labels, taken from the CSV header line.
data1.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [15]:
# Row labels; a default RangeIndex, since read_csv was not given an index column.
data1.index

RangeIndex(start=0, stop=100836, step=1)

###### View the data type of each column

In [16]:
# dtype that read_csv inferred for each column.
data1.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Read txt

In [17]:
# Plain-text file to load; the path is relative to the notebook's working directory.
path2 = 'datas/crazyant/access_pvuv.txt'

In [18]:
# Parse the file as tab-delimited with no header line (header=None), so the
# column names are supplied explicitly through `names`.
data2 = pd.read_csv(path2, sep='\t', header=None, names=['pdate', 'pv', 'uv'])

data2

Unnamed: 0,pdate,pv,uv
0,2019-09-10,139,92
1,2019-09-09,185,153
2,2019-09-08,123,59
3,2019-09-07,65,40
4,2019-09-06,157,98
5,2019-09-05,205,151
6,2019-09-04,196,167
7,2019-09-03,216,176
8,2019-09-02,227,148
9,2019-09-01,105,61


### Read Excel

In [19]:
# Excel workbook to load; the path is relative to the notebook's working directory.
path3 = 'datas/crazyant/access_pvuv.xlsx'

In [20]:
# read_excel with defaults: reads the first worksheet and uses its first row
# as the column names.
data3 = pd.read_excel(path3)
data3

Unnamed: 0,日期,PV,UV
0,2019-09-10,139,92
1,2019-09-09,185,153
2,2019-09-08,123,59
3,2019-09-07,65,40
4,2019-09-06,157,98
5,2019-09-05,205,151
6,2019-09-04,196,167
7,2019-09-03,216,176
8,2019-09-02,227,148
9,2019-09-01,105,61
