## Pandas数据类型简介

### Pandas数据类型简介


In [1]:
# Numpy使用
# 用ndarray进行存储

import numpy as np
# 创建ndarray
score = np.array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])

In [None]:
# ndarray与Python原生list运算效率对比

In [3]:
import random
import time
import numpy as np
a = []
for i in range(100000000):
    a.append(random.random())


In [4]:
t1 = time.time()
sum1=sum(a)
t2=time.time()



In [5]:
b=np.array(a)
t4=time.time()
sum3=np.sum(b)
t5=time.time()


In [6]:
print(t2-t1, t5-t4)

0.44531798362731934 0.0919642448425293


### ndarray介绍


In [7]:
score

array([[80, 89, 86, 67, 79],
       [78, 97, 89, 67, 81],
       [90, 94, 78, 67, 74],
       [91, 91, 90, 67, 69],
       [76, 87, 75, 67, 86],
       [70, 79, 84, 67, 84],
       [94, 92, 93, 67, 64],
       [86, 85, 83, 67, 80]])

In [None]:
### 属性

In [8]:
score.dtype

dtype('int64')

In [9]:
score.size

40

In [10]:
score.shape

(8, 5)

In [11]:
score.ndim

2

In [1]:
# ndarray的形状


In [2]:
a = np.array([[1,2,3],[4,5,6]])
b = np.array([1,2,3,4])
c = np.array([[[1,2,3],[4,5,6]],[[1,2,3],[4,5,6]]])



In [3]:
#打印形状
a.shape


(2, 3)

In [4]:
b.shape


(4,)

In [5]:
c.shape

(2, 2, 3)

In [6]:
# 创建数组的时候指定类型
a = np.array([[1, 2, 3],[4, 5, 6]], dtype=np.float32)
a.dtype


dtype('float32')

### Pandas的数据类型

In [8]:
# 查看数据类型
import pandas as pd
import seaborn as sns
tips = sns.load_dataset('tips')
tips.dtypes


total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

## 类型转换

### astype方法是通用函数

In [10]:
# astype方法是通用函数，可用于把DataFrame中的任何列转换为其他dtype
# 可以向astype方法提供任何内置类型或numpy类型来转换列的数据类型
# 把total_bill转换成字符串
tips['total_bill'] = tips['total_bill'].astype(str)
tips.dtypes


total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [11]:
#把total_bill转换回float类型
tips['total_bill'] = tips['total_bill'].astype(float)
tips.dtypes


total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object


### 转换为数值类型

In [12]:


#创造包含'missing'为缺失值的数据
tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7],'total_bill'] = 'missing'
tips_sub_miss



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,missing,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,missing,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,missing,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,missing,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [13]:
#查看数据类型 dtypes 会发现total_bill列变成了字符串对象类型
tips_sub_miss.dtypes	




total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [14]:
# 对上面的数据集使用astype方法把total_bill 列转换回float类型,会抛错
# Pandas 无法把'missing'转换成float
tips_sub_miss['total_bill'].astype(float)


ValueError: could not convert string to float: 'missing'

In [15]:
# 如果使用Pandas库中的to_numeric函数进行转换,也会得到类似的错误
pd.to_numeric(tips_sub_miss['total_bill'])


ValueError: Unable to parse string "missing" at position 1

In [16]:
pd.to_numeric(tips_sub_miss['total_bill'],errors = 'ignore') 

0      16.99
1    missing
2      21.01
3    missing
4      24.59
5    missing
6       8.77
7    missing
8      15.04
9      14.78
Name: total_bill, dtype: object

In [17]:
pd.to_numeric(tips_sub_miss['total_bill'],errors = 'coerce')


0    16.99
1      NaN
2    21.01
3      NaN
4    24.59
5      NaN
6     8.77
7      NaN
8    15.04
9    14.78
Name: total_bill, dtype: float64

In [19]:
# to_numeric向下转型

pd.to_numeric(tips_sub_miss['total_bill'],errors = 'coerce',downcast='float')


0    16.99
1      NaN
2    21.01
3      NaN
4    24.59
5      NaN
6     8.77
7      NaN
8    15.04
9    14.78
Name: total_bill, dtype: float32

## 类别数据

### category介绍

In [20]:
# 转换为category类型
tips['sex'] = tips['sex'].astype('str')
tips.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
dtypes: category(3), float64(2), int64(1), object(1)
memory usage: 8.8+ KB


In [21]:
tips['sex'] = tips['sex'].astype('category')
tips.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


### 利用category排序

In [22]:
s = pd.Series(pd.Categorical(["a", "b", "c", "d"],categories=["c", "b", "a"]))


In [23]:
s

0      a
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [c, b, a]

In [24]:
# category排序
# 准备数据
# 创建categorical型Series
series_cat = pd.Series(['B','D','C','A'], dtype='category')


In [25]:
series_cat

0    B
1    D
2    C
3    A
dtype: category
Categories (4, object): [A, B, C, D]

In [27]:

# 对数据排序
series_cat.sort_values()


3    A
0    B
2    C
1    D
dtype: category
Categories (4, object): [A, B, C, D]

In [28]:
# 指定顺序 CategoricalDtype
from pandas.api.types import CategoricalDtype
cat = CategoricalDtype(categories=['B','D','A','C'],ordered=True)
series_cat1 = series_cat.astype(cat)
series_cat.sort_values()


3    A
0    B
2    C
1    D
dtype: category
Categories (4, object): [A, B, C, D]

In [30]:
series_cat1.sort_values()


0    B
1    D
3    A
2    C
dtype: category
Categories (4, object): [B < D < A < C]

In [31]:
# 想要临时修改排序规则，可以使用.cat.reorder_categories()方法：
series_cat.cat.reorder_categories(['D','B','C','A'],ordered=True,
                                  inplace=True)#inplace参数设置为True使得变动覆盖原数据
series_cat.sort_values()

1    D
0    B
2    C
3    A
dtype: category
Categories (4, object): [D < B < C < A]