### series

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)

print(data.values, type(data.values))
print(data.index, type(data.index))


0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
[0.25 0.5  0.75 1.  ] <class 'numpy.ndarray'>
RangeIndex(start=0, stop=4, step=1) <class 'pandas.core.indexes.range.RangeIndex'>


### series索引

In [None]:
# series 索引类似字典，index相当于key

In [5]:
data = pd.Series([1, 2, 3, 4], index = ['a', 'b', 'c', 'd'])
print(data)
data2 = pd.Series(np.random.rand(5), index = np.arange(5))
print(data2)

print(data['a':'c']) #data['a':'c']会包含索引为'a'到'c'的所有元素，包括'c'
print(data[0:2])

a    1
b    2
c    3
d    4
dtype: int64
0    0.041483
1    0.477991
2    0.253460
3    0.524919
4    0.008085
dtype: float64
a    1
b    2
c    3
dtype: int64
a    1
b    2
dtype: int64


In [None]:
# 基于标签的索引（比如Pandas中的DataFrame或Series对象）会包含左右边界的元素

## DataFrame

In [7]:
area_dict = {"california":"cf", "Texas":"TX", "Newyork":"ny", "florida":"frd", "ollinois":"oll"}
area = pd.Series(area_dict)
print(area)

population = pd.Series({"california":1, "Texas":2, "Newyork":3, "florida":4})
state = pd.DataFrame({"population":population, "area":area}) # 以Series作为新字典的value，得到DataFrame

print(state)
print(state.index)
print(state.columns)

california     cf
Texas          TX
Newyork        ny
florida       frd
ollinois      oll
dtype: object
            population area
Newyork            3.0   ny
Texas              2.0   TX
california         1.0   cf
florida            4.0  frd
ollinois           NaN  oll
Index(['Newyork', 'Texas', 'california', 'florida', 'ollinois'], dtype='object')
Index(['population', 'area'], dtype='object')


### series DataFrame选择/索引

In [13]:
test = pd.Series({"a":1, "b":2, "c":3, "d":4})
print(test)
print(test[0:2]) # 显示索引，左闭右开
print(test["a":"c"]) # 隐式索引, 左右闭合
print(test[["a", "c"]]) #花式索引  记得连个[]


a    1
b    2
c    3
d    4
dtype: int64
a    1
b    2
dtype: int64
a    1
b    2
c    3
dtype: int64
a    1
c    3
dtype: int64


#### Series的字典性质

In [18]:
print(test.keys())
print(test.items())
print(test.values)
print(test.index)


Index(['a', 'b', 'c', 'd'], dtype='object')
<zip object at 0x000002D3663CA2C0>
[1 2 3 4]
Index(['a', 'b', 'c', 'd'], dtype='object')


#### dataframe

##### 字典性质

In [27]:
# print(data["area"])
# print(data)
print(state)
print("state.values\n", state.values) # 返回二维数组
print(state.index)
print(state.ndim)

series = state["area"]
print(series is state["area"]) # 同一空间
print("state[\"area\"]\n", state["area"])
print("series.values\n", series.values)
print("series.index\n", series.index)
print(series.ndim)


            population area
Newyork            3.0   ny
Texas              2.0   TX
california         1.0   cf
florida            4.0  frd
ollinois           NaN  oll
state.values
 [[3.0 'ny']
 [2.0 'TX']
 [1.0 'cf']
 [4.0 'frd']
 [nan 'oll']]
Index(['Newyork', 'Texas', 'california', 'florida', 'ollinois'], dtype='object')
2
True
state["area"]
 Newyork        ny
Texas          TX
california     cf
florida       frd
ollinois      oll
Name: area, dtype: object
series.values
 ['ny' 'TX' 'cf' 'frd' 'oll']
series.index
 Index(['Newyork', 'Texas', 'california', 'florida', 'ollinois'], dtype='object')
1


##### 二维数组性质

In [33]:
print(state)
print(state.iloc[:3, :2])
print(state.loc[:"Texas", :"florida"]) # 本质时loc可以接收两个列表 
# 两个列表可以是花式（指定的keys），条件判断（条件表达式）  区间（:）
print(state[:"Texas"][:"florida"]) #注意，有无loc时 索引方式不同
print(state.T)


            population area
Newyork            3.0   ny
Texas              2.0   TX
california         1.0   cf
florida            4.0  frd
ollinois           NaN  oll
            population area
Newyork            3.0   ny
Texas              2.0   TX
california         1.0   cf
         population
Newyork         3.0
Texas           2.0
         population area
Newyork         3.0   ny
Texas           2.0   TX
           Newyork Texas california florida ollinois
population     3.0   2.0        1.0     4.0      NaN
area            ny    TX         cf     frd      oll


## 检测与处理重复 缺失值

### 删除缺失值

In [37]:
df = pd.DataFrame([["a", np.nan, 2],
                  [2, 3, 5],
                  [np.nan, 4, 6]])
print(df)
df.dropna()   # 默认axis = 0 how = any,   对象为所有行  有np.nan就删除
print(df)

df[3] = np.nan
print(df)
df = df.dropna(axis = 1, how = "all") # 不会改变原始值，而是返回新值
print(df)

     0    1  2
0    a  NaN  2
1    2  3.0  5
2  NaN  4.0  6
     0    1  2
0    a  NaN  2
1    2  3.0  5
2  NaN  4.0  6
     0    1  2   3
0    a  NaN  2 NaN
1    2  3.0  5 NaN
2  NaN  4.0  6 NaN
     0    1  2
0    a  NaN  2
1    2  3.0  5
2  NaN  4.0  6


### 填充缺失值

In [39]:
data = pd.Series([1, np.nan, None, 2, 3], index = list("abcde"))
print(data)
data.fillna(0)  # 不会改变原有值，只会返回新值
data = data.fillna(method = "ffill") # 用前值填充
print(data)

a    1.0
b    NaN
c    NaN
d    2.0
e    3.0
dtype: float64
a    1.0
b    1.0
c    1.0
d    2.0
e    3.0
dtype: float64


### pd.merge() 合并函数

In [10]:
# print(state[])
print(state["Newyork"])
# print(state[

KeyError: 'Newyork'