In [None]:
# 导包
import pandas as pd
import chardet
import numpy as np
import os
import sys

# Make matplotlib render Chinese text instead of empty boxes.
import matplotlib as mpl
# Families are tried in order and the first installed one is used. The original
# entry stays first; the others are common CJK fonts on other platforms
# (assumed names -- adjust to what is installed locally).
mpl.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',
    'SimHei',
    'Microsoft YaHei',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
]
# Draw negative signs with the ASCII hyphen; CJK fonts often lack the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

# 1.Pandas初体验

### 1.1 折线图绘制

In [None]:
# Sniff the text encoding from the raw bytes, then parse the CSV with it.
with open('data/globalGDPData.csv', 'rb') as f:
    byteData = f.read()
    data = chardet.detect(byteData)
    encoding = data.get('encoding')
    print(encoding)
df = pd.read_csv('data/globalGDPData.csv', encoding=encoding)

df

In [None]:
# Keep only the rows whose country is China
is_china = df['country'] == '中国'
china_df = df[is_china]
china_df

In [None]:
# Index China's rows by year.
# set_index() returns a new frame; assigning the result (instead of inplace=True)
# leaves the filtered slice untouched and keeps this cell re-runnable -- the
# in-place form raises KeyError on a second run because 'year' is no longer a column.
# Same construction as the usa_df / ru_df cell.
china_df = df[df.country == '中国'].set_index('year')
china_df

In [None]:
# Draw China's GDP as a line chart (left disabled by the author)
# china_df.GDP.plot();

In [None]:
# Year-indexed frames for the USA and Russia, used for the three-country chart
is_usa = df['country'] == '美国'
is_russia = df['country'] == '俄罗斯'
usa_df = df[is_usa].set_index('year')
ru_df = df[is_russia].set_index('year')
# Under IPython's default settings only the last bare expression of a cell is rendered.
usa_df
ru_df

In [None]:
# Draw the GDP series of all three countries (the calls share the current axes)
china_df['GDP'].plot()
usa_df['GDP'].plot()
ru_df['GDP'].plot()

### 1.2绘制三国GDP折线图, 加入图例 -> 拼音

In [None]:
# Rename each frame's GDP column to its country name so the legend in the next
# cell shows the country instead of "GDP".
# rename() returns a new frame; reassigning avoids mutating a frame derived from
# a boolean filter in place, which can trigger SettingWithCopyWarning.
china_df = china_df.rename(columns={"GDP": '中国'})
usa_df = usa_df.rename(columns={"GDP": '美国'})
ru_df = ru_df.rename(columns={"GDP": '俄罗斯'})
china_df

In [None]:
# Plot each country's column with a legend entry (the label is the column name)
china_df['中国'].plot(legend=True)
usa_df['美国'].plot(legend=True)
ru_df['俄罗斯'].plot(legend=True)

# 2.Series

### 2.1 创建series对象

In [None]:
# Series with a custom label index
s1 = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
# Series built from a tuple (gets the default 0..n-1 index)
s2 = pd.Series((1, 2, 3, 4, 5))
# Series built from a dict: keys become the index, values the data
s3 = pd.Series({0: 1, 1: 2, 2: 3, 3: 4, 4: 5})
# Under IPython's default settings only the last bare expression of a cell is rendered.
s1
s2
s3
# Series built from a NumPy array
s4 = pd.Series(np.arange(5))
s4

### 2.2 Series对象操作

In [None]:
# Values 0..5 labelled 'A'..'F'
s5 = pd.Series(list(range(6)), index=list("ABCDEF"))
s5
# The index (labels) of the Series
s5.index
# The values of the Series
s5.values
# Look up a single value by its label
print(s5['D'])

# 3.DataFrame

### 3.1创建DataFrame对象

In [None]:
# Build a DataFrame from a dict of lists: each key becomes a column
data = dict(
    name=["hillary", "trump", "clinton", "lincoln"],
    gender=["woman", "man", "man", "man"],
    age=[18, 20, 30, 50],
)
df1 = pd.DataFrame(data)
df1


In [None]:
# Build a DataFrame from a list of row tuples; column names are supplied separately
person_columns = ["name", "gender", "age"]
data1 = [
    ("hillary", "woman", 18),
    ("trump", "man", 20),
    ("clinton", "man", 30),
    ("lincoln", "man", 50),
]
df2 = pd.DataFrame(data1, columns=person_columns)
df2

In [None]:
# Build a DataFrame from a 3x4 NumPy array holding 0..11
arr1 = np.arange(12).reshape((3, 4))
df3 = pd.DataFrame(arr1, columns=list("abcd"))
df3

### 3.2DataFrame案例

In [None]:
# Scores for 10 students x 5 subjects, integers in [40, 100]
# (the upper bound of randint is exclusive, hence 101).
# A locally seeded generator makes the table identical on every run without
# touching NumPy's global random state.
SCORE_SEED = 42
score_df = pd.DataFrame(np.random.RandomState(SCORE_SEED).randint(40, 101, (10, 5)))
score_df

In [None]:
# Subject names for the columns and one label per student row
column_names = ["语文", "数学", "英语", "历史", "地理"]
index_names = [f"同学{n}" for n in range(len(score_df))]

# Two ways to get the row count
len(score_df)
score_df.shape[0]

# Alternative to rename(): assign the labels directly
# score_df.columns = column_names
# score_df.index = index_names

score_df

In [None]:
# rename() with explicit old -> new mappings, e.g.:
# score_df.rename(index={0: "同学0000",1:"同学1111"},
#                 columns={0:"Java", 3:"Python"},
#                 inplace=True)

# Map every positional row label to a student label and every positional
# column label to its subject name, then apply both in place.
row_labels = {pos: "同学" + str(pos) for pos in range(score_df.shape[0])}
col_labels = {pos: column_names[pos] for pos in range(score_df.shape[1])}
score_df.rename(index=row_labels, columns=col_labels, inplace=True)

score_df

### 3.3DataFrame基本属性

In [None]:
# shape: the dimensions as (rows, columns)
score_df.shape
# index: the row labels
score_df.index
# columns: the column labels
score_df.columns
# values: the underlying data (the attribute is `values`, not `data`)
score_df.values
# T: the frame with rows and columns transposed
# Under IPython's default settings only this last expression is rendered.
score_df.T

### 3.4DataFrame基本函数

In [None]:
# First n rows
score_df.head(n=3)
# Last n rows
score_df.tail(n=3)
# Detailed summary of the frame (disabled)
# score_df.info()
# Descriptive statistics per column
score_df.describe()
# Reset the index; drop=False (the default) keeps the old index as a column
score_df.reset_index(drop=False)
# Use one column as the index
score_df.set_index(keys="语文")
# Use two columns as a combined (multi-level) index
score_df.set_index(keys=["语文", "数学"])

### 3.5Pandas数据类型

In [None]:
# Missing values: one NaN in each column
data2 = dict(
    name=[np.nan, "trump", "clinton", "lincoln"],
    gender=["woman", np.nan, "man", "man"],
    age=[18, 20, np.nan, 50],
)
df4 = pd.DataFrame(data2)
df4


In [None]:
# datetime dtype: date strings are parsed because the dtype is requested explicitly
date_strings = ['2025-03-29', '2025-03-30', '2025-03-31']
df5 = pd.DataFrame(date_strings, dtype='datetime64[ns]')
df5
df5.dtypes

In [None]:
# timedelta: subtracting two timestamps gives a duration
start_date, end_date = pd.to_datetime('1999-03-16'), pd.to_datetime('2025-11-24')
elapsed = end_date - start_date
elapsed
type(elapsed)

In [None]:
# category dtype: the same labels stored with the default dtype and as a categorical
gender_labels = ["男", "女", "保密"]
s6 = pd.Series(gender_labels)
s7 = pd.Series(gender_labels, dtype='category')
# For each Series print the data, the Python type and the dtype, one per line
print(s6, type(s6), s6.dtypes, sep='\n')
print('\n')
print(s7, type(s7), s7.dtypes, sep='\n')

### 3.6 Pandas索引操作

In [None]:
# Load the daily stock data; note this rebinds `df`, which held the GDP table above
df = pd.read_csv(filepath_or_buffer='./data/stock_day.csv')
df

In [None]:
# Remove the moving-average columns that are not needed below.
# drop() returns a new frame, so the result must be assigned back or `df` stays unchanged.
# `columns=` already names what to drop, so no `axis` argument is needed
# (for reference: axis=0 means rows and axis=1 means columns).
# errors="ignore" keeps the cell re-runnable once the columns are gone.
df = df.drop(
    columns=['ma5', 'ma10', 'ma20', 'v_ma5', 'v_ma10', 'v_ma20'],
    errors="ignore",
)
df
# df.info()
# df.describe()

In [None]:
# Plain [] indexing goes column first, then row label
# type(df)
# type(df['open'])
open_prices = df['open']
open_prices
open_prices['2018-02-26']

In [None]:
# loc selects by label (index label)
# iloc selects by integer position (index position)
df

# A single cell, by label and then by position.
# The author presents the two as equivalent; that relies on the row and column
# order of the CSV, which is not visible here -- TODO confirm.
print(df.loc['2018-02-27', 'high'])
print(df.iloc[0, 1])

# A block of rows for 'open' and 'high': by label slice, by position slice,
# and by explicit position lists (also presented as equivalent by the author).
by_label = df.loc['2018-02-27':'2018-02-14', ['open', 'high']]
print(by_label)
by_position_slice = df.iloc[0:5, 0:2]
print(by_position_slice)
by_position_list = df.iloc[[0, 1, 2, 3, 4], [0, 1]]
print(by_position_list)