# Pandas 菜鸟教程

### 创建 DataFrame

In [4]:
import pandas as pd

# Column-oriented input: every dict key becomes a column label and the list
# under it supplies that column's values, one entry per row.
data = {
    'Name': ['Google', 'Runoob', 'Taobao'],
    'Age': [25, 30, 35],
}
df = pd.DataFrame(data=data)

# Print the plain-text rendering of the frame (row labels default to 0..2).
print(df)

     Name  Age
0  Google   25
1  Runoob   30
2  Taobao   35


### 创建 Series

In [2]:
import pandas as pd

# A Series named 'A' holding 1, 2, 3, 4. No index is given, so pandas
# labels the entries with the default 0, 1, 2, 3.
series = pd.Series(data=[1, 2, 3, 4], name='A')
print(series)

# The same values again, this time with explicit labels 1..4 supplied
# through the index argument.
custom_index = list(range(1, 5))
series_with_index = pd.Series(
    data=[1, 2, 3, 4],
    index=custom_index,
    name='A',
)
print(series_with_index)

0    1
1    2
2    3
3    4
Name: A, dtype: int64
1    1
2    2
3    3
4    4
Name: A, dtype: int64


### Series 转换为 DataFrame

In [3]:
# Promote the Series built in the previous cell to a one-column DataFrame;
# the Series name ('A') becomes the column label.
trans = series.to_frame()
print(trans)

   A
0  1
1  2
2  3
3  4


### 使用列表创建 DataFrame
嵌套列表本身不带行索引（index）和列名（columns），创建 DataFrame 时可以通过 `columns` 参数指定列名（也可以用 `index` 参数指定行索引）。

In [17]:
import pandas as pd

# Row-oriented input: each inner list is one row, so the column labels
# have to be supplied separately.
data = [
    ['Google', 10],
    ['Runoob', 12],
    ['Wiki', 13],
]

df = pd.DataFrame(data=data, columns=['Site', 'Age'])

# Cast 'Age' to float with astype; 'Site' keeps the dtype pandas inferred.
df['Age'] = df['Age'].astype(float)

print(df)
# Show the Python-level type of the first value in each column.
print(type(df.at[0, 'Site']))
print(type(df.at[0, 'Age']))

     Site   Age
0  Google  10.0
1  Runoob  12.0
2    Wiki  13.0
<class 'str'>
<class 'numpy.float64'>


In [20]:
import pandas as pd

data = {
    'Site': ['Google', 'Runoob', 'Wiki'],
    'Age': [10, 12, 13],
}

# No columns= argument is passed: the dict keys ('Site', 'Age') already
# serve as the column labels.
df = pd.DataFrame(data=data)

print(df)
# Show the Python-level type of the first value in each column; 'Age' is
# not cast here, so it keeps its integer type.
print(type(df.at[0, 'Site']))
print(type(df.at[0, 'Age']))

     Site  Age
0  Google   10
1  Runoob   12
2    Wiki   13
<class 'str'>
<class 'numpy.int64'>


In [16]:
import pandas as pd

data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Load the dict into a DataFrame: one column per key.
df = pd.DataFrame(data=data)

print(df)
# Passing a list of labels to loc returns those rows as a DataFrame;
# here that is the rows labelled 0 and 1, with every column kept.
print(df.loc[[0, 1], :])

   calories  duration
0       420        50
1       380        40
2       390        45
   calories  duration
0       420        50
1       380        40


In [22]:
import pandas as pd

# Build a small four-row example frame with one numeric and two text columns.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}
df = pd.DataFrame(data)

# Show the first two rows.
print(df.head(2))

# Summarise the frame's structure. info() writes its report to stdout itself
# and returns None, so it is called directly: wrapping it in print() would
# add a stray "None" line after the report.
df.info()

# Descriptive statistics; only the numeric 'Age' column is summarised.
print(df.describe())

# Sort by age, oldest first. The result is a new frame; df keeps its order.
df_sorted = df.sort_values(by='Age', ascending=False)
print(df_sorted)

# Select a subset of columns.
print(df[['Name', 'Age']])

# Select rows by position: an iloc slice excludes its stop, so 1:3 yields
# the second and third rows.
print(df.iloc[1:3])

# Select rows by label: a loc slice includes both endpoints, so 1:2 yields
# the same two rows here.
print(df.loc[1:2])

# Grouped statistic: mean age per city.
print(df.groupby('City')['Age'].mean())

# Replace missing ages with 30. This sample has no missing values, so the
# column is unchanged; the line only demonstrates the call.
df['Age'] = df['Age'].fillna(30)

# Export to CSV without writing the row index.
df.to_csv('output.csv', index=False)

    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
<class 'pandas.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Name    4 non-null      str  
 1   Age     4 non-null      int64
 2   City    4 non-null      str  
dtypes: int64(1), str(2)
memory usage: 228.0 bytes
None
             Age
count   4.000000
mean   32.500000
std     6.454972
min    25.000000
25%    28.750000
50%    32.500000
75%    36.250000
max    40.000000
      Name  Age         City
3    David   40      Houston
2  Charlie   35      Chicago
1      Bob   30  Los Angeles
0    Alice   25     New York
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40
      Name  Age         City
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
      Name  Age         City
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
City
Chicago        35.0
Houston        40.0
Lo