### 参考：https://blog.csdn.net/yiyele/article/details/80605909

In [1]:
import numpy as np 
import pandas as pd

### 1 从CSV或者xlsx文件导入DataFrame

In [2]:
# Read the train and test splits of the breast-cancer dataset from CSV files.
train_csv = '../Datasets/Breast-Cancer/breast-cancer-train.csv'
test_csv = '../Datasets/Breast-Cancer/breast-cancer-test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
# read_csv returns a pandas DataFrame; print the type to confirm.
print(type(df_train))

<class 'pandas.core.frame.DataFrame'>


In [3]:
# Split the test set on the value of the 'Type' column (0 -> negative,
# 1 -> positive) and keep only the two feature columns.
# A single .loc[rows, columns] call selects both axes at once, instead of
# chaining a second [] lookup onto an intermediate frame.
feature_cols = ['Clump Thickness', 'Cell Size']
df_test_negative = df_test.loc[df_test['Type'] == 0, feature_cols]
df_test_positive = df_test.loc[df_test['Type'] == 1, feature_cols]
# Selecting with a list of columns yields a DataFrame (not a Series).
print(type(df_test_negative))

<class 'pandas.core.frame.DataFrame'>


### 2 用pandas创建数据表

In [4]:
# Build a small demo table by hand. The 'city' strings deliberately carry
# stray whitespace and mixed case, and 'price' has two missing values, so the
# cleaning steps later in the notebook have something to work on.
column_data = {
    "id": [1001, 1002, 1003, 1004, 1005, 1006],
    "date": pd.date_range('20130102', periods=6),
    "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
    "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
    "age": [23, 44, 54, 32, 34, 32],
    "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
}
# Explicit column order for the resulting frame.
column_order = ['id', 'date', 'city', 'category', 'age', 'price']
df = pd.DataFrame(column_data, columns=column_order)

print(df)

     id       date         city category  age   price
0  1001 2013-01-02     Beijing     100-A   23  1200.0
1  1002 2013-01-03           SH    100-B   44     NaN
2  1003 2013-01-04   guangzhou     110-A   54  2133.0
3  1004 2013-01-05     Shenzhen    110-C   32  5433.0
4  1005 2013-01-06     shanghai    210-A   34     NaN
5  1006 2013-01-07     BEIJING     130-F   32  4432.0


### 3 数据表基本信息(元数据)

In [5]:
# DataFrame.info() writes its summary (index range, per-column non-null
# counts and dtypes, memory usage) straight to stdout and returns None.
# Call it directly: wrapping it in print() would emit an extra "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes
None


In [6]:
# Dimensions of the table as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(6, 6)


In [7]:
# Show the column labels first, then the dtype of every column.
column_labels = df.columns
column_dtypes = df.dtypes
for summary in (column_labels, column_dtypes):
    print(summary)

Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object


In [8]:
# dtype of a single column.
id_dtype = df['id'].dtype
print(id_dtype)

# Boolean mask marking the missing entries of one column.
price_missing = df['price'].isnull()
print(price_missing)

# Distinct values of one column, in order of first appearance.
distinct_ages = df['age'].unique()
print(distinct_ages)

int64
0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool
[23 44 54 32 34]


### 4 查看表数据

In [9]:
# Element-wise missing-value mask for the whole table
# (isna is the same method as isnull under its primary name).
missing_mask = df.isna()
print(missing_mask)

      id   date   city  category    age  price
0  False  False  False     False  False  False
1  False  False  False     False  False   True
2  False  False  False     False  False  False
3  False  False  False     False  False  False
4  False  False  False     False  False   True
5  False  False  False     False  False  False


In [10]:
# Preview the first and the last rows of the table.
# NOTE: head() and tail() return 5 rows by default (n=5), not 10; pass n explicitly for more.
df.head() # first 5 rows by default; not rendered because it is not the cell's last expression
df.tail() # last 5 rows by default; this is the value the cell displays

Unnamed: 0,id,date,city,category,age,price
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [11]:
# Underlying data of the table as a 2-D NumPy array (one row per record).
raw_values = df.values
print(raw_values)

[[1001 Timestamp('2013-01-02 00:00:00') 'Beijing ' '100-A' 23 1200.0]
 [1002 Timestamp('2013-01-03 00:00:00') 'SH' '100-B' 44 nan]
 [1003 Timestamp('2013-01-04 00:00:00') ' guangzhou ' '110-A' 54 2133.0]
 [1004 Timestamp('2013-01-05 00:00:00') 'Shenzhen' '110-C' 32 5433.0]
 [1005 Timestamp('2013-01-06 00:00:00') 'shanghai' '210-A' 34 nan]
 [1006 Timestamp('2013-01-07 00:00:00') 'BEIJING ' '130-F' 32 4432.0]]


### 5 数据表清洗

In [12]:
# Fill missing values with 0. fillna returns a new frame, so the original
# `df` is printed first to show that it is left unchanged.
newdf = df.fillna(0)
for frame in (df, newdf):
    print(frame)

     id       date         city category  age   price
0  1001 2013-01-02     Beijing     100-A   23  1200.0
1  1002 2013-01-03           SH    100-B   44     NaN
2  1003 2013-01-04   guangzhou     110-A   54  2133.0
3  1004 2013-01-05     Shenzhen    110-C   32  5433.0
4  1005 2013-01-06     shanghai    210-A   34     NaN
5  1006 2013-01-07     BEIJING     130-F   32  4432.0
     id       date         city category  age   price
0  1001 2013-01-02     Beijing     100-A   23  1200.0
1  1002 2013-01-03           SH    100-B   44     0.0
2  1003 2013-01-04   guangzhou     110-A   54  2133.0
3  1004 2013-01-05     Shenzhen    110-C   32  5433.0
4  1005 2013-01-06     shanghai    210-A   34     0.0
5  1006 2013-01-07     BEIJING     130-F   32  4432.0


In [13]:
# Replace missing prices with the mean of the 'price' column
# (mean() skips missing values by default).
price_mean = df['price'].mean()
df['price'] = df['price'].fillna(price_mean)
print(df)

     id       date         city category  age   price
0  1001 2013-01-02     Beijing     100-A   23  1200.0
1  1002 2013-01-03           SH    100-B   44  3299.5
2  1003 2013-01-04   guangzhou     110-A   54  2133.0
3  1004 2013-01-05     Shenzhen    110-C   32  5433.0
4  1005 2013-01-06     shanghai    210-A   34  3299.5
5  1006 2013-01-07     BEIJING     130-F   32  4432.0


In [14]:
# Remove leading/trailing whitespace from the 'city' strings.
# The vectorised .str accessor is used instead of map(str.strip): it passes
# missing values through unchanged rather than raising TypeError on them,
# and it matches the .str.lower() call used for the case conversion.
df['city'] = df['city'].str.strip()
print(df)

     id       date       city category  age   price
0  1001 2013-01-02    Beijing    100-A   23  1200.0
1  1002 2013-01-03         SH    100-B   44  3299.5
2  1003 2013-01-04  guangzhou    110-A   54  2133.0
3  1004 2013-01-05   Shenzhen    110-C   32  5433.0
4  1005 2013-01-06   shanghai    210-A   34  3299.5
5  1006 2013-01-07    BEIJING    130-F   32  4432.0


In [15]:
# Normalise case: convert every city name to lower case.
city_lower = df['city'].str.lower()
df['city'] = city_lower
print(df)

     id       date       city category  age   price
0  1001 2013-01-02    beijing    100-A   23  1200.0
1  1002 2013-01-03         sh    100-B   44  3299.5
2  1003 2013-01-04  guangzhou    110-A   54  2133.0
3  1004 2013-01-05   shenzhen    110-C   32  5433.0
4  1005 2013-01-06   shanghai    210-A   34  3299.5
5  1006 2013-01-07    beijing    130-F   32  4432.0


In [16]:
# Cast 'price' to integer. astype returns a new Series; the result is only
# displayed here and is not written back to `df`.
price_as_int = df['price'].astype('int')
price_as_int

0    1200
1    3299
2    2133
3    5433
4    3299
5    4432
Name: price, dtype: int64

In [17]:
# Rename a column. rename returns a new frame; the result is only displayed
# here and `df` itself keeps the original column name.
column_renames = {'category': 'category-size'}
df.rename(columns=column_renames)

Unnamed: 0,id,date,city,category-size,age,price
0,1001,2013-01-02,beijing,100-A,23,1200.0
1,1002,2013-01-03,sh,100-B,44,3299.5
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,3299.5
5,1006,2013-01-07,beijing,130-F,32,4432.0


In [18]:
# Drop duplicated city values, keeping the first occurrence of each
# (keep='first' is the default, spelled out here for clarity).
df['city'].drop_duplicates(keep='first')

0      beijing
1           sh
2    guangzhou
3     shenzhen
4     shanghai
Name: city, dtype: object

In [19]:
# Drop duplicated city values, keeping the last occurrence of each
# (i.e. the earlier duplicates are the ones removed).
city_column = df['city']
city_column.drop_duplicates(keep='last')

1           sh
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object

In [20]:
# Value replacement: entries equal to 'sh' become 'shanghai'.
# replace returns a new Series; `df` is not modified.
df['city'].replace(to_replace='sh', value='shanghai')

0      beijing
1     shanghai
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object