# 数据预处理

### 数据无量纲化

线性无量纲化包括中心化(zero-centered)和缩放处理(scale)

##### 数据归一化(Normalization)(Min-Max-Scaling)

归一化后的数据服从正态分布。公式：
$$x^{*} = \frac{x-min(x)}{max(x)-min(x
)}$$

In [1]:
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1,18]]

In [2]:
import pandas as pd
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [3]:
# 实现归一化
scaler = MinMaxScaler()
scaler = scaler.fit(data)
result = scaler.transform(data)
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [4]:
result_ = scaler.fit_transform(data)
scaler.inverse_transform(result) # 归一化的结果逆转

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [5]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1,18]]
scaler = MinMaxScaler(feature_range=(5, 10))
result = scaler.fit_transform(data)
result
 
# 当x中的特征数量非常多的时候，fit会报错
# 此时使用partial_fit作为训练接口

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

In [6]:
# 使用numpy实现归一化
import numpy as np
X = np.array([[-1, 2], [-0.5, 6], [0, 10], [1,18]])
X_nor = (X - X.min(axis = 0))/(X.max(axis = 0) - X.min(axis = 0))
X_nor

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [7]:
# 逆转归一化
X_return = X_nor * (X.max(axis = 0) - X.min(axis = 0)) + X.min(axis = 0)
X_return

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

##### 数据标准化(Standradization)(Z-score-normalization)

$$x^{*}=\frac{x-\mu}{\sigma}$$

In [8]:
from sklearn.preprocessing import StandardScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1,18]]

In [9]:
scaler = StandardScaler()
scaler.fit(data)

In [10]:
scaler.mean_

array([-0.125,  9.   ])

In [11]:
scaler.var_

array([ 0.546875, 35.      ])

In [12]:
x_std = scaler.transform(data)
x_std

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [13]:
x_std.mean()

0.0

In [14]:
x_std.std()

1.0

In [15]:
scaler.fit_transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [16]:
scaler.inverse_transform(x_std)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

##### 如何选择归一化/标准化

大多数选择标准化  
归一化对异常值非常敏感  
先选择标准化，效果不好再归一化

### 处理缺失值

In [17]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes

# 1. 加载 diabetes 数据集
diabetes = load_diabetes(as_frame=True) # 使用 as_frame=True 直接加载为 Pandas DataFrame

# diabetes.data 包含特征
# diabetes.target 包含目标变量

df = diabetes.frame # 获取包含特征和目标变量的DataFrame

print("原始数据集的前5行：")
print(df.head())
print("\n原始数据集的信息：")
df.info()

# 2. 随机引入缺失值
# 我们选择其中的几个列，并在其中随机插入NaN值
# 这里以 'bmi' 和 'bp' 列为例，随机删除10%的数据
np.random.seed(42) # 设置随机种子以保证结果可复现

# 获取需要添加缺失值的列
cols_to_corrupt = ['bmi', 'bp', 's1']

for col in cols_to_corrupt:
    # 计算需要删除的行数 (例如，10% 的数据)
    num_missing = int(len(df) * 0.4)
    
    # 随机选择索引
    missing_indices = np.random.choice(df.index, num_missing, replace=False)
    
    # 将选定索引处的数据设置为NaN
    df.loc[missing_indices, col] = np.nan

print("\n----------------------------------------------------------")
print("引入缺失值后的数据集信息：")
df.info()

print("\n引入缺失值后的数据集缺失值统计：")
print(df.isnull().sum())

print("\n引入缺失值后的数据集（部分）：")
# 打印几行包含NaN的数据，方便观察
print(df[df.isnull().any(axis=1)].head())

# 现在，'df' 就是一个包含缺失值的数据集，您可以开始学习和实践缺失值处理技术了！
# 例如，可以使用 df.dropna() 删除缺失值行，或者使用 df.fillna() 进行填充。


原始数据集的前5行：
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  

原始数据集的信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-n

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     266 non-null    float64
 3   bp      266 non-null    float64
 4   s1      266 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [19]:
from sklearn.impute import SimpleImputer

In [20]:
bmi = df.loc[:, 'bmi'].values.reshape(-1, 1)

In [21]:
bmi.shape

(442, 1)

In [22]:
imp_mean = SimpleImputer() #默认
imp_median = SimpleImputer(strategy="median")
imp_0 = SimpleImputer(strategy='constant', fill_value=0)

In [23]:
imp_mean = imp_mean.fit_transform(bmi)
imp_median = imp_median.fit_transform(bmi)
imp_0 = imp_0.fit_transform(bmi)

In [24]:
imp_mean[:20]

array([[ 0.00282984],
       [-0.05147406],
       [ 0.04445121],
       [ 0.00282984],
       [-0.03638469],
       [ 0.00282984],
       [-0.04716281],
       [ 0.00282984],
       [ 0.06169621],
       [ 0.00282984],
       [-0.08380842],
       [ 0.00282984],
       [-0.02884001],
       [-0.00189471],
       [-0.02560657],
       [ 0.00282984],
       [ 0.00282984],
       [ 0.00282984],
       [ 0.00282984],
       [ 0.00282984]])

In [25]:
imp_median[:20]

array([[-0.00458924],
       [-0.05147406],
       [ 0.04445121],
       [-0.00458924],
       [-0.03638469],
       [-0.00458924],
       [-0.04716281],
       [-0.00458924],
       [ 0.06169621],
       [-0.00458924],
       [-0.08380842],
       [-0.00458924],
       [-0.02884001],
       [-0.00189471],
       [-0.02560657],
       [-0.00458924],
       [-0.00458924],
       [-0.00458924],
       [-0.00458924],
       [-0.00458924]])

In [26]:
imp_0[:20]

array([[ 0.        ],
       [-0.05147406],
       [ 0.04445121],
       [ 0.        ],
       [-0.03638469],
       [ 0.        ],
       [-0.04716281],
       [ 0.        ],
       [ 0.06169621],
       [ 0.        ],
       [-0.08380842],
       [ 0.        ],
       [-0.02884001],
       [-0.00189471],
       [-0.02560657],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ]])

In [27]:
df.loc[:, 'bmi'] = imp_mean

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      266 non-null    float64
 4   s1      266 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [29]:
s1 = df.loc[:, 's1'].values.reshape(-1, 1)
# 众数可以填字符型
imp_mode = SimpleImputer(strategy='most_frequent')
df.loc[:, 's1'] = imp_mode.fit_transform(s1)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      266 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [31]:
data = diabetes.frame

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      266 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [33]:
# 可以用fillna命令
# dropna(axis = 0, inplace = True) 删除有缺失值的行并直接作用于原数据集

### 处理分类型变量

将文字型数据转换为数值型

In [34]:
from sklearn.preprocessing import LabelEncoder
import seaborn as sns # 用于加载数据集

# 1. 加载泰坦尼克号数据集
data = sns.load_dataset('titanic')

print("数据集的前5行：")
print(titanic_df.head())

print("\n数据集的信息（包括数据类型）：")
titanic_df.info()

print("\n数据集中分类变量的统计：")
# 找出对象类型（通常是分类变量）或具有有限唯一值的变量
categorical_cols = [col for col in titanic_df.columns if titanic_df[col].dtype == 'object' or titanic_df[col].nunique() < 10] # 假设唯一值少于10的数值列也可能是分类的
print(categorical_cols)

# 打印一些分类变量的唯一值及其计数
for col in categorical_cols:
    if col in titanic_df.columns: # 确保列存在
        print(f"\n列 '{col}' 的唯一值及其计数：")
        print(titanic_df[col].value_counts())
        if titanic_df[col].isnull().any():
            print(f"列 '{col}' 的缺失值数量：{titanic_df[col].isnull().sum()}")

# 移除一些不常用的列（如PassengerId, Name, Ticket, Cabin，它们通常需要更复杂的特征工程）
# 为了简化分类变量处理，我们关注更直接的分类特征
# titanic_df = titanic_df.drop(['passengerid', 'name', 'ticket', 'cabin'], axis=1) # 如果你导入的是完整的原始数据集



数据集的前5行：


NameError: name 'titanic_df' is not defined

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [None]:
y = data.iloc[:, -2] # 允许一维

In [None]:
le = LabelEncoder()
le = le.fit(y)
le.classes_

array(['no', 'yes'], dtype=object)

In [None]:
label = le.transform(y)

In [None]:
label

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [None]:
data.iloc[:, -2] = label

In [None]:
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,0,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,1,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,1,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,1,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,0,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,1,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,0,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,1,True


In [None]:
from sklearn.preprocessing import OrdinalEncoder # 专门给特征编码，不允许一维
data_ = data.copy()
data_

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,0,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,1,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,1,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,1,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,0,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,1,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,0,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,1,True


In [None]:
OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

[array([1, 2, 3], dtype=int64),
 array(['female', 'male'], dtype=object),
 array([ 0.42,  0.67,  0.75,  0.83,  0.92,  1.  ,  2.  ,  3.  ,  4.  ,
         5.  ,  6.  ,  7.  ,  8.  ,  9.  , 10.  , 11.  , 12.  , 13.  ,
        14.  , 14.5 , 15.  , 16.  , 17.  , 18.  , 19.  , 20.  , 20.5 ,
        21.  , 22.  , 23.  , 23.5 , 24.  , 24.5 , 25.  , 26.  , 27.  ,
        28.  , 28.5 , 29.  , 30.  , 30.5 , 31.  , 32.  , 32.5 , 33.  ,
        34.  , 34.5 , 35.  , 36.  , 36.5 , 37.  , 38.  , 39.  , 40.  ,
        40.5 , 41.  , 42.  , 43.  , 44.  , 45.  , 45.5 , 46.  , 47.  ,
        48.  , 49.  , 50.  , 51.  , 52.  , 53.  , 54.  , 55.  , 55.5 ,
        56.  , 57.  , 58.  , 59.  , 60.  , 61.  , 62.  , 63.  , 64.  ,
        65.  , 66.  , 70.  , 70.5 , 71.  , 74.  , 80.  ,   nan]),
 array([0, 1, 2, 3, 4, 5, 8], dtype=int64),
 array([0, 1, 2, 3, 4, 5, 6], dtype=int64),
 array([  0.    ,   4.0125,   5.    ,   6.2375,   6.4375,   6.45  ,
          6.4958,   6.75  ,   6.8583,   6.95  ,   6.975 ,   7.045

In [None]:
data_.iloc[:, [2,7]] = OrdinalEncoder().fit_transform(data_.iloc[:, [2, 7]])


In [None]:
data_

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1.0,22.0,1,0,7.2500,2.0,Third,man,True,,Southampton,0,False
1,1,1,0.0,38.0,1,0,71.2833,0.0,First,woman,False,C,Cherbourg,1,False
2,1,3,0.0,26.0,0,0,7.9250,2.0,Third,woman,False,,Southampton,1,True
3,1,1,0.0,35.0,1,0,53.1000,2.0,First,woman,False,C,Southampton,1,False
4,0,3,1.0,35.0,0,0,8.0500,2.0,Third,man,True,,Southampton,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1.0,27.0,0,0,13.0000,2.0,Second,man,True,,Southampton,0,True
887,1,1,0.0,19.0,0,0,30.0000,2.0,First,woman,False,B,Southampton,1,True
888,0,3,0.0,,1,2,23.4500,2.0,Third,woman,False,,Southampton,0,False
889,1,1,1.0,26.0,0,0,30.0000,0.0,First,man,True,C,Cherbourg,1,True


In [None]:
# 独热编码
'''
三种分类数据：
1.舱门：S C Q
相互独立，彼此之间完全没有联系，仅仅表示S!=C!=Q
名义变量
2.学历：小学 初中 高中
有大小关系，并不完全独立，但是不可计算
有序变量
3.体重：45 55 65
有联系，可以计算，分类间可以通过数学计算互相转换
有距变量
'''

'''
OrdinalEncoder可以处理有序变量，但对于名义变量，我们要使用独热编码
相当于生成了一个单位矩阵，说明这个特征的几个标签仅仅是不能同时存在而已
'''

'\n三种分类数据：\n'

In [None]:
# 性别和舱门都是独热编码
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:, [2, 7]]

In [None]:
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result

array([[0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0.]])

In [None]:
# 还原
pd.DataFrame(enc.inverse_transform(result))

Unnamed: 0,0,1
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [None]:
enc.get_feature_names_out()


array(['sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S',
       'embarked_nan'], dtype=object)

In [None]:
result


array([[0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0.]])

In [None]:
result.shape

(891, 6)

In [None]:
newdata = pd.concat([data, pd.DataFrame(result)], axis = 1)

In [None]:
newdata

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,0,1,2,3,4,5
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,0,False,0.0,1.0,0.0,0.0,1.0,0.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,1,False,1.0,0.0,1.0,0.0,0.0,0.0
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,1,True,1.0,0.0,0.0,0.0,1.0,0.0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,1,False,1.0,0.0,0.0,0.0,1.0,0.0
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,0,True,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,0,True,0.0,1.0,0.0,0.0,1.0,0.0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,1,True,1.0,0.0,0.0,0.0,1.0,0.0
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,0,False,1.0,0.0,0.0,0.0,1.0,0.0
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,1,True,0.0,1.0,1.0,0.0,0.0,0.0


In [None]:
newdata.drop(["sex", "embarked"], axis = 1, inplace = True)

In [None]:
newdata

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alive,alone,0,1,2,3,4,5
0,0,3,22.0,1,0,7.2500,Third,man,True,,Southampton,0,False,0.0,1.0,0.0,0.0,1.0,0.0
1,1,1,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,1,False,1.0,0.0,1.0,0.0,0.0,0.0
2,1,3,26.0,0,0,7.9250,Third,woman,False,,Southampton,1,True,1.0,0.0,0.0,0.0,1.0,0.0
3,1,1,35.0,1,0,53.1000,First,woman,False,C,Southampton,1,False,1.0,0.0,0.0,0.0,1.0,0.0
4,0,3,35.0,0,0,8.0500,Third,man,True,,Southampton,0,True,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,Second,man,True,,Southampton,0,True,0.0,1.0,0.0,0.0,1.0,0.0
887,1,1,19.0,0,0,30.0000,First,woman,False,B,Southampton,1,True,1.0,0.0,0.0,0.0,1.0,0.0
888,0,3,,1,2,23.4500,Third,woman,False,,Southampton,0,False,1.0,0.0,0.0,0.0,1.0,0.0
889,1,1,26.0,0,0,30.0000,First,man,True,C,Cherbourg,1,True,0.0,1.0,1.0,0.0,0.0,0.0


In [None]:
newdata.columns =[....]

### 处理连续型变量

#### 二值化

设置一个阈值将数据二值化

In [None]:
data2 = data.copy().dropna()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [44]:
data2

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [52]:
# 对年龄二值化
from sklearn.preprocessing import Binarizer
# 先填补缺失值
age = data2.loc[:, 'age'].values.reshape(-1, 1)
imp_mean = SimpleImputer().fit_transform(age)
data2.loc[:, 'age'] = imp_mean

In [53]:
data2

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [54]:
transformer = Binarizer(threshold=38).fit_transform(X)
transformer

array([[0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],

#### 分箱处理

将连续型变量划分为分类变量的类

In [49]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
'''
参数：
n_bins：分箱的个数，默认5
encode：编码的方式 默认onehot
        onehot：做哑变量，之后返回一个稀疏矩阵，一列是一个特征的一个类别，含有这个类别的样本为1，不含的为0
        ordinal：每个特征的每个箱都被编码为一个整数，返回的一列是一个特征，每个特征下有不同整数编码的箱
strate：定义箱宽，默认quantile
        uniform：等宽分箱，按照样本取值分，最常用
        quantile：等位分箱，按照样本数量
        kmeans：聚类分箱，每个箱中的值到最近的一维k均值聚类的簇心的距离相同
'''

'\n参数：\nn_bins：分箱的个数，默认5\nencode：编码的方式 默认onehot\n        onehot：做哑变量，之后返回一个稀疏矩阵，一列是一个特征的一个类别，含有这个类别的样本为1，不含的为0\n        ordinal：每个特征的每个箱都被编码为一个整数，返回的一列是一个特征，每个特征下有不同整数编码的箱\nstrate：定义箱宽，默认quantile\n        uniform：等宽分箱，按照样本取值分\n        quantile：等位分箱，按照样本数量\n        kmeans：聚类分箱，每个箱中的值到最近的一维k均值聚类的簇心的距离相同\n'

In [60]:
X = data2.iloc[:, 3].values.reshape(-1, 1)

In [61]:
est= KBinsDiscretizer(n_bins = 3, encode='ordinal',strategy='uniform')
est.fit_transform(X)

array([[1.],
       [1.],
       [2.],
       [0.],
       [2.],
       [1.],
       [1.],
       [0.],
       [1.],
       [2.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [2.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [2.],
       [2.],
       [1.],
       [0.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [2.],
       [1.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [2.],
       [0.],
       [0.],
       [0.],

In [None]:
# 查看转换后的分的箱
set(est.fit_transform(X).ravel())

{0.0, 1.0, 2.0}

In [63]:
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')

In [65]:
est.fit_transform(X).toarray()

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0