# 数据质量分析
* 命令行运行时，切换工作目录

```
import os
import sys  # needed for sys.path.append below; the original snippet never imported it

# When run from the command line: switch the working directory to ./code and
# add that directory to the module search path.
os.chdir('./code')
sys.path.append(os.getcwd())

```

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import thinkstats2
import math
from scipy.stats import entropy
from scipy.stats import ks_2samp


def conditional_entropy(x, cond, flag='flag'):
    """
    Entropy of column `flag` conditioned on the values of column `cond`.

    :param x: pandas.DataFrame containing both the `cond` and `flag` columns
    :param cond: name of the conditioning column; rows where it is null
                 are ignored
    :param flag: name of the column whose label distribution is measured
    :return: tuple ``(weighted_entropy, per_value)``. ``per_value`` maps each
             distinct non-null value of `cond` to a tuple
             ``(entropy of flag within those rows, number of those rows)``;
             ``weighted_entropy`` is the sum of those entropies, each weighted
             by its row count over the number of non-null `cond` rows.
    """
    observed = x[cond].dropna()
    total_rows = observed.shape[0]

    per_value = {}
    weighted_sum = 0
    for value in observed.unique():
        subset = x[x[cond] == value]
        subset_rows = subset.shape[0]
        # Share of each flag label inside this subset; subtracting 0. forces
        # float division. groupby/count leave out rows whose flag is null.
        shares = [
            (label_count - 0.) / subset_rows
            for label_count in subset.groupby(flag)[flag].count()
        ]
        subset_entropy = entropy(shares)
        per_value[value] = subset_entropy, subset_rows
        weighted_sum += (subset_rows - 0.) / total_rows * subset_entropy

    return weighted_sum, per_value

In [None]:
# Load the three CSV files: A_train, B_train and B_test (in that order).
data_at, data_bt, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/A_train.csv',
        '../data/B_train.csv',
        '../data/B_test.csv',
    )
)
target_columns = list()  # target dimensions (left empty here)

* 维度情况
    * 3组数据维度一样，维度之间可比较？

In [8]:
# Column count of B_train, then the number of columns B_train shares with
# A_train and with B_test. print(...) with a single argument is valid on both
# Python 2 and Python 3, unlike the bare print statement.
print(data_bt.columns.shape)
print(data_bt.columns.intersection(data_at.columns).shape)
print(data_bt.columns.intersection(data_test.columns).shape)

(491L,)
(491L,)
(490L,)


* 缺失值分析
    * https://blog.csdn.net/weixin_40159138/article/details/89421014
    * https://www.jianshu.com/p/9c867fb9cf17
    * https://scikit-learn.org/stable/modules/impute.html#impute
    * 从下图可以看到缺失值非常接近
    * B_test和A_train缺失值情况几乎一致
    * B_train的缺失值情况比A_train严重很多
    * A_train中20%的用户缺失维度在100个以内，40%的缺失维度在450个以上，60%用户的缺失维度在150个以内
    * B_train 和 B_test 38%的用户缺失维度在186左右，60%用户维度缺失在460以上
    * 缺失值在学习过程中，feature_importance 会降低，并不一定会影响学习效果，*可以对比是否填充缺失值对结果的影响*

In [None]:
# Number of missing values per column, for A_train, B_train and B_test.
fea_null = np.sum(data_at.isnull(), axis=0)
feb_null = np.sum(data_bt.isnull(), axis=0)
fet_null = np.sum(data_test.isnull(), axis=0)

# Per-column missing counts in file column order, one subplot position per data set.
plt.subplot(311).plot(fea_null.values)
plt.subplot(312).plot(feb_null.values)
plt.subplot(313).plot(fet_null.values)

# The same counts sorted ascending; plt.subplot is called again with the same
# three positions before plotting.
plt.subplot(311).plot(np.sort(fea_null))
plt.subplot(312).plot(np.sort(feb_null))
plt.subplot(313).plot(np.sort(fet_null))
plt.show()

# Missing counts normalised by the number of rows of each data set, sorted
# ascending and overlaid (green: A_train, blue: B_train, red: B_test).
plt.plot(np.sort(fea_null/data_at.shape[0]), color='green')
plt.plot(np.sort(feb_null/data_bt.shape[0]), color='blue')
plt.plot(np.sort(fet_null/data_test.shape[0]),color='red')
plt.show()

# Number of missing values per row, drawn as cumulative normalised histograms
# (red: A_train, blue: B_train, green: B_test).
u_fea_null = np.sum(data_at.isnull(), axis=1)
u_feb_null = np.sum(data_bt.isnull(), axis=1)
u_fet_null = np.sum(data_test.isnull(), axis=1)
u_fea_null.hist(cumulative=True, density=1, bins=100, alpha=.2, color="r")
u_feb_null.hist(cumulative=True, density=1, bins=100, alpha=.2, color="b")
u_fet_null.hist(cumulative=True, density=1, bins=100, alpha=.2, color="g")
plt.show()

# Fraction of rows whose missing-column count is below the given cut-off.
# The whole ratio is wrapped in print(...): on Python 3 the statement form
# `print (a)/b` would print `a` and then try to divide None by `b`.
# Subtracting 0.0 forces float division.
print("用户的维度缺失情况")
print((u_fea_null[u_fea_null<156].count()-0.0)/u_fea_null.count())
print((u_feb_null[u_feb_null<186].count()-0.0)/u_feb_null.count())
print((u_fet_null[u_fet_null<186].count()-0.0)/u_fet_null.count())


* 连续与离散
    * 知识：https://blog.csdn.net/ztf312/article/details/53991329 https://blog.csdn.net/banbuduoyujian/article/details/53957653
    * https://scikit-learn.org/stable/auto_examples/preprocessing/plot_discretization.html https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html
    * 489维数据中，383维的取值在100个以内，推测大部分应该是离散值
    * 取值在100个以上的可能为连续值
    * 取值大于300的维度81个，大于500的69个，这些极可能是连续值
    * 需要对连续值进行离散化：分类

In [None]:
# A column with at most this many distinct values is treated as discrete.
discrete_threshold = 120

# Stack the columns of A_train and B_train (without 'flag' and 'no'), then
# append B_test (without 'no'); columns are sorted by name before stacking.
train_features = [
    frame.sort_index(axis=1).drop(['flag', 'no'], axis=1)
    for frame in (data_at, data_bt)
]
test_features = data_test.sort_index(axis=1).drop(['no'], axis=1)
data_tmp_all = pd.concat([pd.concat(train_features), test_features])

# Number of distinct non-null values per column over the stacked data.
data_all_n_unique = data_tmp_all.nunique()
#plt.hist(data_all_n_unique, cumulative=False, bins=100)
#plt.show()
is_discrete = data_all_n_unique <= discrete_threshold
is_continuous = data_all_n_unique > discrete_threshold
discrete_columns = data_all_n_unique[is_discrete].index
continuous_columns = data_all_n_unique[is_continuous].index

# Missing-value count per column over the stacked data, fewest first.
null_columns_all = data_tmp_all.isnull().sum(axis=0).sort_values(ascending=True)


* 异常值分析
    * https://www.cnblogs.com/tinglele527/p/11955103.html
    * https://scikit-learn.org/stable/modules/outlier_detection.html
    * https://scikit-learn.org/0.20/auto_examples/plot_anomaly_comparison.html
    * https://blog.csdn.net/PbGc396Dwxjb77F2je/article/details/99687952
    * 离散值中取值比例很小的这部分，可能有两种情况：对预测结果有强作用，对预测情况无影响，*可以做对比分析*
    * 离散值中，取值比例很小的部分，如果熵很大，说明本身对结果没有区分度，这部分是异常值的可能性很大
    * 离散值中，条件熵大的维度，区分度小，这部分维度可能需要去除掉
    * 连续值中，box plot可以很方便地观测出异常值
    * 连续值在A_train中异常值偏少，整体少于4%，是否需要处理？连续值在B_train中，整体异常值少于2.5%
    *

In [None]:
# Standard-deviation rule: a value is flagged as abnormal when it exceeds
# mean + 2 * std of its own column (only the upper side is checked).
data_at_continuous = data_at[continuous_columns]
at_upper_limit = data_at_continuous.mean() + 2 * data_at_continuous.std()
data_at_cont_abnormal = data_at_continuous > at_upper_limit
# Share of flagged rows per column, shown as a cumulative histogram.
data_at_cont_ab_proportion = (
    data_at_cont_abnormal.sum(axis=0) / data_at_cont_abnormal.shape[0]
)
plt.hist(data_at_cont_ab_proportion, cumulative=True, bins=100, density=True)
plt.show()

# Same procedure for B_train.
data_bt_continuous = data_bt[continuous_columns]
bt_upper_limit = data_bt_continuous.mean() + 2 * data_bt_continuous.std()
data_bt_cont_abnormal = data_bt_continuous > bt_upper_limit
data_bt_cont_ab_proportion = (
    data_bt_cont_abnormal.sum(axis=0) / data_bt_cont_abnormal.shape[0]
)
plt.hist(data_bt_cont_ab_proportion, cumulative=True, bins=100, density=True)
plt.show()

# Conditional entropy per discrete column, computed on A_train.
entropy_dis = {}
count_threshold = data_at.shape[0] * 0.05
entropy_threshold = 0.5
# Per column: values held by at most 5% of the A_train rows whose entropy is
# at least entropy_threshold, kept separately.
least_count_entropy = {}
for column in discrete_columns:
    column_entropy, per_value = conditional_entropy(data_at, cond=column)
    entropy_dis[column] = column_entropy
    for value, stats in per_value.items():
        value_entropy, value_count = stats
        if value_count <= count_threshold and value_entropy >= entropy_threshold:
            least_count_entropy.setdefault(column, {})[value] = stats

# Cumulative histogram of the per-column conditional entropies.
entropy_dis = pd.DataFrame.from_dict(entropy_dis, orient='index')
plt.hist(np.sort(entropy_dis, axis=0), bins=100, cumulative=True)
plt.show()

* 同分布检验
    * https://blog.csdn.net/qq_41679006/article/details/80977113
    * https://www.cnblogs.com/arkenstone/p/5496761.html
    * https://blog.csdn.net/t15600624671/article/details/78770239
    * B_test 和 B_train只有2维数据的分布差异较大，显著性α=0.05
    * B_train 和 A_train的数据差异较大：有199维数据的分布相差大，所以考虑剔除掉199维数据
    * 缺失值少，且同分布的维度 极有可能是最重要的维度，可以尝试只取这部分数据进行分析， *可以做对比分析*

In [None]:
# Two-sample Kolmogorov-Smirnov test per column. A column goes into the
# "diff" dict when p <= ks_alpha and into the "same" dict otherwise; each
# entry stores the (statistic, p-value) pair.
ks_alpha = 0.05
dis_b_diff = {}   # B_train vs B_test, distributions differ
dis_b_same = {}   # B_train vs B_test, no significant difference
dis_ab_diff = {}  # B_train vs A_train, distributions differ
dis_ab_same = {}  # B_train vs A_train, no significant difference
for column in data_bt.columns.drop(['no','flag']):
    # print(...) with one argument works on both Python 2 and Python 3.
    print(column)
    # Nulls are dropped before testing; the B_train sample is reused for both tests.
    bt_values = data_bt[column].dropna()
    d, p = ks_2samp(bt_values, data_test[column].dropna())
    if p <= ks_alpha:
        dis_b_diff[column] = (d, p)
    else:
        dis_b_same[column] = (d, p)
    d, p = ks_2samp(bt_values, data_at[column].dropna())
    if p <= ks_alpha:
        dis_ab_diff[column] = (d, p)
    else:
        dis_ab_same[column] = (d, p)
dis_b_diff = pd.DataFrame.from_dict(dis_b_diff, orient='index')
dis_b_same = pd.DataFrame.from_dict(dis_b_same, orient='index')
dis_ab_diff = pd.DataFrame.from_dict(dis_ab_diff, orient='index')
dis_ab_same = pd.DataFrame.from_dict(dis_ab_same, orient='index')
# Columns with few missing values AND the same distribution in A and B:
# the null_num columns with the fewest nulls in B_train, intersected with the
# A/B "same" set, intersected with the null_num fewest-null columns of A_train.
null_num = 300
same_features = feb_null.sort_values(ascending=False).tail(null_num).index.intersection(dis_ab_same.index)
same_features = same_features.intersection(fea_null.sort_values(ascending=False).tail(null_num).index)

* 数据规范化
    * https://blog.csdn.net/weixin_38706928/article/details/80329563
    * https://scikit-learn.org/stable/modules/preprocessing.html

* 经过以上分析，可以分步验证数据的处理情况
    * 排除null值多的维度
    * 取A B同分布维度，B_test + B_train = B
    * 剔除线性相关性强的维度
    * 数据离散化：sklearn
    * 排除条件熵大的维度
    * 填充null值：固定填充，根据分布填充
    * 数据规范化


