In [2]:
import numpy as np
import pandas as pd

In [3]:
# Demo data: 100 random integers per column, x drawn from [0, 4) and y from [4, 7).
x = np.random.randint(low=0, high=4, size=100)
y = np.random.randint(low=4, high=7, size=100)
df = pd.DataFrame(dict(x=x, y=y))

In [1]:
from scipy import stats
import numpy as np
import pandas as pd

# NOTE(review): `plt` is used below but is not imported anywhere in this cell;
# presumably `matplotlib.pyplot` was imported as `plt` in a cell not shown
# here -- confirm, otherwise this cell raises NameError on a fresh kernel.

# Reference curve: CDF of the standard normal distribution on [-5, 5).
x = np.arange(-5, 5, 0.1)
y = stats.norm.cdf(x, 0, 1)
plt.plot(x, y)

# Empirical cumulative distribution of the target data: the 'Day Mins' column
# of churn.txt (described by the original author as the churn dataset from the
# UCI machine learning repository).
churn_raw_data = pd.read_csv('churn.txt')
day_minute = churn_raw_data['Day Mins']
sorted_ = np.sort(day_minute)
yvals = np.arange(len(sorted_))/float(len(sorted_))
plt.plot(sorted_, yvals)

# Q-Q scatter: map each observation's cumulative probability through the
# inverse CDF of the standard normal distribution.
# `yvals` starts at exactly 0 and norm.ppf(0) is -inf, which left the smallest
# observation without a finite x-coordinate.  Mid-point plotting positions
# (i + 0.5) / n stay strictly inside (0, 1), so every quantile is finite.
plot_positions = (np.arange(len(sorted_)) + 0.5) / len(sorted_)
x_label = stats.norm.ppf(plot_positions)
plt.scatter(x_label, sorted_)

# Same comparison drawn by scipy's built-in probability plot.
stats.probplot(day_minute, dist="norm", plot=plt)
plt.show()

In [2]:
import pandas as pd

# Step 1: raw timestamp strings, each carrying its own UTC offset
time_stamps = [
    '2015-03-08 10:30:00.360000+00:00',
    '2017-07-13 15:45:05.755000-07:00',
    '2012-01-20 22:30:00.254000+05:30',
    '2016-12-25 00:30:00.000000+10:00',
]

# Step 2: put the strings into a single-column DataFrame
time_pd = pd.DataFrame(time_stamps, columns=['Times'])

# Step 3: parse every string into a pd.Timestamp
time_pd['stamp'] = [pd.Timestamp(raw) for raw in time_pd['Times'].values]

# Step 4: extract the time-of-day features from each timestamp
time_pd['Hour'] = time_pd['stamp'].map(lambda ts: ts.hour)
time_pd['Minute'] = time_pd['stamp'].map(lambda ts: ts.minute)
time_pd['Second'] = time_pd['stamp'].map(lambda ts: ts.second)
time_pd['MUsecond'] = time_pd['stamp'].map(lambda ts: ts.microsecond)   # microseconds
time_pd['UTC_offset'] = time_pd['stamp'].map(lambda ts: ts.utcoffset())

# Step 5: bucket the hour into named parts of the day with pd.cut
cut_hour = [-1, 5, 11, 16, 21, 23]
cut_labels = ['last night', 'morning', 'afternoon', 'evening', 'Night']
time_pd['Hour_cut'] = pd.cut(x=time_pd['Hour'], bins=cut_hour, labels=cut_labels)
print(time_pd['Hour_cut'].head())

# Step 6: turn the part-of-day labels into integer codes with LabelEncoder
from sklearn.preprocessing import LabelEncoder

La = LabelEncoder()
time_pd['Hour_number'] = La.fit_transform(time_pd['Hour_cut'])
# Lookup from each label to the integer code assigned to it
label_dict = dict(zip(La.classes_, range(len(La.classes_))))
print(time_pd[['Hour_cut', 'Hour_number']])
print(label_dict)

In [3]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

class MeanEncoder:
    """Mean (target) encoder for categorical columns, fitted out-of-fold.

    For every categorical feature (and, in classification mode, every distinct
    target value) a new column is produced whose value blends the overall
    target mean (the "prior") with the target mean of the row's category:

        encoded = w * prior + (1 - w) * category_mean

    where ``w = prior_weight_func(number of training rows in the category)``.

    Parameters
    ----------
    categorical_features : iterable of column names
        Columns of ``X`` to encode.
    n_splits : int, default 5
        Number of folds used by ``fit_transform``.
    target_type : str, default 'classification'
        ``'classification'`` creates one column per (feature, target value)
        pair named ``'<feature>_pred_<target>'``; any other value is treated
        as ``'regression'`` and creates one ``'<feature>_pred'`` column per
        feature.
    prior_weight_func : dict, callable or None, default None
        Maps a category's row count to the weight given to the prior.
        A dict must provide ``'k'`` and ``'f'`` and yields
        ``1 / (1 + exp((n - k) / f))``; a callable is used as is; anything
        else falls back to ``k = 2, f = 1``.

    Raises
    ------
    KeyError
        If ``prior_weight_func`` is a dict without ``'k'`` or ``'f'``.
    """

    def __init__(self, categorical_features, n_splits=5,
                 target_type='classification', prior_weight_func=None):

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        # nf_name -> list of (prior, per-category table), one entry per fold
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            # A plain closure over k and f replaces the former eval()-built lambda.
            k = prior_weight_func['k']
            f = prior_weight_func['f']
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - k) / f))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    def _feature_specs(self):
        """Return (variable, target, new column name) triples in encoding order."""
        if self.target_type == 'classification':
            return [(variable, target, '{}_pred_{}'.format(variable, target))
                    for variable, target in product(self.categorical_features, self.target_values)]
        return [(variable, None, '{}_pred'.format(variable))
                for variable in self.categorical_features]

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        """Learn the encoding of one feature on ``X_train`` and apply it to both frames.

        Parameters
        ----------
        X_train, X_test : DataFrame
            Frames containing the column ``variable``.
        y_train : Series
            Target values; assigned to ``X_train`` by index.
        variable : column name
            The categorical column to encode.
        target : object or None
            Classification: the target value whose indicator ``y_train == target``
            is averaged.  ``None`` selects regression, averaging ``y_train`` itself.
        prior_weight_func : callable
            Maps per-category row counts to the weight of the prior.

        Returns
        -------
        tuple
            ``(nf_train, nf_test, prior, col_avg_y)``: encoded values for the
            training rows, encoded values for the test rows (categories not
            found in the table get ``prior``), the overall target mean, and
            the per-category table holding the single encoded column.
        """
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()

        # Named aggregation: the dict-of-renames form of SeriesGroupBy.agg
        # raises SpecificationError on pandas >= 1.0.
        col_avg_y = X_train.groupby(variable)['pred_temp'].agg(mean='mean', beta='size')
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y = col_avg_y.drop(columns=['beta', 'mean'])

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        # Only the encoded column is filled, so the feature column is never touched.
        nf_test = X_test.join(col_avg_y, on=variable)[nf_name].fillna(prior).values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """Learn the encodings fold by fold and return ``X`` with the encoded columns.

        Each row is encoded with statistics learned on the other folds, and
        the statistics of every fold are stored in ``self.learned_stats`` for
        later use by ``transform``.

        Parameters
        ----------
        X : DataFrame
            Must contain every column in ``categorical_features``.
        y : Series
            Target; indexed positionally with ``.iloc`` and assigned to rows
            of ``X`` by index.

        Returns
        -------
        DataFrame
            A copy of ``X`` with one new column per encoded feature
            (classification: per feature and target value).
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
            self.target_values = sorted(set(y))
        else:
            skf = KFold(self.n_splits)

        specs = self._feature_specs()
        self.learned_stats = {nf_name: [] for _, _, nf_name in specs}
        for variable, target, nf_name in specs:
            X_new[nf_name] = np.nan
            # Address the column by name rather than assuming it is the last one.
            col_pos = X_new.columns.get_loc(nf_name)
            for large_ind, small_ind in skf.split(y, y):
                _, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                    X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind],
                    variable, target, self.prior_weight_func)
                X_new.iloc[small_ind, col_pos] = nf_small
                self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """Encode new data with the average of the per-fold encodings.

        Parameters
        ----------
        X : DataFrame
            Must contain every column in ``categorical_features``.

        Returns
        -------
        DataFrame
            A copy of ``X`` with the encoded columns added.  Categories missing
            from a fold's table contribute that fold's prior.

        Raises
        ------
        KeyError
            If an encoded column has no entry in ``self.learned_stats``.
        """
        X_new = X.copy()

        for variable, _, nf_name in self._feature_specs():
            fold_stats = self.learned_stats[nf_name]
            encoded = pd.Series(0.0, index=X_new.index)
            for prior, col_avg_y in fold_stats:
                encoded += X_new[[variable]].join(col_avg_y, on=variable)[nf_name].fillna(prior)
            # fit_transform stores exactly one entry per fold, so this equals n_splits.
            X_new[nf_name] = encoded / len(fold_stats)

        return X_new


array([2, 2, 3, 4])

In [None]:
def DeivdedByZero(nominator, denominator):
    """Divide ``nominator`` by ``denominator``, returning 0 when the denominator is 0.

    The numerator is multiplied by 1.0 first so that the division is carried
    out in floating point.
    """
    if denominator != 0:
        return nominator * 1.0 / denominator
    return 0
import datetime

# NOTE(review): `data1_train` is not defined in this cell; it is assumed to be
# a DataFrame created elsewhere with columns 'Idx', 'Listinginfo', 'logInfo',
# 'LogInfo1' and 'LogInfo2', where 'Listinginfo' and 'logInfo' hold comparable
# date/datetime values -- confirm against the loading code.

# Look-back windows in days, counted backwards from each row's 'Listinginfo'.
time_window = [7, 30, 60, 90, 120, 150, 180]
var_list = ['LogInfo1','LogInfo2']
# One row per distinct Idx; the derived feature columns are appended below.
data1GroupbyIdx = pd.DataFrame({
    'Idx':data1_train['Idx'].drop_duplicates()})

for tw in time_window:
    # Start of the window: 'Listinginfo' minus `tw` days.
    data1_train['TruncatedLogInfo'] = data1_train['Listinginfo'].map(
        lambda x: x + datetime.timedelta(-tw))
    # Keep only the log rows dated on or after the start of the window.
    temp = data1_train.loc[data1_train['logInfo'] >= data1_train['TruncatedLogInfo']]
    for var in var_list:
        ct_col = str(var) + '_' + str(tw) + '_ct'
        uq_col = str(var) + '_' + str(tw) + '_uq'
        avg_col = str(var) + '_' + str(tw) + '_avg_ct'

        # count the frequencies of LogInfo1 and LogInfo2
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[ct_col] = data1GroupbyIdx['Idx'].map(
            lambda x: count_stats.get(x, 0))

        # count the distinct values of LogInfo1 and LogInfo2
        Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[uq_col] = data1GroupbyIdx['Idx'].map(
            lambda x: uniq_stats.get(x, 0))

        # calculate the average count of each value in LogInfo1 and LogInfo2;
        # the inputs are the '_ct' / '_uq' columns created just above, read by
        # label (the previous '_count' / '_unique' names were never created).
        data1GroupbyIdx[avg_col] = data1GroupbyIdx[[ct_col, uq_col]].apply(
            lambda row: DeivdedByZero(row[ct_col], row[uq_col]), axis=1)