In [4]:
%%writefile Pretreat_Tools.py

import sys
import os 
module_path = os.path.abspath(os.path.join('..')) 
if module_path not in sys.path: 
    sys.path.append(module_path)
    
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import scipy.stats as st
import statsmodels.api as sm
import tools.Sample_Tools as smpl

# import cpuinfo
# if 'ntel' in cpuinfo.get_cpu_info()['brand_raw']:
# from sklearnex import patch_sklearn, unpatch_sklearn
# unpatch_sklearn() ## NOTE: sklearnex gives no speed-up for linear regression on small data sets; use with caution, a memory leak was observed

from sklearn import linear_model

def neutralize(factor:pd.Series, data, categorical:list=None, logarithmetics:list=None):
    '''Neutralise a factor by regressing it on exposures and keeping the residual.

        :param factor: {pd.Series} -- the factor values (dependent variable)
        :param data: exposure columns (independent variables); a copy is taken,
                     so the caller's object is not modified
        :param categorical: {list} -- columns to expand into dummy variables
        :param logarithmetics: {list} -- columns to replace by their logarithm
        :return: ``factor`` minus the fitted values of a no-intercept linear
                 regression of ``factor`` on the transformed exposures

        Notes carried over from the original author:
        - values in the ``categorical`` columns must be strings;
        - the usual pipeline order is winsorize -> neutralize -> standardize;
        - intended to be applied to a single cross-section.

        A UserWarning is emitted (nothing is re-ordered) when either index is
        not monotonically increasing, because mis-ordered inputs may pair the
        wrong dependent and independent rows.
    '''
    both_sorted = factor.index.is_monotonic_increasing and data.index.is_monotonic_increasing
    if not both_sorted:
        import warnings
        warnings.warn('factor or data should be sorted, 否则有可能会造成会自变量和因变量匹配错误',UserWarning)

    exposures = data.copy()
    # Log-transform the requested columns.
    if logarithmetics is not None:
        exposures[logarithmetics] = exposures[logarithmetics].agg('log')
    # One-hot encode the requested columns.
    if categorical is not None:
        exposures = pd.get_dummies(exposures, columns=categorical)

    # No intercept is fitted.
    regression = linear_model.LinearRegression(fit_intercept=False)
    regression.fit(exposures, factor)
    fitted = regression.predict(exposures)

    return factor - fitted

    

def winsorize_by_quantile(obj, floor=0.025, upper=0.975, column=None, drop=True):
    """
       Restrict data to the range between two quantiles.

       :param obj:{pd.DataFrame | pd.Series}
       :param floor:{float} -- lower quantile (default 0.025)
       :param upper:{float} -- upper quantile (default 0.975)
       :param column:{str} -- when obj is a DataFrame, the column to process.
       :param drop:{bool} -- what to do with values outside the quantile range,
                            True: return only the rows inside the range
                                  (NaN fails both comparisons, so it is dropped too);
                            False: overwrite out-of-range values with the boundary
                                   value. This modifies ``obj`` in place and
                                   returns that same object.
       :raises AssertionError: obj is a DataFrame and ``column`` is falsy
       :raises TypeError: obj is neither a Series nor a DataFrame
    """
    is_series = isinstance(obj, pd.Series)
    if not is_series and not isinstance(obj, pd.DataFrame):
        raise TypeError('obj must be series or dataframe')

    if is_series:
        bounds = obj.quantile([floor, upper])
        low, high = bounds[floor], bounds[upper]
        if not drop:
            obj[obj < low] = low
            obj[obj > high] = high
            return obj
        inside = (obj >= low) & (obj <= high)
        return obj[inside]

    assert column, 'COLUMN CANT be NONE when obj is dataframe'
    bounds = obj[column].quantile([floor, upper])
    low, high = bounds[floor], bounds[upper]
    if not drop:
        obj.loc[obj[column] < low, column] = low
        obj.loc[obj[column] > high, column] = high
        return obj
    inside = (obj[column] >= low) & (obj[column] <= high)
    return obj[inside]
    
def _mad_bounds(values, n):
    """Return ``(floor, upper)`` = median -/+ n * 1.483 * MAD of ``values``, ignoring NaN."""
    clean = values.dropna()
    median = np.median(clean)
    mad = np.median((clean - median).abs())
    # MAD is scaled by 1.483 so that it acts as an estimate of the standard
    # deviation (constant kept exactly as in the original implementation).
    mad_e = 1.483 * mad
    return median - n * mad_e, median + n * mad_e


def winsorize_by_mad(obj, n=3, column=None, drop=True):
    """
       Restrict data to median +/- n scaled median-absolute-deviations.

       :param obj:{pd.DataFrame | pd.Series}
       :param n:{int | float} -- number of scaled MADs allowed around the median
       :param column:{str} -- when obj is a DataFrame, the column to process.
                             The median and MAD are computed from this column
                             only; other columns (numeric or not) do not
                             influence the bounds.
       :param drop:{bool} -- what to do with values outside the range,
                            True: return only the rows inside the range
                                  (rows whose value is NaN are kept);
                            False: overwrite out-of-range values with the boundary
                                   value. This modifies ``obj`` in place and
                                   returns that same object.
       :raises AssertionError: obj is a DataFrame and ``column`` is falsy
       :raises TypeError: obj is neither a Series nor a DataFrame
    """
    if isinstance(obj, pd.Series):
        floor, upper = _mad_bounds(obj, n)
        if drop:
            return obj[(obj >= floor) & (obj <= upper) | obj.isna()]
        obj[obj < floor] = floor
        obj[obj > upper] = upper
        return obj

    if isinstance(obj, pd.DataFrame):
        assert column, 'COLUMN CANT be NONE when obj is dataframe'
        # Bounds come from the target column alone; the previous code took the
        # MAD over every column of the frame.
        floor, upper = _mad_bounds(obj[column], n)
        if drop:
            target = obj[column]
            return obj[(target >= floor) & (target <= upper) | target.isna()]
        obj.loc[obj[column] < floor, column] = floor
        obj.loc[obj[column] > upper, column] = upper
        return obj

    raise TypeError('obj must be series or dataframe')

# Standardisation (z-score)
def standardize(data, multi_code=False):
    """Return ``(data - mean) / std`` of ``data``.

    :param data: pandas object to standardise
    :param multi_code:{bool} -- when True, the z-score is computed separately
                                for each group of index level 1 (the result is
                                assembled without adding group keys); when
                                False, it is computed over ``data`` as a whole.
    """
    def _zscore(block):
        return (block - block.mean())/block.std()

    if not multi_code:
        return _zscore(data)
    return data.groupby(level=1, group_keys=False).apply(_zscore)

def _quantile_labels(vals, box_count, labels_):
    """Return one bin label per element of ``vals``, or None if there are too few distinct values."""
    # NaN is not a usable value for binning: it must not count towards the
    # number of distinct values, and it cannot serve as a dict key because
    # NaN != NaN (the lookup would raise KeyError).
    distinct = vals.dropna().unique()
    if box_count > len(distinct):
        return None
    # Bin the distinct values (not the rows) so repeated values cannot
    # produce duplicate quantile edges.
    bins = pd.qcut(distinct, box_count, labels=labels_, retbins=False)
    lookup = dict(zip(distinct, bins))
    # Rows whose value is missing get a missing label.
    return [lookup.get(v, np.nan) for v in vals]


def binning(df, deal_column:str,box_count:int, labels=None, inplace=True):
    """
       Quantile binning: adds a column named "group_label" holding each row's bin label.

       :param df:{pd.DataFrame}
       :param deal_column:{str} -- name of the column to bin
       :param box_count:{int} -- number of bins
       :param labels:{list} -- bin labels, ordered from the lowest bin to the
                              highest (default: None). By default the labels
                              are the bin numbers in reverse order, i.e. the
                              smallest values get label ``box_count`` and the
                              largest get label 1.
       :param inplace:{bool} -- True (default): add the column to ``df`` itself
                               and return ``df``; False: return a new frame.
       :return: {pd.DataFrame}

       Bins are derived from the distinct non-missing values of the column.
       If there are fewer distinct non-missing values than ``box_count``
       (e.g. many zeros or NaNs) the task is treated as failed and
       "group_label" is set to None for every row. Rows whose value is NaN
       receive a NaN label.

       :raises AssertionError: ``df`` is not a DataFrame, or ``labels`` does not
                               contain exactly ``box_count`` entries
    """
    assert isinstance(df, pd.DataFrame), 'df必须为dataframe'
    if labels is not None:
        assert len(labels)==box_count, 'labels的数量必须与分箱数相等'
        labels_ = labels
    else:
        # box_count, box_count-1, ..., 1
        labels_ = np.arange(box_count, 0, -1)

    res = _quantile_labels(df[deal_column], box_count, labels_)

    if inplace:
        df['group_label'] = res
        return df
    return df.assign(group_label=res)



Overwriting Pretreat_Tools.py


In [100]:
import QUANTAXIS as QA
import base.JuUnits as u
from QUANTAXIS.QAUtil.QAParameter import MARKET_TYPE, RUNNING_ENVIRONMENT, ORDER_DIRECTION
# Request daily-frequency (frequence='day') quotations for five stock codes on
# the STOCK_CN market between 2010-12-05 and 2020-12-31, reading from the
# MONGO data source and asking for the DATASTRUCT output format.
data = QA.QA_quotation(['000001','000002', '000004', '000005', '000006'], '2010-12-05', '2020-12-31', source=QA.DATASOURCE.MONGO,
                               frequence='day', market=MARKET_TYPE.STOCK_CN, 
                               output=QA.OUTPUT_FORMAT.DATASTRUCT)
# Display the underlying frame; per the recorded output it is indexed by
# (date, code) with open/high/low/close/volume/amount columns.
data.data

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,amount
date,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-12-06,000001,16.60,16.88,16.50,16.74,217923.0,3.642054e+08
2010-12-06,000002,8.51,8.69,8.45,8.49,982345.0,8.443094e+08
2010-12-06,000004,12.57,12.57,12.13,12.32,7599.0,9.389200e+06
2010-12-06,000005,3.89,3.93,3.87,3.89,63534.0,2.476463e+07
2010-12-06,000006,7.96,8.37,7.80,8.25,511592.0,4.185568e+08
...,...,...,...,...,...,...,...
2020-12-31,000001,19.21,19.58,19.02,19.34,924503.0,1.781736e+09
2020-12-31,000002,28.29,28.80,28.24,28.70,651991.0,1.862538e+09
2020-12-31,000004,20.69,21.09,20.31,20.70,22304.0,4.619703e+07
2020-12-31,000005,2.50,2.53,2.50,2.53,67806.0,1.701545e+07


In [206]:
# Toy frame for trying out the pre-treatment helpers: 'fa' is the factor,
# 'ma' a numeric exposure and 'hy' a category column.
# NOTE(review): 'hy' mixes strings ("5", "1", ...) with ints (8, 9, 10), while
# neutralize()'s docstring says categorical values must be strings - confirm
# before using 'hy' as a categorical column.
a = pd.DataFrame({'fa':[1,2,3,4,5,6,7,8,9,10],'ma':[10000,120000,30000,640000,110000,240000,500000,8,9,10],'hy':["5","5","1","2","3","4","5",8,9,10]}, index=["a","b","c","d","e","f","g",'h','i','j'])
# dummy = sm.categorical(a.hy, drop=True)
# pd.get_dummies(a,['hy'])

# a = pd.get_dummies(a,['hy'])
# y = a['fa']
# X = a.iloc[:,1:]
# model = sm.OLS(y,X)
# results = model.fit()
# y_fitted = results.fittedvalues
# y_ = y - y_fitted
# y_
# a

#neutralize(a['fa'], a.iloc[:,1:],categorical=['hy'],logarithmetics=['ma'])
# a['fa'].quantile(.5)




Unnamed: 0,fa,ma,hy,group
a,1,10000,5,a
b,2,120000,5,a
c,3,30000,1,a
d,4,640000,2,b
e,5,110000,3,b
f,6,240000,4,c
g,7,500000,5,c
h,8,8,8,d
i,9,9,9,d
j,10,10,10,d


In [None]:
# NOTE(review): this only references the bound method without calling it, so
# the cell would display the method object; a call such as
# a['fa'].quantile(.5) was presumably intended.
a['fa'].quantile

In [120]:
# Evaluate weight_half_life(x, 9) for x = 1..20.
# NOTE(review): weight_half_life is not defined in the visible cells; the
# recorded output is 0.5 at x=9 and 0.25 at x=18, which is consistent with
# 0.5 ** (x / 9) - confirm against its definition.
list(map(lambda x:weight_half_life(x,9), np.linspace(1, 20, 20)))

[0.92587471,
 0.85724398,
 0.79370053,
 0.73486725,
 0.680395,
 0.62996052,
 0.58326452,
 0.54002987,
 0.5,
 0.46293736,
 0.42862199,
 0.39685026,
 0.36743362,
 0.3401975,
 0.31498026,
 0.29163226,
 0.27001493,
 0.25,
 0.23146868,
 0.214311]

In [144]:
############## Lagrange-multiplier style example: constrained minimisation with SLSQP
 
# Objective function:
def func(args):
    """Build the objective to minimise.

    ``args`` is accepted but not used. The returned callable takes a sequence
    ``x`` and evaluates
    60 - 10*x[0] - 4*x[1] + x[0]**2 + x[1]**2 - x[0]*x[1].
    """
    def objective(x):
        return 60 - 10*x[0] - 4*x[1] + x[0]**2 + x[1]**2 - x[0]*x[1]
    # Alternative objective that was tried: 10 - x[0]**2 - x[1]**2
    return objective
 
# Constraints (equality and inequality constraints are both possible)
def con(args):
    """Build the constraint list for the optimiser.

    ``args`` is accepted but not used. Returns a one-element list holding an
    equality constraint whose function is x[0] + x[1] - 8.
    """
    def sum_minus_eight(x):
        return x[0]+x[1]-8

    # Alternative set that was tried:
    #   {'type': 'ineq', 'fun': lambda x: x[1]-x[0]**2},
    #   {'type': 'eq', 'fun': lambda x: x[0]+x[1]}
    equality = {'type': 'eq', 'fun': sum_minus_eight}
    return [equality]
 

# Both factory functions ignore their argument, so empty tuples are passed.
args = ()
args1 = ()
cons = con(args1)
x0 = np.array((2.0, 1.0))  # Initial guess. The choice matters: the solver can easily converge to a different extremum, so try several starting points.

# Solve
res = minimize(func(args), x0, method='SLSQP', constraints=cons)
print(res.success)
print("x1=",res.x[0],";  x2=",res.x[1])
# Prints the objective value at the solution (the recorded run gives x ~ (5, 3), value ~ 17).
print("最优解为：",res.fun)


True
x1= 4.999999943481969 ;  x2= 3.000000056518032
最优解为： 17.000000000000007
