In [1]:
import pandas as pd 

full_data_info = pd.read_csv('2000零件.csv',encoding ='utf-8')

In [2]:
full_data_info['日期'] = pd.to_datetime(full_data_info['日期'], format='%Y%m%d').dt.strftime('%Y-%m-%d')

In [3]:
# 零件号，仓库，根据20200101-202112月31号订单计算阈值，单条，多条，20230101号拦截订单数量，批次号
# 零件号，仓库，根据20200102-202301月01号订单计算阈值，单条，多条，20230102号拦截订单数量，批次号
# 零件号，仓库，根据20200103-202301月02号订单计算阈值，单条，多条，20230103号拦截订单数量，批次号

# 批次号，订单号，订单数量，零件号，仓库，经销商，上次订单间隔

In [4]:
import numpy as np

def find_outliers_onehot(data):  
    
    if len(data)==1 or len(set(data))==1:
        one_hot_num = float(data.head(1))
        if one_hot_num<=2000:
            thr_sig_num =  one_hot_num*2.5
        else:
            thr_sig_num = one_hot_num*1.5
    else:
        thr_sig_num  = np.nan 
    return thr_sig_num

def find_outliers_3sigma(data):  
    
    if len(data)==1 or len(set(data))==1:
        thr_sig_num = np.nan
    else:
        mean = sum(data) / len(data)  
        std_dev = (sum((x - mean) ** 2 for x in data) / len(data)) ** 0.5  
        thr_sig_num  = mean + 3 * std_dev 
    return thr_sig_num

def find_outliers_xxt(x):  
    
    if len(x)==1 or len(set(x))==1:
        thr_sig_num = np.nan
    else:
        q3 = lambda x: x.quantile(0.75)  
        q1 = lambda x: x.quantile(0.25)  
        thr_sig_num = q3(x) + 1.5 * (q3(x) - q1(x))  
    
    return thr_sig_num

def data_proces_info(df,type_info):
    df.columns = ['零件号','仓库','type',type_info+'_counts']
    df = df[df['type']==0]
    return df.drop(columns= ['type'])

In [5]:
def get_outlier_sys_info(full_data_info,start_time,end_time,test_time):
    
    data_train = full_data_info[(full_data_info['日期']<end_time)&(full_data_info['日期']>=start_time)]
    
    data_test = full_data_info[(full_data_info['日期']<=test_time)&(full_data_info['日期']>=end_time)]

    thr_sigma_info_onehot = data_train.groupby(['零件号' ,'仓库'])['需求数量'].apply(find_outliers_onehot).reset_index()
    thr_sigma_info_onehot = thr_sigma_info_onehot.rename(columns={'需求数量': 'onehot_outlier_values'}) 

    thr_sigma_info_3sigma = data_train.groupby(['零件号' ,'仓库'])['需求数量'].apply(find_outliers_3sigma).reset_index()
    thr_sigma_info_3sigma = thr_sigma_info_3sigma.rename(columns={'需求数量': '3sigma_outlier_values'})  

    thr_sigma_info_xxt = data_train.groupby(['零件号' ,'仓库'])['需求数量'].apply(find_outliers_xxt).reset_index()
    thr_sigma_info_xxt = thr_sigma_info_xxt.rename(columns={'需求数量': 'xxt_outlier_values'})  


    merged_df_onehot_sigma = pd.merge(thr_sigma_info_onehot, thr_sigma_info_3sigma,
                                  on=['零件号' ,'仓库'],how ='left')
    merged_df_full = pd.merge(merged_df_onehot_sigma, thr_sigma_info_xxt,
                                      on=['零件号' ,'仓库'],how ='left')



    data_test_outlier_values = pd.merge(data_test, merged_df_full,
                                      on=['零件号' ,'仓库'],how ='left')
    
    data_test_outlier_onehot = data_test_outlier_values.dropna(subset='onehot_outlier_values')
    data_test_outlier_onehot = data_test_outlier_values.dropna(subset='onehot_outlier_values')
    data_test_outlier_notonehot = data_test_outlier_values.dropna(subset='3sigma_outlier_values')

    data_test_outlier_notonehot['箱型图阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
                                                     <= data_test_outlier_notonehot['xxt_outlier_values'], 1, 0)
    data_test_outlier_notonehot['3sigma阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
                                                     <= data_test_outlier_notonehot['3sigma_outlier_values'], 1, 0)
    data_test_outlier_onehot['onehot拦截情况'] = np.where(data_test_outlier_onehot['需求数量']
                                                 <= data_test_outlier_onehot['onehot_outlier_values'], 1, 0) 

    
    output_3sigma = data_test_outlier_notonehot.groupby(['零件号','仓库','3sigma阈值拦截情况']).size().reset_index().sort_values(by=['零件号','仓库'])
                                                                       
    output_xxt = data_test_outlier_notonehot.groupby(['零件号','仓库','箱型图阈值拦截情况']).size().reset_index().sort_values(by=['零件号','仓库'])

    output_onehot = data_test_outlier_onehot.groupby(['零件号','仓库','onehot拦截情况']).size().reset_index().sort_values(by=['零件号','仓库'])

    

    output_onehot_new=data_proces_info(output_onehot,'one_hot')
    output_3sigma_new=data_proces_info(output_3sigma,'3sigma')
    output_xxt_new=data_proces_info(output_xxt,'xxt')

    df_list = [output_onehot_new,output_3sigma_new,output_xxt_new]  
  
    # 创建一个空的DataFrame，用作合并后的结果  
  
    # 循环遍历每个DataFrame，将其与结果DataFrame进行合并  
    for df in df_list:  
        merged_df_full = merged_df_full.merge(df, on=['零件号','仓库'],how ='left')
        
    merged_df_full['批次'] = end_time.replace('-','')
    
    merged_df_full.columns = ['零件号', '仓库', '单记录阈值', '3sigma阈值',
       '箱形图阈值', '单记录拦截订单数量', '3sigma拦截订单数量', '箱形图拦截订单数量','批次']
    return merged_df_full
    
    

In [6]:
merged_df_full_01 = get_outlier_sys_info(full_data_info,'2022-01-01','2023-01-01','2023-01-31')
merged_df_full_02 = get_outlier_sys_info(full_data_info,'2022-02-01','2023-02-01','2023-02-28')
merged_df_full_03 = get_outlier_sys_info(full_data_info,'2022-03-01','2023-03-01','2023-03-31')
merged_df_full_04 = get_outlier_sys_info(full_data_info,'2022-04-01','2023-04-01','2023-04-30')
merged_df_full_05 = get_outlier_sys_info(full_data_info,'2022-05-01','2023-05-01','2023-05-31')
merged_df_full_06 = get_outlier_sys_info(full_data_info,'2022-06-01','2023-06-01','2023-06-30')


frames = [merged_df_full_01, merged_df_full_02, merged_df_full_03
         ,merged_df_full_04,merged_df_full_05,merged_df_full_06]  
output = pd.concat(frames) 
output.sort_values(by=['零件号',
                       '仓库','批次'],inplace =True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_outlier_notonehot['箱型图阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_outlier_notonehot['3sigma阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_outlier_onehot['

In [9]:
start_time = '2022-01-01'
end_time  = '2023-01-01'
test_time = '2023-01-31'


data_train = full_data_info[(full_data_info['日期']<end_time)&(full_data_info['日期']>=start_time)]
    
data_test = full_data_info[(full_data_info['日期']<=test_time)&(full_data_info['日期']>=end_time)]

thr_sigma_info_onehot = data_train.groupby(['零件号' ,'仓库'])['需求数量'].apply(find_outliers_onehot).reset_index()
thr_sigma_info_onehot = thr_sigma_info_onehot.rename(columns={'需求数量': 'onehot_outlier_values'}) 

thr_sigma_info_3sigma = data_train.groupby(['零件号' ,'仓库'])['需求数量'].apply(find_outliers_3sigma).reset_index()
thr_sigma_info_3sigma = thr_sigma_info_3sigma.rename(columns={'需求数量': '3sigma_outlier_values'})  

thr_sigma_info_xxt = data_train.groupby(['零件号' ,'仓库'])['需求数量'].apply(find_outliers_xxt).reset_index()
thr_sigma_info_xxt = thr_sigma_info_xxt.rename(columns={'需求数量': 'xxt_outlier_values'})  


merged_df_onehot_sigma = pd.merge(thr_sigma_info_onehot, thr_sigma_info_3sigma,
                              on=['零件号' ,'仓库'],how ='left')
merged_df_full = pd.merge(merged_df_onehot_sigma, thr_sigma_info_xxt,
                                  on=['零件号' ,'仓库'],how ='left')



data_test_outlier_values = pd.merge(data_test, merged_df_full,
                                  on=['零件号' ,'仓库'],how ='left')

data_test_outlier_onehot = data_test_outlier_values.dropna(subset='onehot_outlier_values')
data_test_outlier_onehot = data_test_outlier_values.dropna(subset='onehot_outlier_values')
data_test_outlier_notonehot = data_test_outlier_values.dropna(subset='3sigma_outlier_values')

data_test_outlier_notonehot['箱型图阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
                                                 <= data_test_outlier_notonehot['xxt_outlier_values'], 1, 0)
data_test_outlier_notonehot['3sigma阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
                                                 <= data_test_outlier_notonehot['3sigma_outlier_values'], 1, 0)
data_test_outlier_onehot['onehot拦截情况'] = np.where(data_test_outlier_onehot['需求数量']
                                             <= data_test_outlier_onehot['onehot_outlier_values'], 1, 0) 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_outlier_notonehot['箱型图阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_outlier_notonehot['3sigma阈值拦截情况'] = np.where(data_test_outlier_notonehot['需求数量']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_outlier_onehot['

In [10]:
ll = data_test_outlier_notonehot[['零件号','仓库','经销商代码','需求数量','日期','3sigma阈值拦截情况']]
ll = ll[ll['3sigma阈值拦截情况']==0]

In [11]:
test = full_data_info[(full_data_info['日期']<'2023-01-31')]
test.sort_values(by=['零件号','仓库','经销商代码','日期'],inplace =True)
test['Last_Order_Time'] = test.groupby(['零件号','仓库','经销商代码']).日期.shift(1)  
test  = test[['零件号','仓库','经销商代码','需求数量','日期','Last_Order_Time']]
test = test[(test['日期']<=test_time)&(test['日期']>=end_time)].dropna(subset=['Last_Order_Time'])


ll_output = pd.merge(ll,test,on = ['零件号','仓库','经销商代码','需求数量','日期'],how ='left')
ll_output.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.sort_values(by=['零件号','仓库','经销商代码','日期'],inplace =True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Last_Order_Time'] = test.groupby(['零件号','仓库','经销商代码']).日期.shift(1)


Unnamed: 0,零件号,仓库,经销商代码,需求数量,日期,3sigma阈值拦截情况,Last_Order_Time
0,18D945105E,5001,10639,5,2023-01-09,0,2022-08-04
1,56D945105,5001,10639,5,2023-01-09,0,2022-07-04
2,N 90648704,8023,12726,210,2023-01-08,0,2023-01-04
3,56D945105,6000,13572,2,2023-01-09,0,
4,3Q0825236C,8023,13106,5,2023-01-08,0,2022-11-11


In [16]:
# full_data_info[(full_data_info['零件号']=='56D945105')&(full_data_info['仓库']=='6000')
#               &(full_data_info['经销商代码']==13572)]

Unnamed: 0,零件号,日期,需求数量,仓库,vbeln,posnr,经销商代码,需求频次abc分类,价格abc分类,需求数量ABC分类,零件代码第四位,EOP,计划员
14825,56D945105,2023-01-09,2,6000,12841178,5600,13572,A,D,FAST,9.0,1218.0,43
