In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR, OneClassSVM

In [4]:
# Load the lamp-anomaly training records and preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer=r"C:/Users/User/Desktop/leo/competition_leo/決賽/data/anomaly_train1.csv"
)
df.head(n=5)

Unnamed: 0,date,oven_id,layer_id,lamp_id,anomaly_accumulation_hour,anomaly_total_number
0,2021/12/27,1B0,5,26_49,5116,2
1,2021/12/27,1C0,3,45_91,4699,2
2,2021/12/27,1D0,14,64,3241,1
3,2021/12/27,1E0,1,96,4138,1
4,2021/12/27,1E0,8,51,3818,1


## 資料處理

#### lamp_id 紀錄個數與 anomaly_total_number 不合 -> 由 lamp_id 推估 total_number
- 不同的那一層會不會常常發生 -> 紀錄有誤()

In [30]:
# Derive the lamp count of each record from lamp_id itself: the ids are
# underscore-separated (e.g. "26_49" -> 2 lamps, "64" -> 1 lamp).
df["total_number"] = [len(lamp_ids.split("_")) for lamp_ids in df["lamp_id"].values]

In [35]:
# Show the records whose recorded anomaly_total_number disagrees with the
# count derived from lamp_id.
df.loc[df["total_number"].ne(df["anomaly_total_number"])]

Unnamed: 0,date,oven_id,layer_id,lamp_id,anomaly_accumulation_hour,anomaly_total_number,total_number
184,2022/3/26,1B0,1,57_74,7085,3,2
231,2022/4/7,2E0,11,6_13_41,3459,4,3
252,2022/4/14,2B0,4,18_29_115,5354,4,3


#### 異常發生時間比總時間還長 -> 將資料刪除

In [38]:
# Load the per-oven/per-layer accumulated operating hours and preview them.
hour1df = pd.read_csv(
    filepath_or_buffer=r"C:/Users/User/Desktop/leo/competition_leo/決賽/data/accumulation_hour1.csv"
)
hour1df.head(n=5)

Unnamed: 0,date,oven_id,layer_id,accumulation_hour
0,2022/5/4,1B0,1,7731
1,2022/5/4,1B0,2,6388
2,2022/5/4,1B0,3,7792
3,2022/5/4,1B0,4,6942
4,2022/5/4,1B0,5,7361


In [43]:
# Attach each layer's accumulated hours to its anomaly records; the left join
# keeps every anomaly row even when no matching oven/layer entry exists.
dfjoinhour1df = df.merge(hour1df, how="left", on=["oven_id", "layer_id"])

In [50]:
# Index labels of rows whose anomaly happened "later" than the layer's total
# accumulated hours; rows with a missing accumulation_hour compare False and
# are therefore not selected.
dropindex = dfjoinhour1df.loc[
    dfjoinhour1df["anomaly_accumulation_hour"].gt(dfjoinhour1df["accumulation_hour"])
].index

In [54]:
# Remove the inconsistent rows collected in dropindex.
dfjoinhour1df = dfjoinhour1df.drop(index=dropindex)

(347, 9)

## 模型建立

In [64]:
# Build, for every oven, a model that predicts the number of failed lamps.
# Each model maps the accumulated hours at which an anomaly was recorded to the
# running total of failed lamps for that oven.
resmodel = {}
ovenList = dfjoinhour1df["oven_id"].unique().tolist()

for oven in ovenList:
    # Anomaly records of this oven, ordered by accumulated hours.
    oven_events = dfjoinhour1df.loc[dfjoinhour1df["oven_id"] == oven].sort_values(
        by=["anomaly_accumulation_hour"], ignore_index=True
    )
    # scikit-learn expects a 2-D feature matrix, hence the (n, 1) reshape.
    hours = oven_events["anomaly_accumulation_hour"].to_numpy().reshape(-1, 1)
    cumulative_failures = oven_events["total_number"].cumsum()
    resmodel[oven] = SVR(kernel="poly").fit(hours, cumulative_failures)

In [72]:
# Manual override: oven "2G0" is mapped to the constant 0 rather than a fitted model.
resmodel["2G0"]=0
# i.e. the prediction for 2G0 is taken to be 0

In [73]:
# For every oven/layer pair collect:
#   "accumulation" - accumulated hours of the latest recorded anomaly,
#   "freq"         - a conservative (short) interval between anomalies: mean minus
#                    one standard deviation of the gaps (the original note describes
#                    this as covering 84% of the anomaly data),
#   "avgcount"     - rounded mean number of lamps per anomaly.
# Layers with three or fewer records get all-zero placeholders.

resStd = {}
for ovenid in resmodel:
    oven_events = dfjoinhour1df.loc[dfjoinhour1df["oven_id"] == ovenid].sort_values(
        by=["anomaly_accumulation_hour"], ignore_index=True
    )

    for layer in range(1, 19):
        key = ovenid + "_" + str(layer)
        layer_events = oven_events.loc[oven_events["layer_id"] == layer]

        # Too few anomalies to estimate an interval: store placeholders and move on.
        if len(layer_events) <= 3:
            resStd[key] = {"accumulation": 0, "freq": 0, "avgcount": 0}
            continue

        anomaly_hours = layer_events["anomaly_accumulation_hour"]
        gaps = anomaly_hours.diff().dropna().values
        lower_bound = np.mean(gaps) - np.std(gaps)

        # A negative bound means anomalies are rare but arrive in bursts once they
        # start, so fall back to the smallest observed gap.
        interval = np.min(gaps) if lower_bound < 0 else lower_bound

        resStd[key] = {
            "accumulation": anomaly_hours.iloc[-1],
            "freq": interval,
            "avgcount": round(np.mean(layer_events["total_number"].values)),
        }

In [74]:
# Display the per-oven/per-layer summary dictionary built in the previous cell.
resStd

{'1B0_1': {'accumulation': 7703, 'freq': 170.14809899439626, 'avgcount': 2},
 '1B0_2': {'accumulation': 6293, 'freq': 246.59088343936088, 'avgcount': 2},
 '1B0_3': {'accumulation': 7599, 'freq': 155.03625744482122, 'avgcount': 3},
 '1B0_4': {'accumulation': 6846, 'freq': 17.0, 'avgcount': 1},
 '1B0_5': {'accumulation': 7333, 'freq': 183.00274728096997, 'avgcount': 2},
 '1B0_6': {'accumulation': 7534, 'freq': 311.4425161205584, 'avgcount': 3},
 '1B0_7': {'accumulation': 0, 'freq': 0, 'avgcount': 0},
 '1B0_8': {'accumulation': 7469, 'freq': 150.87543895233426, 'avgcount': 3},
 '1B0_9': {'accumulation': 7183, 'freq': 36.0, 'avgcount': 2},
 '1B0_10': {'accumulation': 0, 'freq': 0, 'avgcount': 0},
 '1B0_11': {'accumulation': 7340, 'freq': 159.5487673305309, 'avgcount': 3},
 '1B0_12': {'accumulation': 7417, 'freq': 312.9652532960105, 'avgcount': 3},
 '1B0_13': {'accumulation': 0, 'freq': 0, 'avgcount': 0},
 '1B0_14': {'accumulation': 7025, 'freq': 92.53876770530883, 'avgcount': 3},
 '1B0_15'

## reference

In [76]:
# Load the two accumulated-hour snapshots (file 0 and file 1) in one pass.
hour0df, hour1df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:/Users/User/Desktop/leo/competition_leo/決賽/data/accumulation_hour0.csv",
        r"C:/Users/User/Desktop/leo/competition_leo/決賽/data/accumulation_hour1.csv",
    )
)

In [80]:
# Give the two hour columns distinct names so they can sit side by side after
# merging: snapshot 0 becomes "stime", snapshot 1 becomes "etime".
hour0df = hour0df.rename({"accumulation_hour": "stime"}, axis="columns")
hour1df = hour1df.rename({"accumulation_hour": "etime"}, axis="columns")

In [84]:
# Pair the two snapshots per oven/layer and compute how many hours lie between
# them (etime - stime), then preview the result.
testdf = hour0df.merge(hour1df, how="left", on=["oven_id", "layer_id"]).assign(
    interval=lambda paired: paired["etime"] - paired["stime"]
)
testdf.head()

Unnamed: 0,date_x,oven_id,layer_id,stime,date_y,etime,interval
0,2022/5/4,1B0,1,7011,2022/5/4,7731,720
1,2022/5/4,1B0,2,5668,2022/5/4,6388,720
2,2022/5/4,1B0,3,7072,2022/5/4,7792,720
3,2022/5/4,1B0,4,6222,2022/5/4,6942,720
4,2022/5/4,1B0,5,6641,2022/5/4,7361,720
