In [1]:
import sys
import pandas as pd
import numpy as np
import sklearn
import pymongo
import warnings 
warnings.filterwarnings(action='ignore')

# Report the interpreter and library versions this notebook was run with.
for label, version in (
    ('python ==> ', sys.version),
    ('pandas ==>', pd.__version__),
    ('numpy ==>', np.__version__),
    ('sklearn ==>', sklearn.__version__),
    ('Pymongo -->', pymongo.__version__),
):
    print(label, version)

python ==>  3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]
pandas ==> 2.2.3
numpy ==> 1.23.1
sklearn ==> 1.5.2
Pymongo --> 4.10.1


In [2]:
import os 
import re
import glob 
import joblib
import pickle 
from scipy import stats
import pandas as pd
import numpy as np
from numpy import array
import datetime
from datetime import datetime, timedelta, timezone
from zoneinfo import ZoneInfo
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.cluster import KMeans
from pymongo import MongoClient
import warnings 
warnings.filterwarnings(action='ignore')

#### 1.) 실시간 수집 데이터에 대한 K-Means 예측 기반 InjectionTime/CoolingTime 추론 함수 

In [4]:
# Equipment code -> machine number used in the model / recipe file names.
_MACHINE_BY_EQUIP = {'A01-017': 17, 'A01-018': 18}


def _fetch_latest(collection, query, limit=200):
    """Return the `limit` newest documents (by timeStamp) matching `query` as a DataFrame."""
    cursor = collection.find(query).sort("timeStamp", -1).limit(limit)
    return pd.DataFrame(list(cursor)).reset_index(drop=True)


def _shift_hours(frame, columns, hours=9):
    """Add `hours` to every listed datetime column of `frame` (in place) and return it.

    NOTE(review): presumably this converts stored UTC times to KST - confirm.
    """
    offset = timedelta(hours=hours)
    for column in columns:
        frame[column] = frame[column] + offset
    return frame


def _load_realtime_frames():
    """Fetch today's production rows (machines 17/18) and recent environment rows.

    Returns:
        (df_production, df_environment), both shifted by +9 hours and sorted
        ascending by timeStamp.

    Raises:
        KeyError: if a query returns no documents (the expected columns are
            then missing from the empty frame).
    """
    # NOTE(review): connection details are hard-coded placeholders; consider
    # reading them from configuration / environment variables.
    server_address = "private_server_address"
    client = MongoClient("mongodb://private_company_client@{0}/admin".format(server_address))
    try:
        db = client["privateDB"]
        production_col = db['productionData']
        environment_col = db['environmentData']

        # UTC and KST (UTC+9, no DST) are fixed offsets, so no tz database
        # lookup is needed to compute "midnight today" in each zone.
        kst = timezone(timedelta(hours=9))
        utc_now = datetime.now(timezone.utc)
        kst_now = utc_now.astimezone(kst)
        from_date_UTC = datetime(utc_now.year, utc_now.month, utc_now.day, tzinfo=timezone.utc)
        from_date_KST = datetime(kst_now.year, kst_now.month, kst_now.day, tzinfo=kst)

        def production_since(start):
            return _fetch_latest(production_col, {
                "equipNumber": {"$in": ["A01-017", "A01-018"]},
                "lineNumber": {"$ne": "MLT"},
                "timeStamp": {"$gte": start},
            })

        # Query with both day boundaries and keep whichever returns more rows.
        df_production_UTC = production_since(from_date_UTC)
        df_production_KST = production_since(from_date_KST)
        if len(df_production_KST) > len(df_production_UTC):
            df_production, from_date = df_production_KST, from_date_KST
        else:
            df_production, from_date = df_production_UTC, from_date_UTC

        # Environment data: look back five more days from the chosen boundary.
        df_environment = _fetch_latest(
            environment_col, {"timeStamp": {"$gte": from_date - timedelta(days=5)}})
    finally:
        # Always release the connection, even when a query fails.
        client.close()

    df_production = _shift_hours(df_production, ['timeStamp', 'ReceivedDateTime', 'Ymdate'])
    df_production = df_production.sort_values(by='timeStamp').reset_index(drop=True)
    df_environment = _shift_hours(df_environment, ['timeStamp', 'ReceivedDateTime'])
    df_environment = df_environment.sort_values(by='timeStamp').reset_index(drop=True)
    return df_production, df_environment


def _merge_environment(df_production, df_environment, equip_number):
    """Attach the nearest-in-time environment row to each production row of one machine."""
    machine_rows = df_production[df_production['equipNumber'] == equip_number].reset_index(drop=True)
    merged = pd.merge_asof(machine_rows, df_environment, on='timeStamp', direction='nearest')
    return merged.sort_values(by='timeStamp').reset_index(drop=True)


def _sample_setting(recipe_row, prefix, label, max_std=2.0):
    """Pick a time setting from a one-row recipe frame.

    Draws uniformly from mean +/- 1 std when the std is non-zero and at most
    `max_std`; otherwise returns the mean unchanged.
    """
    mean_value = recipe_row[prefix + '_mean'].iloc[-1]
    std_value = recipe_row[prefix + '_std'].iloc[-1]
    if std_value != 0 and std_value <= max_std:
        print('generate ' + label + ' mean ± 1std')
        return np.random.uniform(mean_value - std_value, mean_value + std_value)
    return mean_value


def _jitter(predicted, current, max_gap=5, scale=0.5):
    """Add N(0, scale) noise when the prediction equals the current setting or is more than `max_gap` away."""
    if predicted == current or np.abs(predicted - current) > max_gap:
        # A single normal draw has the same distribution as picking one value
        # at random out of 100 independent draws.
        return predicted + np.random.normal(loc=0.0, scale=scale)
    return predicted


def _predict_from_recipe(df_input):
    """Infer (injectiontime, coolingtime) for the matched rows and update the recipe CSV.

    `df_input` holds the merged rows of one workingNumber sorted ascending by
    timeStamp; the last row is treated as the current state throughout.

    Raises:
        ValueError: if the recipe data has no row for the predicted cluster.
    """
    print('workingNumber Exist')
    equip_number = df_input['equipNumber_x'].iloc[-1]
    # The rows were filtered to machines 17/18 upstream; 17 stays the default
    # because the previous `17 or 18` expression always evaluated to 17.
    machine_number = _MACHINE_BY_EQUIP.get(equip_number, 17)
    print('machine_number =', machine_number)

    # 5.) Predict the cluster with the machine's trained K-Means model.
    set_environ_cols = [col for col in df_input.columns if 'Set' in col] + ['factoryTemperature', 'factoryHumidity']
    # NOTE: joblib.load unpickles the file - only load trusted model files.
    load_model = joblib.load('recipe_cluster/' + 'environmental_setting_cluster_machine=' + str(machine_number) + '.pkl')
    clusters = load_model.predict(df_input[set_environ_cols])
    df_input['cluster_no'] = clusters      # each row keeps its own cluster label
    predicted_cluster = clusters[-1]       # cluster of the most recent row

    recipe_path = 'recipe_data/Recipe_Data_machine=' + str(machine_number) + '.csv'
    with pd.read_csv(recipe_path, encoding='cp949', chunksize=1000) as reader:
        load_recipe_data = pd.concat(list(reader), ignore_index=True)

    # 6.) Select the recipe rows that belong to the predicted cluster.
    print('Search via cluster')
    choose_data = load_recipe_data[load_recipe_data['cluster_no'] == predicted_cluster].reset_index(drop=True)
    if choose_data.empty:
        raise ValueError('no recipe data for cluster {0} (machine {1})'.format(predicted_cluster, machine_number))

    time_cols = ['InjectionTimeSet_mean', 'CoolingTimeSet_mean']
    current_injection = df_input['InjectionTimeSet_mean'].iloc[-1]
    current_cooling = df_input['CoolingTimeSet_mean'].iloc[-1]
    if len(choose_data) > 1:
        # Several candidates: keep the one whose injection/cooling means are
        # closest (L1 distance) to the current settings.
        print('calculate value diffs')
        target = df_input[time_cols].values[-1]
        total_diffs = np.abs(choose_data[time_cols].values - target).sum(axis=1)
        choose_data = choose_data.iloc[[np.argmin(total_diffs)]].reset_index(drop=True)

    # 7.) Draw within mean +/- std when the std is non-zero and small enough.
    injectiontime = _sample_setting(choose_data, 'InjectionTimeSet', 'injectiontime')
    coolingtime = _sample_setting(choose_data, 'CoolingTimeSet', 'coolingtime')

    # 8.) Append the current rows to the recipe data and persist it.
    print('Update Recipe Data')
    update_inputs = df_input[load_recipe_data.columns.tolist()]
    save_data = pd.concat([load_recipe_data, update_inputs], axis=0).drop_duplicates().reset_index(drop=True)
    save_data.to_csv(recipe_path, encoding='cp949', index=False)

    # 9.) Second correction against the currently collected settings.
    injectiontime = _jitter(injectiontime, current_injection)
    coolingtime = _jitter(coolingtime, current_cooling)
    print('current set==>', [current_injection, current_cooling])
    print('pred set==>', [np.round(injectiontime, 3), np.round(coolingtime, 3)])
    return injectiontime, coolingtime


def recipe_analysis(workingNumber):
    """Infer InjectionTime / CoolingTime for a working number from live data.

    Steps: load today's production rows for machines 17/18 and recent
    environment rows from MongoDB, merge them by nearest timestamp, find the
    rows of `workingNumber`, and derive the settings from the machine's
    K-Means cluster and stored recipe data. When the working number is not
    found, the mean of the last 30 production rows is returned instead.

    Args:
        workingNumber: value compared against the merged `workingNumber_x` column.

    Returns:
        dict with keys "InjectionTime" and "CoolingTime" (floats rounded to 3 decimals).

    Raises:
        KeyError: if no production/environment documents were returned.
        ValueError: if the recipe data has no row for the predicted cluster.
    """
    # 1-3.) Collect production and environment data.
    df_production, df_environment = _load_realtime_frames()

    # 4.) Merge settings with environment info per machine.
    df_merged_17 = _merge_environment(df_production, df_environment, 'A01-017')
    df_merged_18 = _merge_environment(df_production, df_environment, 'A01-018')
    print('machine 17 ==> ', df_merged_17.shape)
    print('machine_18 ==>', df_merged_18.shape)

    # Look for the working number on machine 17 first, then on machine 18.
    df_input = df_merged_17[df_merged_17['workingNumber_x'] == workingNumber].reset_index(drop=True)
    if df_input.empty:
        df_input = df_merged_18[df_merged_18['workingNumber_x'] == workingNumber].reset_index(drop=True)

    if df_input.empty:
        # Fallback: mean of the last 30 production rows.
        print('workingNumber Not Exist ==> recent 30 mean ')
        coolingtime = df_production['CoolingTimeSet_mean'].tail(30).mean()
        injectiontime = df_production['InjectionTimeSet_mean'].tail(30).mean()
    else:
        injectiontime, coolingtime = _predict_from_recipe(df_input)

    return {"InjectionTime": float(np.round(injectiontime, 3)),
            "CoolingTime": float(np.round(coolingtime, 3))}

#### 2.) Prediction 함수 작동 예시 

In [5]:
def prediction(workingNumber):
    """Return the predicted InjectionTime / CoolingTime for `workingNumber`.

    Runs `recipe_analysis`; if it raises an ordinary exception, the error is
    reported and the result of `load_merged_production()` is returned instead.
    KeyboardInterrupt / SystemExit are not intercepted.

    NOTE(review): `load_merged_production` is not defined in the visible part
    of this notebook - confirm it exists before relying on the fallback.
    """
    try:
        return recipe_analysis(workingNumber)
    except Exception as exc:
        # Report why the primary path failed instead of swallowing it silently.
        print('recipe_analysis failed ==>', repr(exc))
        return load_merged_production()

In [7]:
# Example call: in the recorded run below, this workingNumber is not found in
# the merged data, so the mean of the most recent 30 production rows is returned.
prediction(workingNumber='250425122214453')

machine 17 ==>  (119, 69)
machine_18 ==> (81, 69)
workingNumber Not Exist ==> recent 30 mean 


{'InjectionTime': 290.392, 'CoolingTime': 76.599}

In [8]:
# Example call: in the recorded run below, this workingNumber is found on
# machine 17, so the cluster-based recipe path produces the prediction.
prediction(workingNumber='250502100105147')

machine 17 ==>  (119, 69)
machine_18 ==> (81, 69)
workingNumber Exist
machine_number = 17
Search via cluster
calculate value diffs
generate injectiontime mean ± 1std
generate coolingtime mean ± 1std
Update Recipe Data
current set==> [244.83673469387756, 57.42857142857143]
pred set==> [245.202, 56.746]


{'InjectionTime': 245.202, 'CoolingTime': 56.746}