In [1]:
import sys
import pandas as pd
import numpy as np
import sklearn
import pymongo
import warnings 
warnings.filterwarnings(action='ignore')

# Report the interpreter and library versions used for this run, one per line.
print(
    *(
        f"{label} {version}"
        for label, version in (
            ('python ==> ', sys.version),
            ('pandas ==>', pd.__version__),
            ('numpy ==>', np.__version__),
            ('sklearn ==>', sklearn.__version__),
            ('Pymongo -->', pymongo.__version__),
        )
    ),
    sep='\n',
)

python ==>  3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]
pandas ==> 2.2.3
numpy ==> 1.23.1
sklearn ==> 1.5.2
Pymongo --> 4.10.1


In [2]:
import os 
import re
import glob 
import joblib
import pickle 
import pandas as pd
import numpy as np
from numpy import array
import datetime
from datetime import datetime,timedelta
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.cluster import KMeans
from pymongo import MongoClient
import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
def complete_process(machine_number):
    """Fit and persist a K-Means "recipe" cluster pipeline for one machine.

    Steps performed:
      1. Load production records for equipment ``A01-0<machine_number>`` from
         the ``productionData`` collection (excluding line ``MLT``), keep rows
         with ``timeStamp >= 2025-02-05`` and ``PassOrFail == '1'``.
      2. Load records from the ``environmentData`` collection over the same
         time window.
      3. Attach to every production row the environment row with the nearest
         ``timeStamp`` (``pd.merge_asof`` with ``direction='nearest'``).
      4. Fit a ``StandardScaler`` + ``KMeans`` pipeline on every column whose
         name contains ``'Set'`` plus ``factoryTemperature`` and
         ``factoryHumidity``, and store the labels in ``cluster_no``.
      5. Save the fitted pipeline to
         ``recipe_cluster/environmental_setting_cluster_machine=<n>.pkl`` and
         the distinct (settings, cluster_no) rows to
         ``recipe_data/Recipe_Data_machine=<n>.csv`` (cp949 encoded).

    Parameters
    ----------
    machine_number : int or str
        Machine identifier; interpolated into the equipment number and into
        the output file names.

    Returns
    -------
    None
        Results are printed and written to disk.

    Raises
    ------
    ValueError
        If either collection returns no documents, or no production rows
        remain after filtering.
    """
    server_address = "private_server_address"
    mongo_uri = "mongodb://private_company_client{0}/admin".format(server_address)
    from_date = datetime(2025,1,1,0,0,0)
    analysis_start = '2025-02-05 00:00:00'

    pd.set_option('display.max_columns', None)

    # The duplicate "$and" keys of the original dict literal collapsed to the
    # last one, silently dropping the lineNumber filter. Top-level fields are
    # AND-ed implicitly by MongoDB, so no "$and" is needed at all.
    production_query = {
        "equipNumber": f"A01-0{machine_number}",
        "lineNumber": {"$ne": 'MLT'},
        "timeStamp": {"$gte": from_date},
    }
    environment_query = {"timeStamp": {"$gte": from_date}}

    #### 1.) + 2.) Collect production and environment data
    # The context manager closes the client (and its sockets) even on failure.
    with MongoClient(mongo_uri) as client_update:
        db = client_update["privateDB"]  # change the DB name here
        df_production = pd.DataFrame(list(db['productionData'].find(production_query)))
        df_environment = pd.DataFrame(list(db['environmentData'].find(environment_query)))

    # An empty result has no columns; fail with a clear message instead of a
    # KeyError on 'timeStamp' further down.
    if df_production.empty:
        raise ValueError(f"No production data found for machine {machine_number}")
    if df_environment.empty:
        raise ValueError("No environment data found")

    df_production = (
        df_production[df_production['timeStamp'] >= analysis_start]
        .sort_values(by='timeStamp')
        .reset_index(drop=True)
    )
    df_production = df_production[df_production['PassOrFail'] == '1'].reset_index(drop=True)
    if df_production.empty:
        raise ValueError(
            f"No passing production rows on/after {analysis_start} for machine {machine_number}"
        )

    df_environment = (
        df_environment[df_environment['timeStamp'] >= analysis_start]
        .sort_values(by='timeStamp')
        .reset_index(drop=True)
    )

    #### 3.) Match production with environment rows (no exact 1:1 timestamp
    ####     match is possible, so use the nearest timestamp instead)
    df_merged = pd.merge_asof(df_production, df_environment, on='timeStamp', direction='nearest')
    # Shifted by 9 hours for display only (presumably UTC -> KST; confirm).
    print(df_merged.timeStamp + timedelta(hours=9))

    #### 4.) Build the K-Means cluster model on the input variables
    ####     [Set + Environment] of the merged data
    set_environ_cols = (
        df_merged.columns[df_merged.columns.str.contains('Set')].tolist()
        + ['factoryTemperature', 'factoryHumidity']
    )
    df_set = df_merged[set_environ_cols]
    # NOTE(review): n_clusters equals the number of distinct setting rows, so
    # each distinct row can end up as its own cluster - confirm this is intended.
    n = len(df_set.drop_duplicates())
    pipe1 = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=n, random_state=0))])
    df_merged['cluster_no'] = pipe1.fit_predict(df_set)
    print('Setting Clusters ==> ', df_merged['cluster_no'].nunique())
    print('\n')

    # Make sure the output folders exist before writing into them.
    os.makedirs('recipe_cluster', exist_ok=True)
    os.makedirs('recipe_data', exist_ok=True)
    joblib.dump(pipe1, 'recipe_cluster/' + 'environmental_setting_cluster_machine=' + str(machine_number) + '.pkl')

    #### 5.) Save the recipe analysis data: distinct setting rows labelled
    ####     with their K-Means cluster number
    df_recipe = df_merged[set_environ_cols + ['cluster_no']].drop_duplicates().reset_index(drop=True)
    df_recipe.to_csv('recipe_data/Recipe_Data_machine=' + str(machine_number) + '.csv', index=False, encoding='cp949')
    print(df_recipe.shape)

#### Train and save a separate K-Means cluster model for each of injection molding machines 17 and 18

In [4]:
# Run the full pipeline for machine 17: fits the cluster model and writes its .pkl and .csv outputs.
complete_process(machine_number=17)

0       2025-02-17 08:08:19.084
1       2025-02-17 08:09:01.564
2       2025-02-17 08:09:49.654
3       2025-02-17 08:11:22.955
4       2025-02-17 08:12:05.554
                  ...          
23790   2025-04-15 09:25:25.505
23791   2025-04-15 09:26:29.241
23792   2025-04-15 09:27:28.787
23793   2025-04-15 09:28:28.170
23794   2025-04-15 09:29:27.598
Name: timeStamp, Length: 23795, dtype: datetime64[ns]
Setting Clusters ==>  19578


(19578, 11)


In [5]:
# Run the full pipeline for machine 18: fits the cluster model and writes its .pkl and .csv outputs.
complete_process(machine_number=18)

0       2025-02-05 09:01:05.964
1       2025-02-05 09:02:12.910
2       2025-02-05 09:03:21.031
3       2025-02-05 09:04:29.119
4       2025-02-05 09:05:36.033
                  ...          
23271   2025-04-15 09:29:22.115
23272   2025-04-15 09:30:28.019
23273   2025-04-15 09:30:28.019
23274   2025-04-15 09:31:32.896
23275   2025-04-15 09:31:32.896
Name: timeStamp, Length: 23276, dtype: datetime64[ns]
Setting Clusters ==>  340


(340, 11)
