In [None]:
import pandas as pd
import sqlalchemy
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sqlalchemy.exc import SQLAlchemyError
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *
from analyze_visualisation import *

In [None]:
path = "/home"

# Build one path from the base directory and two further components,
# then show the joined result.
joined_path = os.path.join(path, "User/Desktop", "file.txt")
print(joined_path)

In [None]:
from urllib.parse import quote

# Maps each table name from `dbtables` to the frame returned by its query.
data = {}

try:
    # Percent-encode the credentials so characters such as '@', '/' or ':'
    # cannot be misread as URL delimiters when SQLAlchemy parses the string.
    sqlalchemy_engine = (
        "oracle+cx_oracle://"
        + quote(username, safe="") + ":" + quote(password, safe="")
        + "@" + dbhostname + "/?service_name=" + service_name
    )
    engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
    for table, query in zip(dbtables, querys):
        data[table] = pd.read_sql(query, engine)
except SQLAlchemyError as e:
    # Show the database message, then re-raise: continuing would leave `data`
    # only partially filled and make the following cells fail in confusing ways.
    print(e)
    raise

In [None]:
data = drop_unused_columns(data)

In [None]:
data.keys()

In [None]:
data['ONI_CIRCUITS'].shape

In [None]:
final_table = combine_final_table(data)

In [None]:
final_table.shape

In [None]:

# Apply the post-processing steps in order; each one takes the table and
# returns the table to hand to the next step.
for step in (create_final_status, drop_columns_not_used_in_ml):
    final_table = step(final_table)

In [None]:
final_table.shape

In [None]:
final_table = read_csv('final_table_before_standarization.csv')

In [None]:
# Box plot of the 'vds_vac_hose1' column on a fully transparent background,
# drawing only the outlier points individually.
transparent = 'rgba(0,0,0,0)'

fig = px.box(final_table, y='vds_vac_hose1')
fig.update_layout(
    width=500,
    height=400,
    margin={'l': 10, 'r': 10, 'b': 10, 't': 10},
    paper_bgcolor=transparent,
    plot_bgcolor=transparent,
)
fig.update_traces(marker={'color': 'darkblue'}, boxpoints='outliers')
fig.show()

In [None]:
# Write the current figure twice under the same base name: once as HTML and
# once as a PNG image.
# NOTE(review): the target is an absolute, user-specific path - confirm it
# exists on the machine running this notebook.
export_stem = r"C:\Users\dlxpmx8\Desktop\Projekt_AI\meb_process_data_analysis\not_in_repo/box_plot_with_outliers"
fig.write_html(export_stem + ".html")
fig.write_image(export_stem + ".png", format="png")

In [None]:
final_table = read_csv('final_table_before_normalization.csv')

In [None]:
# Take 200 rows for each of the two label values (0 and 1) with a fixed seed,
# then stack both samples into one frame with a fresh index.
status_column = final_table['our_final_status']
status_1_data = final_table[status_column == 0].sample(n=200, random_state=69)
status_2_data = final_table[status_column == 1].sample(n=200, random_state=69)
random_to_analyze = pd.concat([status_1_data, status_2_data], ignore_index=True)

# Columns handed to the pair plot.
col_dgm = [
    'cisnienie_koncowe',
    'nachdruck_hub',
    'anguss',
    'vds_air_pressure',
    'vds_vac_hose1',
    'vds_vac_valve1',
    'vds_vac_valve2',
]

make_and_save_pariplot(random_to_analyze, col_dgm, 'some_dgm_corr3.png')

In [None]:
final_table['our_final_status'].value_counts()

In [None]:
final_table = read_csv('final_table_before_normalization.csv')

In [None]:
final_table.corr()

In [None]:
correlation_matrix = final_table.corr()
threshold = 0.85

# Strictly-lower-triangle mask: entry (i, j) with j < i is True when the
# absolute correlation between column i and column j exceeds the threshold.
exceeds_threshold = np.tril(correlation_matrix.abs().to_numpy() > threshold, k=-1)

# A column is collected when it is that strongly correlated with at least one
# column that comes before it in the matrix.
high_corr_features = set(correlation_matrix.columns[exceeds_threshold.any(axis=1)])

In [None]:
pocz = [value for value in high_corr_features if value.startswith('flow')]
print(pocz)

In [None]:
'flow_13', 'flow_9', 'flow_2', 'flow_23'

In [None]:
print(len(high_corr_features))

In [None]:
final_table.shape

In [None]:
final_table_droped = final_table.drop(columns = high_corr_features)

In [None]:
final_table_droped.shape

In [None]:
save_df_to_csv(final_table_droped, 'final_table_droped_before_normalization.csv')

In [None]:
ml_data = split_data(final_table_droped)

# Process the subsets in a fixed order. The training features are normalised
# first, and that call also hands back the scaler; the validation and test
# features are then normalised with that same scaler. Features and labels of
# each subset are written to CSV right after the subset is normalised.
for subset in ('train', 'valid', 'test'):
    x_key = f'x_{subset}'
    y_key = f'y_{subset}'
    if subset == 'train':
        ml_data[x_key], scaler = normalize_data(ml_data[x_key])
    else:
        ml_data[x_key] = normalize_data(ml_data[x_key], scaler)
    save_df_to_csv(ml_data[x_key], f'{x_key}_droped.csv')
    save_df_to_csv(ml_data[y_key], f'{y_key}_droped.csv')

In [None]:
# Collect the labels obtained by iterating the table (presumably its column
# names) that begin with either of the two prefixes.
wanted_prefixes = ('assigment', 'working')
pocz = [label for label in final_table_droped if label.startswith(wanted_prefixes)]
print(pocz)

In [11]:
import pandas as pd
import sqlalchemy
from sqlalchemy.exc import SQLAlchemyError

from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *


def read_data_from_database1(cutoff="2023-10-01"):
    """Load every configured table and split MEB_DGM by timestamp.

    Each query in ``querys`` is run against the Oracle database and the result
    is stored under the matching name from ``dbtables``. The ``timestamp``
    column of MEB_DGM is parsed to datetimes and the table is split at
    ``cutoff``: rows at or after the cutoff go into the second returned dict,
    all other rows (sorted by timestamp) stay in the first one. Rows with a
    missing timestamp compare False against the cutoff and therefore stay in
    the first dict. Both dicts are passed through ``drop_unused_columns``.

    Args:
        cutoff: first timestamp that belongs to the second dict; anything
            accepted by ``pd.Timestamp``. Defaults to 1 October 2023.

    Returns:
        tuple: ``(data, filtered_data)`` - two dicts keyed by table name.
        ``filtered_data`` starts as a shallow copy of ``data``, so before
        ``drop_unused_columns`` runs they share every frame except
        ``'MEB_DGM'``.

    Raises:
        SQLAlchemyError: if connecting or running any query fails.
    """
    from urllib.parse import quote

    print('Reading from database')
    data = {}

    try:
        # Percent-encode the credentials so characters such as '@', '/' or ':'
        # cannot be misread as URL delimiters when SQLAlchemy parses the string.
        sqlalchemy_engine = (
            "oracle+cx_oracle://"
            + quote(username, safe="") + ":" + quote(password, safe="")
            + "@" + dbhostname + "/?service_name=" + service_name
        )
        engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
        for table, query in zip(dbtables, querys):
            data[table] = pd.read_sql(query, engine)
            print(f'Table {table} read')
    except SQLAlchemyError as e:
        # Report and re-raise: the code below needs data['MEB_DGM'], and a
        # partially loaded dict would only fail later with a less useful error.
        print(e)
        raise

    dgm = data['MEB_DGM']
    dgm['timestamp'] = pd.to_datetime(dgm['timestamp'])

    # Split with one boolean mask and its complement. This keeps the two parts
    # disjoint and complete by construction, instead of trimming "the last n
    # rows" by count (which empties the frame when n is 0 and is wrong whenever
    # the selected rows are not exactly the latest ones).
    is_recent = dgm['timestamp'] >= pd.Timestamp(cutoff)

    filtered_data = data.copy()
    filtered_data['MEB_DGM'] = dgm[is_recent]
    data['MEB_DGM'] = dgm[~is_recent].sort_values(by='timestamp')

    data = drop_unused_columns(data)
    filtered_data = drop_unused_columns(filtered_data)

    return data, filtered_data

In [12]:
# NOTE(review): read_data_from_database1 as defined in this notebook returns
# two values (data, filtered_data); this three-way unpacking presumably matches
# a different revision of that function - confirm before re-running.
data1, filtered1, amount1 = read_data_from_database1()

Reading from database
Table MEB_DGM read
Table MEB_DMC read
Table MEB_GROB read
Table MEB_KO read
Table MEB_KO_DGM read
Table MEB_KO_RODZAJ read
Table MEB_KO_STREFA read
Table MEB_KS read
Table ONI_CIRCUITS read


In [3]:
filtered1.shape[0]

69357

In [13]:
data1['MEB_DGM'].shape

(1403234, 24)

In [17]:
list(data1['MEB_DGM'].keys())

['id',
 'dmc',
 'nr_dgm',
 'status',
 'czas_fazy_1',
 'czas_fazy_2',
 'czas_fazy_3',
 'max_predkosc',
 'cisnienie_tloka',
 'cisnienie_koncowe',
 'nachdruck_hub',
 'anguss',
 'temp_pieca',
 'oni_temp_curr_f1',
 'oni_temp_fore_f1',
 'oni_temp_fore_f2',
 'vds_air_pressure',
 'vds_vac_hose1',
 'vds_vac_hose2',
 'vds_vac_tank',
 'vds_vac_valve1',
 'vds_vac_valve2',
 'czas_taktu',
 'rn']

In [14]:
amount1

(1472591, 31)

In [5]:
data1['MEB_DGM']['timestamp'].head()

0   2023-10-10 22:46:09
1   2023-10-02 02:38:51
2   2021-06-24 13:51:21
3   2023-07-17 02:13:01
4   2023-07-17 02:15:35
Name: timestamp, dtype: datetime64[ns]

In [6]:
data1['MEB_DGM'] = data1['MEB_DGM'].sort_values(by='timestamp')

In [7]:
data1['MEB_DGM']['timestamp'].head()

123512   2019-06-25 08:54:45
123513   2019-06-25 08:56:22
123514   2019-06-25 08:57:56
123515   2019-06-25 08:59:32
123516   2019-06-25 09:01:07
Name: timestamp, dtype: datetime64[ns]

In [8]:
data1['MEB_DGM'].shape

(1472580, 31)

In [9]:
data1['MEB_DGM'] = data1['MEB_DGM'].iloc[:-int(filtered1.shape[0])]

In [10]:
data1['MEB_DGM'].shape

(1403223, 31)

In [None]:
data1.value_counts()

In [None]:
filtered1.value_counts()

In [None]:
data1.keys()

In [None]:
dgm = data1['MEB_DGM']

In [None]:
dgm.keys()

In [None]:
dgm['timestamp'][0]

In [None]:
print(data)

In [None]:

filtered_data = dgm[(dgm['timestamp'].dt.month >= 10) & (dgm['timestamp'].dt.year >= 2023)]

In [None]:
print(filtered_data.head())

In [None]:
filtered_data['id'].value_counts()

In [None]:
filtered_data.shape

In [1]:
from main import *

data = read_data_from_database()

Reading from database
Table MEB_DGM read
Table MEB_DMC read
Table MEB_GROB read
Table MEB_KO read
Table MEB_KO_DGM read
Table MEB_KO_RODZAJ read
Table MEB_KO_STREFA read
Table MEB_KS read
Table ONI_CIRCUITS read


In [2]:
data.keys()

dict_keys(['MEB_DGM', 'MEB_DMC', 'MEB_GROB', 'MEB_KO', 'MEB_KO_DGM', 'MEB_KO_RODZAJ', 'MEB_KO_STREFA', 'MEB_KS', 'ONI_CIRCUITS'])

In [5]:
data['MEB_DGM'].shape

(1474007, 24)

In [7]:
# Print the shape of every loaded table, in the dict's insertion order.
for table_frame in data.values():
    print(table_frame.shape)

(1474007, 24)
(1235152, 3)
(1235165, 9)
(1212307, 5)
(36403, 8)
(13, 2)
(79, 2)
(1228978, 8)
(20449940, 8)


In [10]:
def _attach_defect_lookups(defects, zones, kinds):
    """Inner-join a defect table with the zone and kind lookup tables, dropping the lookup key each time."""
    for lookup, key in ((zones, 'nok_strefa'), (kinds, 'nok_rodzaj')):
        defects = defects.merge(lookup, left_on=key, right_on='indeks', how='inner').drop(columns=['indeks'])
    return defects


def combine_final_table1(data):
    """Join the loaded tables into one wide table built around MEB_DGM.

    The input dict and the frames stored in it are left untouched: every step
    works on new frames, so the function can be called repeatedly on the same
    ``data``.

    Args:
        data: dict of DataFrames with the keys 'MEB_DGM', 'MEB_DMC',
            'MEB_GROB', 'MEB_KO', 'MEB_KO_DGM', 'MEB_KO_RODZAJ',
            'MEB_KO_STREFA', 'MEB_KS' and 'ONI_CIRCUITS'.

    Returns:
        DataFrame: the filtered MEB_DGM rows, left-joined with MEB_KO_DGM,
        inner-joined with the pivoted ONI_CIRCUITS values and left-joined with
        the enriched MEB_DMC table. Columns present on both sides of the last
        join carry the suffixes ``_DGM`` / ``_DMC``, except the defect columns,
        where only the MEB_DMC side is kept (without suffix).
    """
    # Strip whitespace from the DMC codes used as join keys
    # (dmc in MEB_DGM, dmc_casting in MEB_DMC). assign() returns new frames.
    meb_dmc = data['MEB_DMC'].assign(dmc_casting=data['MEB_DMC']['dmc_casting'].str.strip())
    meb_dgm = data['MEB_DGM'].assign(dmc=data['MEB_DGM']['dmc'].str.strip())

    # Remove MEB_DMC rows whose DMC code contains 'WORKPIECE' (any case) ...
    meb_dmc = meb_dmc[~meb_dmc['dmc'].str.contains('WORKPIECE', case=False, na=False)]
    # ... and remove anomalies: keep only codes starting with '0MH'.
    meb_dmc = meb_dmc[meb_dmc['dmc'].str[:3] == '0MH']

    # Select the MEB+ records: nr_dgm from 8 to 10 and a 21-character DMC code.
    # Missing codes give NaN for the length and are therefore excluded.
    meb_dgm = meb_dgm[meb_dgm['nr_dgm'].between(8, 10) & (meb_dgm['dmc'].str.len() == 21)]

    # Join MEB_KO and MEB_KO_DGM with the MEB_KO_STREFA / MEB_KO_RODZAJ lookups.
    meb_ko = _attach_defect_lookups(data['MEB_KO'], data['MEB_KO_STREFA'], data['MEB_KO_RODZAJ'])
    meb_ko_dgm = _attach_defect_lookups(data['MEB_KO_DGM'], data['MEB_KO_STREFA'], data['MEB_KO_RODZAJ'])

    # Left-join MEB_DMC with MEB_KO, MEB_GROB and MEB_KS in turn; each of these
    # tables brings its own 'rn' column, which is dropped right after the join.
    for extra in (meb_ko, data['MEB_GROB'], data['MEB_KS']):
        meb_dmc = meb_dmc.merge(extra, on='id_dmc', how='left').drop(columns=['rn'])

    # Prepare ONI_CIRCUITS for joining: one row per id_dmc and one column per
    # (value, circuit_nr) pair, flattened to names such as 'flow_1'.
    oni_circuits = data['ONI_CIRCUITS'].pivot(
        index='id_dmc',
        columns='circuit_nr',
        values=['assigment', 'flow', 'set_point', 'start_delay', 'temp', 'working_mode'],
    )
    oni_circuits.columns = oni_circuits.columns.map('{0[0]}_{0[1]}'.format)
    oni_circuits = oni_circuits.reset_index()

    final_table = meb_dgm.drop(columns=['rn'])

    # Left-join MEB_DGM with MEB_KO_DGM.
    final_table = final_table.merge(meb_ko_dgm, left_on='id', right_on='id_dmc', how='left')
    final_table = final_table.drop(columns=['rn'])

    # Inner-join with the pivoted ONI_CIRCUITS table. Both sides have an
    # 'id_dmc' column here, so keep the left one under its original name.
    final_table = final_table.merge(oni_circuits, left_on='id', right_on='id_dmc', how='inner')
    final_table = final_table.drop(columns=['id_dmc_y']).rename(columns={'id_dmc_x': 'id_dmc'})

    # Left-join with the enriched MEB_DMC table on the casting DMC code.
    final_table = final_table.merge(
        meb_dmc, left_on='dmc', right_on='dmc_casting', how='left', suffixes=('_DGM', '_DMC')
    )

    # Of the defect columns present on both sides keep only the MEB_DMC ones.
    final_table = final_table.drop(
        columns=['nok_strefa_DGM', 'nok_rodzaj_DGM', 'status_ko_DGM', 'kod_pola_DGM', 'rodzaj_uszkodzenia_DGM']
    )
    final_table = final_table.rename(columns={
        'nok_strefa_DMC': 'nok_strefa',
        'nok_rodzaj_DMC': 'nok_rodzaj',
        'status_ko_DMC': 'status_ko',
        'kod_pola_DMC': 'kod_pola',
        'rodzaj_uszkodzenia_DMC': 'rodzaj_uszkodzenia',
    })

    # Drop every row whose DGM code occurs more than once in the table and that
    # found a MEB_DMC match (dmc_casting is not null).
    ambiguous = final_table['dmc_DGM'].duplicated(keep=False) & ~final_table['dmc_casting'].isna()
    final_table = final_table.drop(index=final_table[ambiguous].index)
    final_table = final_table.drop(columns=['part_status'])

    return final_table

In [11]:
print('Combine final table')
final_table = combine_final_table1(data)

Combine final table


In [12]:
final_table.shape

(730328, 215)

In [14]:
final_table.head()

Unnamed: 0,id,dmc_DGM,nr_dgm,status,czas_fazy_1,czas_fazy_2,czas_fazy_3,max_predkosc,cisnienie_tloka,cisnienie_koncowe,...,temp_hydraulics,pressure_pcf_1,pressure_pcf_2,pressure_pcf_3,cisnienie,przeciek,nrprogramu,temperaturatestu,statusszczelnosc,statusdmc
0,315793,210623155611061147233,10.0,1,2277.0,90.0,45.0,6.3,5.0,282.0,...,40.200001,101.0,104.0,90.0,1.061,0.0,1.0,31.199999,1.0,1.0
1,315795,210623155710941036846,9.0,1,2220.0,76.0,16.0,5.0,4.0,281.0,...,39.5,99.0,99.0,88.0,1.06,0.006000000052154,1.0,30.799999,1.0,1.0
2,315816,210623155811061147234,10.0,1,2277.0,91.0,44.5,6.26,5.0,282.0,...,40.200001,101.0,104.0,90.0,1.057,0.0040000001899898,1.0,31.799999,1.0,1.0
3,315817,210623155911061147235,10.0,1,2275.0,89.0,43.5,6.37,5.0,282.0,...,39.400002,102.0,100.0,91.0,1.057,0.0,1.0,30.799999,1.0,1.0
4,315794,210623160111061147236,10.0,1,2276.0,91.0,44.5,6.37,5.0,282.0,...,39.599998,99.0,99.0,88.0,1.058,0.0,1.0,31.039999,1.0,1.0


In [9]:
# Print the shape of every table currently held in the dict.
for table_frame in data.values():
    print(table_frame.shape)

(1474007, 24)
(1234382, 3)
(1235165, 9)
(1212307, 5)
(36403, 8)
(13, 2)
(79, 2)
(1228978, 8)
(20449940, 8)
