In [None]:
import pandas as pd
import numpy as np
import sqlalchemy
import seaborn as sns
from sqlalchemy.exc import SQLAlchemyError
from ml_functions import *
from sklearn.metrics import accuracy_score, recall_score
from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *
import mlflow
import mlflow.pyfunc
import xgboost as xgb
from main import *

# Lift pandas' display truncation: with both limits set to None every column and
# every row of a displayed frame/Series is rendered in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Function to build the whole combined table

In [None]:
def _resolve_ko_codes(ko_table, strefa, rodzaj):
    """Attach the zone and defect-type lookup columns to a KO table.

    Both joins are inner joins on the lookup tables' `indeks` column, so KO rows
    whose `nok_strefa` or `nok_rodzaj` code has no lookup entry are dropped.
    The helper `indeks` column is removed after each join.
    """
    with_zone = ko_table.merge(strefa, left_on='nok_strefa', right_on='indeks', how='inner').drop(columns=['indeks'])
    return with_zone.merge(rodzaj, left_on='nok_rodzaj', right_on='indeks', how='inner').drop(columns=['indeks'])


def combine_final_whole_table(data_, dgm_smallest = 8, dgm_biggest = 10):
    """Join the raw process tables into one row-per-casting table.

    The input frames are never modified: every step builds a new frame, so the
    function can be called repeatedly on the same `data_`.

    Args:
        data_: mapping of table name -> DataFrame. The keys read here are
            'MEB_DGM', 'MEB_DMC', 'MEB_KO', 'MEB_KO_DGM', 'MEB_KO_STREFA',
            'MEB_KO_RODZAJ', 'MEB_GROB', 'MEB_KS' and 'ONI_CIRCUITS'.
        dgm_smallest: lowest `nr_dgm` to keep (inclusive).
        dgm_biggest: highest `nr_dgm` to keep (inclusive).

    Returns:
        DataFrame based on the filtered MEB_DGM rows, extended with the KO_DGM
        result, the pivoted ONI circuit readings and the matching MEB_DMC row
        (columns present on both sides get the `_DGM` / `_DMC` suffix).

    Raises:
        KeyError: if a required table or column is missing.
        ValueError: from `pivot` if ONI_CIRCUITS holds the same
            (`id_dmc`, `circuit_nr`) pair more than once.
    """
    # Strip surrounding whitespace from the codes used as join keys.
    meb_dmc = data_['MEB_DMC'].assign(dmc_casting=data_['MEB_DMC']['dmc_casting'].str.strip())
    meb_dgm = data_['MEB_DGM'].assign(dmc=data_['MEB_DGM']['dmc'].str.strip())

    # Drop MEB_DMC rows whose DMC code contains 'WORKPIECE' (e.g. 'WORKPIECE NIO').
    meb_dmc = meb_dmc[~meb_dmc['dmc'].str.contains('WORKPIECE', case=False, na=False)]

    # Select the MEB+ records: requested die-casting machines and 21-character codes.
    meb_dgm = meb_dgm[(meb_dgm['nr_dgm'].between(dgm_smallest, dgm_biggest)) & (meb_dgm['dmc'].apply(lambda x: len(str(x)) == 21))]

    # Remove anomalies from MEB_DMC: only codes starting with '0MH' are kept.
    meb_dmc = meb_dmc[meb_dmc['dmc'].str[:3] == '0MH']

    # Resolve zone / defect-type codes for both KO tables.
    meb_ko = _resolve_ko_codes(data_['MEB_KO'], data_['MEB_KO_STREFA'], data_['MEB_KO_RODZAJ'])
    meb_ko_dgm = _resolve_ko_codes(data_['MEB_KO_DGM'], data_['MEB_KO_STREFA'], data_['MEB_KO_RODZAJ'])

    # Left-join MEB_KO, MEB_GROB and MEB_KS onto MEB_DMC; each brings an `rn`
    # column that is dropped right after its join.
    for station_table in (meb_ko, data_['MEB_GROB'], data_['MEB_KS']):
        meb_dmc = meb_dmc.merge(station_table, on='id_dmc', how='left').drop(columns=['rn'])

    # Pivot ONI_CIRCUITS to one row per casting with columns such as `flow_1`, `temp_2`.
    oni_circuits = (
        data_['ONI_CIRCUITS']
        .drop(columns=['assigment', 'working_mode', 'set_point'])
        .pivot(index='id_dmc', columns='circuit_nr', values=['flow', 'start_delay', 'temp'])
    )
    oni_circuits.columns = oni_circuits.columns.map('{0[0]}_{0[1]}'.format)
    oni_circuits = oni_circuits.reset_index()

    final_table = meb_dgm.drop(columns=['rn'])

    # Join MEB_DGM with MEB_KO_DGM.
    final_table = final_table.merge(meb_ko_dgm, left_on='id', right_on='id_dmc', how='left').drop(columns=['rn'])

    # Join with the pivoted ONI circuits; the inner join drops castings without circuit data.
    final_table = final_table.merge(oni_circuits, left_on='id', right_on='id_dmc', how='inner')
    final_table = final_table.drop(columns=['id_dmc_y']).rename(columns={'id_dmc_x': 'id_dmc'})

    # Join with MEB_DMC through the casting code.
    final_table = final_table.merge(meb_dmc, left_on='dmc', right_on='dmc_casting', how='left', suffixes=('_DGM', '_DMC'))

    # A DGM code that occurs more than once and matched a MEB_DMC row is ambiguous:
    # all such rows are removed (unmatched rows are kept).
    ambiguous = final_table['dmc_DGM'].duplicated(keep=False) & final_table['dmc_casting'].notna()
    final_table = final_table[~ambiguous].drop(columns=['part_status'])

    return final_table

In [None]:
# Read the raw tables and combine them, keeping only nr_dgm 9..10.
data = read_data_from_database()
whole_table = combine_final_whole_table(data, 9, 10)

In [None]:
# Persist the combined table to CSV so later cells can reload it.
print('Save whole table')
save_df_to_csv(whole_table, 'final_whole_table.csv')

## Operations on table

In [None]:
# Reload the combined table from the CSV written above.
whole_table = load_csv('final_whole_table.csv')

In [None]:
# Preview the first rows of the reloaded table.
whole_table.head()

In [None]:
# Parse the casting date and keep rows cast in 2023 or later.
whole_table['data_odlania'] = pd.to_datetime(whole_table['data_odlania'])
filtered_data = whole_table[whole_table['data_odlania'].dt.year >= 2023]

In [None]:
# Preview the rows cast in 2023 or later.
filtered_data.head()

In [None]:
# Frequency of each value in the `status` column.
whole_table['status'].value_counts()

In [None]:
# Frequency of each KO status coming from the MEB_KO_DGM side of the join.
whole_table['status_ko_DGM'].value_counts()

In [None]:
# KO status == 2 means bad; it links to nok_rodzaj, which maps by index to rodzaj_uszkodzenia.
whole_table['status_ko_DMC'].value_counts()

In [None]:
# Frequency of each defect description coming from the MEB_DMC side of the join.
whole_table['rodzaj_uszkodzenia_DMC'].value_counts()

In [None]:
whole_table['nok_rodzaj_DMC'].value_counts() # codes are looked up in MEB_KO_RODZAJ by index
# Values to keep: 0, 102, 201, 103, 101

In [None]:
# Frequency of each value in the `statusdmc` column.
whole_table['statusdmc'].value_counts()

In [None]:
# NOTE(review): same expression as an earlier cell in this section — likely a leftover duplicate.
whole_table['status_ko_DMC'].value_counts()

In [None]:
# Frequency of each value in the `statusszczelnosc` column.
whole_table['statusszczelnosc'].value_counts()

In [None]:
# NOTE(review): same expression as an earlier cell in this section — likely a leftover duplicate.
whole_table['status'].value_counts()

In [None]:
# Keep castings from DGM 9 and 10 that were cast before 2024.
whole_table['data_odlania'] = pd.to_datetime(whole_table['data_odlania'])
filtered_table = whole_table[(whole_table['nr_dgm'].isin([9, 10])) &
                             (whole_table['data_odlania'].dt.year < 2024)
                             ]

print(f"Number of rows after initial filtering: {len(filtered_table)}")
print(filtered_table['status'].value_counts())
print('############')
# Keep only statuses 0, 1, 3 and 14. The explicit copy makes the recoding below
# write into an independent frame instead of a slice of `whole_table`
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
filtered_table = filtered_table[filtered_table['status'].isin([0,1,3,14])].copy()
print(filtered_table['status'].value_counts())
print('############')
# Statuses 3 and 14 are merged into the single value 2.
filtered_table['status'] = filtered_table['status'].replace([3, 14], 2)
print(filtered_table['status'].value_counts())
print(f"Number of rows after status filtering: {len(filtered_table)}")


result_table = filtered_table[['data_odlania', 'status', 'status_ko_DGM', 'status_ko_DMC', 'rodzaj_uszkodzenia_DMC', 'nok_rodzaj_DMC']]

In [None]:
# Preview the status-related columns of the filtered table.
# Alternative view kept for reference: last 100 rows with status '2'.
# result_table[result_table['status'] == '2'].tail(100)
result_table.head(10)

In [None]:
# Preview the first rows of the full table.
whole_table.head()

In [None]:
# Last 100 rows of the full table, restricted to the date and status-related columns.
whole_table[['data_odlania','status','status_ko_DGM', 'status_ko_DMC', 'rodzaj_uszkodzenia_DMC','nok_rodzaj_DMC']].tail(100)

## Testing the values of OK and NOK parts

In [None]:
# Reload the combined table and narrow it down to DGM 10.
whole_table = load_csv('final_whole_table.csv')
whole_table['data_odlania'] = pd.to_datetime(whole_table['data_odlania'])
check_df = whole_table[whole_table['nr_dgm'] == 10]
# NOTE(review): the month and year conditions are independent, so this keeps months
# 10-12 of every year >= 2023 (e.g. January-September 2024 would be excluded).
# Confirm this is intended rather than "cast on or after 2023-10-01".
check_df = check_df[(check_df['data_odlania'].dt.month >= 10) & (check_df['data_odlania'].dt.year >= 2023)]

In [None]:
def create_final_status_check(final_table):
    
    final_table['status'] = final_table['status'].astype(int)
    final_table['nok_rodzaj_DMC'] = final_table['nok_rodzaj_DMC'].fillna(0)
    #final_table['nok_rodzaj'] = final_table['nok_rodzaj'].astype(int)
    
    print(final_table['status'].value_counts())
    final_table = final_table[final_table['status'].isin([0,1,3,14])]
    final_table['status'] = final_table['status'].replace([3, 14], 2)

    print('####################')
    print('status counts')
    print(final_table['status'].value_counts())

    final_table = final_table.loc[~final_table['status_ko_DMC'].isin([0, 106])] # KO
    final_table = final_table.loc[~final_table['statusszczelnosc'].isin([0, 3])]
    final_table = final_table.loc[~final_table['statusdmc'].isin([0,2])]
    print(f'Number of NOK parts on DGM: {final_table["status"].isin([2]).sum()}')
    print(f'Number of NOK parts of DGM on KO: {final_table["nok_rodzaj_DMC"].isin([102, 201, 103, 101]).sum()}')

    print('####################')
    print('nok_rodzaj counts')
    print(final_table['nok_rodzaj_DMC'].value_counts())

    final_table = final_table.loc[final_table['nok_rodzaj_DMC'].isin([0, 102, 201, 103, 101])]
    final_table['nok_rodzaj_DMC'] = final_table['nok_rodzaj_DMC'].replace([102, 201, 103, 101], 2)
    final_table['nok_rodzaj_DMC'] = final_table['nok_rodzaj_DMC'].replace([0], 1)

    print('####################')
    print('nok_rodzaj_DMC counts')
    print(final_table['nok_rodzaj_DMC'].value_counts())

    print('####################')
    print('status counts 2')
    print(final_table['status'].value_counts())
    
    final_table['our_final_status'] = final_table.apply(lambda row: max(row['status'], row['nok_rodzaj_DMC'], row['statusszczelnosc'], row['statusdmc']), axis=1)
    print(f"Final number of NOK parts: {final_table['our_final_status'].value_counts()}")

    return final_table

In [None]:
# Replace check_df with its filtered version that carries `our_final_status`.
check_df = create_final_status_check(check_df)

In [None]:
# Show the derived final status next to the source status columns.
check_df[['id','data_odlania','our_final_status','status','status_ko_DGM', 'status_ko_DMC', 'rodzaj_uszkodzenia_DMC','nok_rodzaj_DMC']]

## Analiza danych

In [7]:
# Frequency of each defect description among the checked castings.
check_df['rodzaj_uszkodzenia_DMC'].value_counts()

rodzaj_uszkodzenia_DMC
Brak                          4363
Odlewnicze - porowatości        50
Odlewnicze - brak naddatku      32
Odlewnicze - wtrącenia           3
Mechaniczne - odlewnicze         3
Name: count, dtype: int64

## Dane dla porowatości wybranych detali

In [None]:
def make_and_save_pariplot_check(df_to_plot ,columns_to_analyze, file_name,
                                 output_dir=r"C:\Users\DLXPMX8\Desktop\Projekt_AI\meb_process_data_analysis\src\plots"):
    """Draw a seaborn pairplot coloured by `our_final_status` and save it to disk.

    Args:
        df_to_plot: DataFrame containing `columns_to_analyze` and `our_final_status`.
        columns_to_analyze: names of the columns to plot against each other.
        file_name: name of the image file written inside `output_dir`.
        output_dir: target directory; created if it does not exist. Defaults to
            the original hard-coded location so existing calls keep working.

    Returns:
        None. The figure is left open (not closed here).
    """
    hue_column = 'our_final_status'
    # The hue column is appended explicitly; remove it from the requested columns
    # first so it is never selected twice when the caller already lists it.
    plotted_columns = [column for column in columns_to_analyze if column != hue_column] + [hue_column]

    plot = sns.pairplot(data=df_to_plot[plotted_columns], hue=hue_column, palette={ 1 : "green", 2:"red"}, plot_kws={"s": 3})
    # NOTE(review): plt.legend acts on the current axes, not on the figure-level
    # legend seaborn creates for `hue` — confirm this gives the intended legend title.
    plt.legend(title='status końcowy')

    os.makedirs(output_dir, exist_ok=True)
    plot.savefig(os.path.join(output_dir, file_name))

In [None]:
# Load two previously saved final tables; later cells use the v1 column list
# to choose which columns are plotted.
data_october_10_10_v2 = load_csv('final_table_10_10_v2_2021.csv')
data_october_10_10_v1 = load_csv('final_table_10_10_v1_2021.csv')

In [None]:
# Select the castings listed in `id_list` together with their two neighbouring
# rows on each side (by row position in check_df).
id_list = [1411700,1411776,1411795,1411841,1415944,1411935]

# After the reset the index label of a row equals its position, so the labels
# found below can be used directly with `.iloc`.
check_df = check_df.reset_index(drop=True)

filtered_df_test = check_df[check_df['id'].isin(id_list)]
selected_indices = filtered_df_test.index.tolist()

neighbour_window = 2
last_position = check_df.shape[0] - 1

# Collect every position in [idx - window, idx + window], clamped to the frame,
# de-duplicated and sorted so that result_df keeps the original row order
# (a plain list(set(...)) would leave the order unspecified).
final_indices = sorted({
    position
    for idx in selected_indices
    for position in range(max(0, idx - neighbour_window), min(last_position, idx + neighbour_window) + 1)
})

result_df = check_df.iloc[final_indices]


In [None]:
# Distribution of the final status among the selected castings and their neighbours.
result_df['our_final_status'].value_counts()

In [None]:
# Keep only the columns that also exist in the v1 reference table.
columns_needed = list(data_october_10_10_v1.columns)
columns_to_drop = result_df.columns.difference(columns_needed)
tab_to_plot = result_df.drop(columns=columns_to_drop)

In [None]:
# Split the columns into 10 groups and save one pairplot per group.
column_groups = np.array_split(list(tab_to_plot.columns), 10)

for i, group in enumerate(column_groups):
    make_and_save_pariplot_check(tab_to_plot, list(group), f'plot_of_group_{i}')

## Wszystkie ze statusem porowatości

In [None]:
# Build a balanced sample: every casting flagged with porosity plus the same
# number of randomly drawn castings whose final status is 1.
porowatosc_df = check_df[check_df['rodzaj_uszkodzenia_DMC'] == "Odlewnicze - porowatości"]
porowatosc_shape = porowatosc_df.shape[0]

ok_candidates = check_df[check_df['our_final_status'] == 1]
# Fixed seed so Restart & Run All reproduces the same sample; the size is clamped
# because `sample` raises when asked for more rows than are available.
ok_df = ok_candidates.sample(min(porowatosc_shape, len(ok_candidates)), random_state=42)

porowatosc_and_ok = pd.concat([porowatosc_df, ok_df], ignore_index=True)

In [None]:
# Keep only the columns that also exist in the v1 reference table.
columns_needed = list(data_october_10_10_v1.columns)
columns_to_drop = porowatosc_and_ok.columns.difference(columns_needed)
porowatosc_and_ok = porowatosc_and_ok.drop(columns=columns_to_drop)

In [None]:
# Split the columns into 10 groups and save one pairplot per group.
column_groups = np.array_split(list(porowatosc_and_ok.columns), 10)

for i, group in enumerate(column_groups):
    make_and_save_pariplot_check(porowatosc_and_ok, list(group), f'plot_porowatosc_group_{i}')