In [None]:
import pandas as pd
import sqlalchemy
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sqlalchemy.exc import SQLAlchemyError
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *
from analyze_visualisation import *

In [None]:
# Scratch check of how os.path.join glues path components together.
path = "/home"

path_parts = ("User/Desktop", "file.txt")
print(os.path.join(path, *path_parts))

In [None]:
# Load every configured table into a dict keyed by table name.
data = {}

try:
    sqlalchemy_engine = f"oracle+cx_oracle://{username}:{password}@{dbhostname}/?service_name={service_name}"
    engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
    for table, query in zip(dbtables, querys):
        data[table] = pd.read_sql(query, engine)
except SQLAlchemyError as e:
    # Best effort: the error is only reported, so `data` may be empty or
    # partially filled after this cell.
    print(e)

In [None]:
# Run the project helper over the dict of loaded tables; its result replaces `data`.
# NOTE(review): re-running this cell feeds the already-processed dict back in -
# confirm the helper tolerates that.
data = drop_unused_columns(data)

In [None]:
# Names of the tables currently held in `data`.
data.keys()

In [None]:
# (rows, columns) of one of the tables.
data['ONI_CIRCUITS'].shape

In [None]:
# Build `final_table` from the per-table frames (project helper; presumably a join - confirm).
final_table = combine_final_table(data)

In [None]:
final_table.shape

In [None]:

# Two project helpers applied in sequence, each taking and returning the table.
final_table = create_final_status(final_table)
final_table = drop_columns_not_used_in_ml(final_table)

In [None]:
# Shape after the two steps above, for comparison with the previous shape output.
final_table.shape

In [None]:
final_table = read_csv('final_table_before_standarization.csv')

In [None]:
fig = px.box(final_table, y='vds_vac_hose1')
fig.update_layout(width=500, height=400, margin=dict(l=10, r=10, b=10, t=10), paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_traces(marker=dict(color='darkblue'), boxpoints='outliers')
fig.show()

In [None]:
# Export the current figure twice (interactive HTML and static PNG) under the
# same file stem.
# NOTE(review): absolute, user-specific Windows path - only works on that machine.
box_plot_stem = r"C:\Users\dlxpmx8\Desktop\Projekt_AI\meb_process_data_analysis\not_in_repo/box_plot_with_outliers"
fig.write_html(box_plot_stem + ".html")
fig.write_image(box_plot_stem + ".png", format="png")

In [None]:
final_table = read_csv('final_table_before_normalization.csv')

In [None]:
status_1_data = final_table[final_table['our_final_status'] == 0].sample(n=200, random_state= 69)
status_2_data = final_table[final_table['our_final_status'] == 1].sample(n=200, random_state= 69)
random_to_analyze = pd.concat([status_1_data, status_2_data], ignore_index=True)

col_dgm = ['cisnienie_koncowe','nachdruck_hub', 
                'anguss','vds_air_pressure','vds_vac_hose1', 'vds_vac_valve1', 'vds_vac_valve2']

make_and_save_pariplot(random_to_analyze, col_dgm, 'some_dgm_corr3.png')

In [None]:
# Number of rows per `our_final_status` value.
final_table['our_final_status'].value_counts()

In [None]:
# Reload the table from CSV, replacing whatever `final_table` held before.
final_table = read_csv('final_table_before_normalization.csv')

In [None]:
# Pairwise correlation matrix of the table's columns (displayed only, not stored).
final_table.corr()

In [None]:
correlation_matrix = final_table.corr()
threshold = 0.85

# A column is flagged when its absolute correlation with any column that comes
# before it exceeds the threshold, so the earlier column of each correlated
# pair is the one that is kept.
high_corr_features = {
    column
    for position, column in enumerate(correlation_matrix.columns)
    if (correlation_matrix.iloc[position, :position].abs() > threshold).any()
}

In [None]:
# Flagged feature names that start with 'flow'.
pocz = [value for value in high_corr_features if value.startswith('flow')]
print(pocz)

In [None]:
# Bare tuple of names; evaluating the cell just echoes it.
# NOTE(review): looks like a pasted copy of an earlier output - confirm it is still needed.
'flow_13', 'flow_9', 'flow_2', 'flow_23'

In [None]:
# How many columns were flagged as highly correlated.
print(len(high_corr_features))

In [None]:
final_table.shape

In [None]:
# New frame without the flagged columns; `final_table` itself is left untouched.
final_table_droped = final_table.drop(columns = high_corr_features)

In [None]:
final_table_droped.shape

In [None]:
# Persist the reduced table through the project helper.
save_df_to_csv(final_table_droped, 'final_table_droped_before_normalization.csv')

In [None]:
ml_data = split_data(final_table_droped)

# Called without a scaler, normalize_data hands back the scaler as well; that
# same scaler is then passed in for the other two subsets.
# NOTE(review): presumably the scaler is fitted on x_train only - confirm in normalize_data.
ml_data['x_train'], scaler = normalize_data(ml_data['x_train'])
for train_key in ('x_train', 'y_train'):
    save_df_to_csv(ml_data[train_key], f'{train_key}_droped.csv')

for subset in ('valid', 'test'):
    x_key, y_key = f'x_{subset}', f'y_{subset}'
    ml_data[x_key] = normalize_data(ml_data[x_key], scaler)
    save_df_to_csv(ml_data[x_key], f'{x_key}_droped.csv')
    save_df_to_csv(ml_data[y_key], f'{y_key}_droped.csv')

In [None]:
# Names in the reduced table that begin with either of the two prefixes.
wanted_prefixes = ('assigment', 'working')
pocz = [column for column in final_table_droped if column.startswith(wanted_prefixes)]
print(pocz)

In [None]:
import pandas as pd
import sqlalchemy
from sqlalchemy.exc import SQLAlchemyError

from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *


def read_data_from_database1():
    """Read all configured tables and split ``MEB_DGM`` by date.

    Every ``(table, query)`` pair from ``dbtables``/``querys`` is read into a
    dict of DataFrames. ``MEB_DGM`` is then partitioned on its ``timestamp``
    column: rows dated October 2023 or later are held out, the remaining rows
    are kept and sorted by ``timestamp``.

    Returns:
        tuple: ``(data, filtered_data)`` - the results of
        ``drop_unused_columns`` for two dicts keyed by table name.
        ``data['MEB_DGM']`` holds the rows before the cut-off and
        ``filtered_data['MEB_DGM']`` the held-out rows; every other table is
        the same frame in both dicts going into ``drop_unused_columns``.

    Raises:
        KeyError: if ``MEB_DGM`` was not read, e.g. because the database read
            failed before reaching it.
    """
    print('Reading from database')
    data = {}

    try:
        sqlalchemy_engine="oracle+cx_oracle://"+username+":"+password+"@"+dbhostname+"/?service_name="+service_name
        engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
        for table, query in zip(dbtables, querys):
            data[table] = pd.read_sql(query, engine)
            print(f'Table {table} read')
    except SQLAlchemyError as e:
        # Best effort: report the failure and carry on with whatever was read.
        print(e)

    # The split below cannot work without this table; fail with a clear
    # message instead of an opaque lookup error further down.
    if 'MEB_DGM' not in data:
        raise KeyError('MEB_DGM table was not read from the database')

    dgm = data['MEB_DGM']
    dgm['timestamp'] = pd.to_datetime(dgm['timestamp'])

    # Hold out everything from October 2023 onwards. Comparing year and month
    # together keeps January-September of later years in the held-out set,
    # which a plain "month >= 10 and year >= 2023" test would miss.
    year = dgm['timestamp'].dt.year
    month = dgm['timestamp'].dt.month
    held_out = (year > 2023) | ((year == 2023) & (month >= 10))

    # Split with the mask rather than by slicing off "the last n rows": the
    # two parts are then always complementary, even when nothing is held out
    # or when rows without a timestamp sort to the end.
    filtered_data = data.copy()
    filtered_data['MEB_DGM'] = dgm[held_out]
    data['MEB_DGM'] = dgm[~held_out].sort_values(by='timestamp')

    #data.drop(filtered_data, inplace= True)
    # NOTE(review): the two dicts share every frame except MEB_DGM; if
    # drop_unused_columns modifies frames in place, the second call sees
    # already-processed frames - confirm that is safe.
    data = drop_unused_columns(data)
    filtered_data = drop_unused_columns(filtered_data)

    return data, filtered_data

In [None]:
data1, filtered1, amount1 = read_data_from_database1()

In [None]:
filtered1.shape[0]

In [None]:
data1['MEB_DGM'].shape

In [None]:
list(data1['MEB_DGM'].keys())

In [None]:
amount1

In [None]:
data1['MEB_DGM']['timestamp'].head()

In [None]:
data1['MEB_DGM'] = data1['MEB_DGM'].sort_values(by='timestamp')

In [None]:
data1['MEB_DGM']['timestamp'].head()

In [None]:
data1['MEB_DGM'].shape

In [None]:
data1['MEB_DGM'] = data1['MEB_DGM'].iloc[:-int(filtered1.shape[0])]

In [None]:
data1['MEB_DGM'].shape

In [None]:
data1.value_counts()

In [None]:
filtered1.value_counts()

In [None]:
data1.keys()

In [None]:
dgm = data1['MEB_DGM']

In [None]:
dgm.keys()

In [None]:
dgm['timestamp'][0]

In [None]:
print(data)

In [None]:

filtered_data = dgm[(dgm['timestamp'].dt.month >= 10) & (dgm['timestamp'].dt.year >= 2023)]

In [None]:
print(filtered_data.head())

In [None]:
filtered_data['id'].value_counts()

In [None]:
filtered_data.shape

In [None]:
from main import *

data = read_data_from_database()

In [None]:
data.keys()

In [None]:
data['MEB_DGM'].shape

In [None]:
for lol in data.values():
    print(lol.shape)

In [None]:
print('Combine final table')
final_table = combine_final_table1(data)

In [None]:
final_table.shape

In [None]:
final_table.head()

In [None]:
for lol in data.values():
    print(lol.shape)

In [None]:
from main import *

data = read_data_from_database()

print('Combine final table')
final_table = combine_final_table(data)

print('Create final status')
final_table = create_final_status(final_table)

print('Drop columns not used in ml')
final_table = drop_columns_not_used_in_ml(final_table)

print('Categorize data')
final_table = categorize_data(final_table)

#print('Drop columns with too much correlation')
#final_table = drop_columns_with_too_much_corr(final_table)

In [None]:
print(list(final_table.keys()))



In [None]:
final_table['data_odlania']

In [None]:
filtered_data = final_table['data_odlania'][(final_table['data_odlania'].dt.month >= 10) & (final_table['data_odlania'].dt.year >= 2023)]

In [None]:
filtered_data.shape[0]

In [None]:
final_table = final_table.iloc[:-int(filtered_data.shape[0])]

In [None]:
dropped_data = final_table.iloc[-int(filtered_data.shape[0]):]

In [None]:
dropped_data

In [None]:
dropped_data['our_final_status'].value_counts()

In [None]:
print(336/9889)

In [None]:
from main import *

In [None]:
data2 = load_csv('final_table_before_normalization.csv')
data = load_csv('test_data_from_october.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data2.shape

In [None]:
list1 = list(data.keys())
list2 = list(data2.keys())

In [None]:
list1