In [1]:
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy.exc import SQLAlchemyError
from xgboost import XGBClassifier

from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *
from analyze_visualisation import *
from decision_tree import *

In [2]:
# Load every configured table from the Oracle database into `data`,
# keyed by table name, then let `drop_unused_columns` prune the result.
data = {}

engine = None
try:
    # Percent-encode the credentials: characters such as '@', '/' or ':' in a
    # password would otherwise be parsed as URL delimiters and break the login.
    sqlalchemy_engine = (
        "oracle+cx_oracle://"
        + quote_plus(username) + ":" + quote_plus(password)
        + "@" + dbhostname
        + "/?service_name=" + service_name
    )
    engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
    # `dbtables` and `querys` are parallel sequences: one query per table name.
    for table, query in zip(dbtables, querys):
        data[table] = pd.read_sql(query, engine)
except SQLAlchemyError as e:
    print(e)
    # Stop here instead of continuing with an empty or partially filled
    # `data` dict, which would only fail later with a less helpful error.
    raise
finally:
    # Release pooled connections once all tables are in memory
    # (also on failure).
    if engine is not None:
        engine.dispose()

data = drop_unused_columns(data)

In [3]:
# Build the modelling table in three stages, innermost call first:
# combine the loaded tables, derive the final status, then drop the
# columns that are not used for machine learning.
# The recorded run of this cell printed the `our_final_status` value counts
# (662621 rows with 1.0 vs 27448 rows with 2.0), i.e. the classes are
# heavily imbalanced.
final_table = drop_columns_not_used_in_ml(
    create_final_status(
        combine_final_table(data)
    )
)

our_final_status
1.0    662621
2.0     27448
Name: count, dtype: int64


In [4]:
# Encode the table and remember which columns were categorized;
# `categorize_data` returns the updated table together with that column list.
final_table, categorized_columns = categorize_data(final_table)
# Checkpoint the table as it is before standardization.
save_df_to_csv(final_table, 'final_table_before_standarization.csv')
# presumably rescales the numeric features - verify in table_functions
final_table = standarize_data(final_table)
# Checkpoint the standardized table; the next cell reloads this exact file.
save_df_to_csv(final_table, 'final_table_before_normalization.csv')

In [13]:
# Reload the standardized table from the checkpoint written by the previous
# cell, so the database load and preprocessing can be skipped.
# NOTE(review): `read_csv` is a bare name here, not `pd.read_csv`; presumably
# it comes from one of the star imports - confirm which module provides it.
# NOTE(review): the recorded execution count (13) is out of sequence, so this
# cell was run out of order; `categorized_columns` is NOT restored from the
# CSV and must still exist in the kernel for the following cells to work.
final_table = read_csv('final_table_before_normalization.csv')

In [None]:
# Remove the last entry from `categorized_columns` in place and display it.
# NOTE(review): this cell is not idempotent - every extra run removes one more
# column name. It has no recorded execution count; confirm which column is
# meant to be excluded and whether the cell is still required.
categorized_columns.pop()

In [8]:
# Split the table into train/validation/test parts; `ml_data` is indexed by
# keys such as 'x_train' and 'y_train'.
ml_data = split_data(final_table)
# Remove the columns listed in `categorized_columns` from every feature frame.
# Assign the result back instead of using `inplace=True`: the frames come from
# a helper and may be views/slices of `final_table`, where in-place drops can
# trigger SettingWithCopyWarning or modify an object shared elsewhere.
for name in ['x_train', 'x_valid', 'x_test']:
    ml_data[name] = ml_data[name].drop(columns=categorized_columns)

In [9]:
# Rebind `categorized_columns` to the names of all training-feature columns
# from position 130 onwards.
# NOTE(review): 130 is a hard-coded offset; presumably it marks where the
# one-hot encoded columns start - confirm, since it silently goes stale
# whenever the column layout changes.
categorized_columns = list(ml_data['x_train'].columns[130:])

In [12]:
# Display the training features for a visual sanity check (pandas truncates
# the rendering of large frames).
ml_data['x_train']

Unnamed: 0,czas_fazy_1,czas_fazy_2,czas_fazy_3,max_predkosc,cisnienie_tloka,cisnienie_koncowe,nachdruck_hub,anguss,oni_temp_curr_f1,oni_temp_fore_f1,...,working_mode_22_3.0,working_mode_23_2.0,working_mode_24_2.0,working_mode_25_1.0,working_mode_25_2.0,working_mode_25_3.0,working_mode_26_1.0,working_mode_26_2.0,working_mode_27_2.0,working_mode_27_3.0
55193,0.792088,-0.087790,0.390335,0.951309,0.884654,0.403559,0.663205,-0.458520,0.633768,0.956407,...,0,1,1,0,0,0,0,1,0,1
599943,-0.573734,0.143032,-0.223749,1.268504,-0.529320,1.409123,0.094591,0.267574,-0.184722,0.034896,...,0,1,1,0,0,0,0,1,0,0
56438,0.563658,-0.080795,0.405881,0.998301,0.570438,0.494974,0.663205,-0.318887,0.561970,0.807776,...,0,1,1,0,0,0,0,1,0,1
506635,-0.497590,-0.115768,-0.044965,0.140701,0.256221,-0.190638,-0.273336,-0.235106,-0.012408,-0.113735,...,0,1,1,0,0,0,0,1,1,0
425660,0.406612,-0.073801,-0.169337,-1.034093,-0.372212,-0.282053,-0.239888,-0.235106,-0.026768,-0.202913,...,0,1,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71670,1.320333,-0.157736,-0.107151,-1.034093,0.256221,0.494974,-0.273336,-0.123400,-0.773460,-1.258192,...,0,1,1,0,0,1,0,1,1,0
244104,0.125834,-0.066806,0.436974,0.692855,-0.529320,0.037899,-0.005753,-0.430593,-0.902696,-0.767710,...,0,1,1,0,0,0,0,1,0,0
533439,-0.040730,-0.115768,-0.122698,0.140701,-0.215104,0.266437,-0.273336,-0.039620,-0.112925,-0.277229,...,0,1,1,0,0,0,0,1,1,0
286455,1.196600,-0.157736,-0.138244,-1.034093,0.099113,0.860633,-0.273336,-0.179253,0.518892,0.644282,...,0,1,1,0,0,0,0,1,1,0


In [10]:
# Normalize the three feature splits with one shared scaler and write every
# split (features and labels) to CSV.
#
# The recorded run of this cell failed with
#   "ValueError: The feature names should match those that were passed during
#    fit. Feature names unseen at fit time: working_mode_27_3.0".
# NOTE(review): the root cause is not visible from this notebook. The check
# below rules out (and clearly reports) one possible cause - splits whose
# columns differ from the training split - before any scaler is fitted.
expected_columns = list(ml_data['x_train'].columns)
for split in ('valid', 'test'):
    split_columns = list(ml_data[f'x_{split}'].columns)
    if split_columns != expected_columns:
        differing = sorted(set(split_columns) ^ set(expected_columns), key=str)
        raise ValueError(
            f"x_{split} columns differ from x_train "
            f"(columns not shared by both: {differing}; "
            "if empty, only the column order differs)"
        )

# Called without a scaler, `normalize_data` returns the normalized frame
# together with the scaler it used; that scaler is reused for the other splits.
ml_data['x_train'], scaler = normalize_data(ml_data['x_train'], categorized_columns)
save_df_to_csv(ml_data['x_train'], 'x_train.csv')
save_df_to_csv(ml_data['y_train'], 'y_train.csv')

# Validation and test are normalized with the training scaler.
# NOTE(review): this cell overwrites its own inputs, so re-running it without
# re-running the split cell normalizes already-normalized data.
for split in ('valid', 'test'):
    ml_data[f'x_{split}'] = normalize_data(ml_data[f'x_{split}'], categorized_columns, scaler)
    save_df_to_csv(ml_data[f'x_{split}'], f'x_{split}.csv')
    save_df_to_csv(ml_data[f'y_{split}'], f'y_{split}.csv')

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- working_mode_27_3.0


In [None]:
# Run the project's analysis helper on the full table.
# presumably produces exploratory plots/statistics - see analyze_visualisation
analyze_data(final_table)

In [None]:
# Split the table into two frames via `distinct_machine`.
# NOTE(review): judging by the names these are presumably the rows of
# machine 9 and machine 10 - confirm; neither result is used in the cells
# visible below.
final_table_9, final_table_10 = distinct_machine(final_table)

In [43]:
# decision trees

# Fit the project's decision-tree model on the training split and report its
# statistics on the test split.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Shape of passed values is (2, 2), indices imply (1, 1)";
# which of the two helper calls raised it is not visible from the notebook.
clf = create_decision_tree_model(ml_data['x_train'], ml_data['y_train'])
print_decision_tree_stats(clf, ml_data['x_test'], ml_data['y_test'])

ValueError: Shape of passed values is (2, 2), indices imply (1, 1)

In [39]:
# random forest

# Train a random forest on the training split and evaluate it on the test
# split. `random_state` is fixed because bootstrapping and per-split feature
# sampling are random: without a seed every run yields a different model and
# the reported metrics cannot be reproduced.
clf = RandomForestClassifier(
    bootstrap=True,
    max_depth=100,
    max_features=3,
    min_samples_leaf=4,
    min_samples_split=8,
    n_estimators=300,
    random_state=42,
)
clf.fit(ml_data['x_train'], ml_data['y_train'])
y_pred = clf.predict(ml_data['x_test'])
# Per-class precision/recall matters more than accuracy here: in the recorded
# run the minority class reached only 0.26 recall despite 0.96 accuracy.
print(classification_report(ml_data['y_test'], y_pred))
print(confusion_matrix(ml_data['y_test'], y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     66251
           1       0.41      0.26      0.32      2745

    accuracy                           0.96     68996
   macro avg       0.69      0.62      0.65     68996
weighted avg       0.95      0.96      0.95     68996

[[65218  1033]
 [ 2036   709]]


In [40]:
# XGBoost

# Gradient-boosted trees with the library's default hyperparameters: fit on
# the training split, predict the test split, then print the classification
# report followed by the confusion matrix.
clf = XGBClassifier().fit(ml_data['x_train'], ml_data['y_train'])
y_pred = clf.predict(ml_data['x_test'])
print(
    classification_report(ml_data['y_test'], y_pred),
    confusion_matrix(ml_data['y_test'], y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.98      0.89      0.93     66251
           1       0.16      0.51      0.24      2745

    accuracy                           0.87     68996
   macro avg       0.57      0.70      0.59     68996
weighted avg       0.94      0.87      0.90     68996

[[58748  7503]
 [ 1346  1399]]
