In [None]:
import pandas as pd
import sqlalchemy
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sqlalchemy.exc import SQLAlchemyError
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *
from analyze_visualisation import *
from decision_tree import *

In [None]:
# Load every configured table from the Oracle database into `data`
# (table name -> DataFrame), then drop the columns the project does not use.
data = {}

engine = None
try:
    # NOTE(review): the credentials are concatenated into the URL verbatim; a
    # password containing characters such as '@' or '/' would presumably need
    # URL-escaping first - confirm how db_queries stores it.
    sqlalchemy_engine = (
        "oracle+cx_oracle://" + username + ":" + password
        + "@" + dbhostname + "/?service_name=" + service_name
    )
    engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
    # dbtables and querys are consumed pairwise: one query per table name.
    for table, query in zip(dbtables, querys):
        data[table] = pd.read_sql(query, engine)
except SQLAlchemyError as e:
    # Show the database error, then stop: continuing would feed an empty or
    # partially filled `data` dict into the rest of the notebook.
    print(e)
    raise
finally:
    # Release the pooled connections whether or not the load succeeded.
    if engine is not None:
        engine.dispose()

data = drop_unused_columns(data)

In [None]:
# Build the modelling table: combine the loaded tables, then pass the result
# through each remaining preparation stage in order.
final_table = combine_final_table(data)
for pipeline_step in (create_final_status, drop_columns_not_used_in_ml):
    final_table = pipeline_step(final_table)

In [None]:
# categorize_data returns the transformed table together with a second value
# that is kept as `categorized_columns` and reused by later cells.
final_table, categorized_columns = categorize_data(final_table)
# Snapshot of the table as it is before standarize_data is applied.
save_df_to_csv(final_table, 'final_table_before_standarization.csv')
final_table = standarize_data(final_table)
# Snapshot of the standardized table; a later cell reads this file back.
save_df_to_csv(final_table, 'final_table_before_normalization.csv')

In [None]:
# Reload the table from the snapshot file written by the standardization cell.
# NOTE(review): `read_csv` is not pd.read_csv; it comes from one of the star
# imports (presumably table_functions) - confirm where it reads from.
final_table = read_csv('final_table_before_normalization.csv')

In [None]:
# Remove one entry from categorized_columns in place (the last one, assuming
# it is a list); the removed value is shown as the cell output.
# NOTE(review): not idempotent - every re-run of this cell removes one more
# entry, so run it exactly once after categorized_columns is created.
categorized_columns.pop()

In [None]:
# Split the table into the train/validation/test parts; the result is indexed
# by keys such as 'x_train', 'x_valid' and 'x_test'.
ml_data = split_data(final_table)
# Remove the categorized columns from each feature frame. The frame is
# reassigned rather than dropped in place so that a split which is a slice of
# final_table is never mutated (and no SettingWithCopyWarning is triggered).
for name in ['x_train', 'x_valid', 'x_test']:
    ml_data[name] = ml_data[name].drop(columns=categorized_columns)

In [None]:
# Rebuild categorized_columns from the training features: every column from
# position 130 onward is taken.
# NOTE(review): 130 is a hard-coded position tied to the current column
# layout - confirm it still matches whenever the feature set changes.
categorized_columns = list(ml_data['x_train'].columns[130:])

In [None]:
# Display the training feature frame for a visual check before normalization.
ml_data['x_train']

In [None]:
# Normalize the training features first; without a scaler argument
# normalize_data returns two values, the second of which is kept as `scaler`.
ml_data['x_train'], scaler = normalize_data(ml_data['x_train'], categorized_columns)
save_df_to_csv(ml_data['x_train'], 'x_train.csv')
save_df_to_csv(ml_data['y_train'], 'y_train.csv')

# The remaining splits reuse that scaler and are written out the same way.
# NOTE(review): with a scaler passed in, the result is assigned directly, so
# normalize_data presumably returns only the frame in that case - confirm.
held_out_splits = (
    ('x_valid', 'x_valid.csv', 'y_valid', 'y_valid.csv'),
    ('x_test', 'x_test.csv', 'y_test', 'y_test.csv'),
)
for features_key, features_file, target_key, target_file in held_out_splits:
    ml_data[features_key] = normalize_data(ml_data[features_key], categorized_columns, scaler)
    save_df_to_csv(ml_data[features_key], features_file)
    save_df_to_csv(ml_data[target_key], target_file)

In [None]:
# Run the project's analysis helper on the full table (not on the splits).
# NOTE(review): analyze_data comes from a star import, presumably
# analyze_visualisation - confirm what it plots or prints.
analyze_data(final_table)

In [None]:
# distinct_machine returns two objects derived from the full table.
# NOTE(review): the names suggest one subset per machine (9 and 10) - confirm.
# Neither result is used by the later cells visible in this notebook.
final_table_9, final_table_10 = distinct_machine(final_table)

In [None]:
# Decision trees

# Build the model from the training split using the project helper.
clf = create_decision_tree_model(ml_data['x_train'], ml_data['y_train'])
# Report the model's statistics on the test split.
# NOTE(review): the validation split is not used here - confirm that is intended.
print_decision_tree_stats(clf, ml_data['x_test'], ml_data['y_test'])

In [None]:
# Random forest

# A fixed random_state makes the bootstrap samples and per-split feature
# subsets, and therefore the reported metrics, reproducible between runs.
clf = RandomForestClassifier(
    bootstrap=True,
    max_depth=100,
    max_features=3,
    min_samples_leaf=4,
    min_samples_split=8,
    n_estimators=300,
    random_state=42,
)
clf.fit(ml_data['x_train'], ml_data['y_train'])
# Evaluate on the test split: per-class metrics first, then the confusion matrix.
y_pred = clf.predict(ml_data['x_test'])
print(classification_report(ml_data['y_test'], y_pred))
print(confusion_matrix(ml_data['y_test'], y_pred))

In [None]:
# XGBoost

# Train a classifier with the library's default hyperparameters; fit() returns
# the fitted estimator, so construction and training are chained.
clf = XGBClassifier().fit(ml_data['x_train'], ml_data['y_train'])
y_pred = clf.predict(ml_data['x_test'])
# Print the per-class report first, then the confusion matrix, for the test split.
for metric_fn in (classification_report, confusion_matrix):
    print(metric_fn(ml_data['y_test'], y_pred))