In [None]:
import pandas as pd
import sqlalchemy
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sqlalchemy.exc import SQLAlchemyError
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from db_queries import username, password, dsn, dbhostname, service_name, dbtables, querys
from table_functions import *
from analyze_visualisation import *

In [None]:
# Load every configured table from the Oracle database into `data`
# (table name -> DataFrame), then drop the columns the analysis does not use.
# Credentials, host and service name are imported from db_queries.
data = {}

# NOTE(review): username/password are concatenated into the URL verbatim;
# presumably they contain no characters that need URL-escaping (e.g. '@', '/')
# — confirm, or escape them where they are defined.
sqlalchemy_engine = (
    "oracle+cx_oracle://" + username + ":" + password
    + "@" + dbhostname + "/?service_name=" + service_name
)

engine = None
try:
    engine = sqlalchemy.create_engine(sqlalchemy_engine, arraysize=1000)
    for table, query in zip(dbtables, querys):
        data[table] = pd.read_sql(query, engine)
except SQLAlchemyError as e:
    # Show the database error, then stop: continuing with an empty or
    # partially filled `data` would only fail later in a less obvious place.
    print(e)
    raise
finally:
    # Release pooled connections whether or not the load succeeded.
    if engine is not None:
        engine.dispose()

data = drop_unused_columns(data)

In [None]:
# Build the modelling table in three stages, innermost call first:
#   1. combine the loaded tables into one frame,
#   2. derive the final status column,
#   3. drop the columns that are not used for machine learning.
final_table = drop_columns_not_used_in_ml(
    create_final_status(
        combine_final_table(data)
    )
)

In [None]:
# Preprocessing pipeline. After each stage the current table is written to
# CSV so that every intermediate state can be inspected later.
# Order matters: each step consumes the table produced by the previous one.

# Stage 1: categorize the data; also returns the list of categorized columns,
# which the normalization stage needs below.
final_table, categorized_columns = categorize_data(final_table)
save_df_to_csv(final_table, 'final_table_before_standarization.csv')
# Stage 2: standardize the categorized table.
final_table = standarize_data(final_table)
save_df_to_csv(final_table, 'final_table_before_normalization.csv')
# Stage 3: normalize, passing the categorized columns from stage 1.
# NOTE(review): presumably the categorized columns are treated differently
# from the numeric ones here — confirm in table_functions.
final_table = normalize_data(final_table, categorized_columns)
save_df_to_csv(final_table, 'final_table.csv')

In [None]:
# Reload the fully preprocessed table from 'final_table.csv' (the file written
# by the preprocessing cell), replacing the in-memory `final_table`.
# NOTE(review): `read_csv` is not imported by name; presumably it comes from
# one of the star imports (table_functions / analyze_visualisation) — confirm.
final_table = read_csv('final_table.csv')

In [None]:
# Run the project's analysis helper on the preprocessed table.
# NOTE(review): presumably defined in analyze_visualisation and used for its
# plots/printed output only (the return value is discarded) — confirm.
analyze_data(final_table)

In [None]:
# Split the preprocessed table for model training/evaluation.
# NOTE(review): the model cells below use x_train, y_train, x_test, y_test and
# y_valid, but no visible cell assigns them. Presumably they are meant to be
# unpacked from `ml_data` — confirm and add the unpacking so the notebook
# survives Restart Kernel -> Run All.
ml_data = split_data(final_table)

In [None]:
# Split the table into two per-machine tables.
# NOTE(review): judging by the names these are presumably machines 9 and 10 —
# confirm. Neither result is used by the cells visible in this notebook.
final_table_9, final_table_10 = distinct_machine(final_table)

In [6]:
# Baseline model: a single decision tree with a fixed seed, trained on the
# training split and scored on the test split.
# NOTE(review): x_train / y_train / x_test / y_test are not assigned in any
# visible cell; presumably they come from split_data() — confirm.
clf = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Per-class precision/recall/F1 first, then the raw confusion matrix.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

         1.0       0.97      0.95      0.96     66070
         2.0       0.22      0.35      0.27      2734

    accuracy                           0.92     68804
   macro avg       0.59      0.65      0.61     68804
weighted avg       0.94      0.92      0.93     68804

[[62581  3489]
 [ 1776   958]]


In [4]:
# Random forest with hand-picked hyperparameters, trained on the training
# split and scored on the test split.
# random_state is pinned (same seed as the decision tree cell) because
# bootstrapping and per-split feature sampling are random; without it the
# reported metrics change from run to run.
# NOTE(review): x_train / y_train / x_test / y_test are not assigned in any
# visible cell; presumably they come from split_data() — confirm.
clf = RandomForestClassifier(
    bootstrap=True,
    max_depth=100,
    max_features=3,
    min_samples_leaf=4,
    min_samples_split=8,
    n_estimators=300,
    random_state=0,
)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Per-class precision/recall/F1 first, then the raw confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.97      0.98      0.98     66070
         2.0       0.40      0.25      0.31      2734

    accuracy                           0.96     68804
   macro avg       0.69      0.62      0.64     68804
weighted avg       0.95      0.96      0.95     68804

[[65045  1025]
 [ 2049   685]]


In [5]:
# XGBoost expects class labels encoded as 0..n_classes-1, whereas the labels
# in this notebook are 1/2 (see the reports of the other model cells), so the
# targets are shifted down before fitting.
# The offset is read from the training labels and applied to every split:
# on the first run it is 1 (same result as subtracting 1), and on any re-run
# it is 0, so executing this cell twice no longer pushes the labels to -1/0.
# NOTE(review): x_train / x_test and the y_* splits are not assigned in any
# visible cell; presumably they come from split_data() — confirm.
label_offset = y_train.min()
y_train = (y_train - label_offset).astype(int)
y_test = (y_test - label_offset).astype(int)
y_valid = (y_valid - label_offset).astype(int)

# Seed pinned for reproducibility, matching the decision tree cell.
clf = XGBClassifier(random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Per-class precision/recall/F1 first, then the raw confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93     66070
           1       0.16      0.51      0.24      2734

    accuracy                           0.87     68804
   macro avg       0.57      0.70      0.58     68804
weighted avg       0.94      0.87      0.90     68804

[[58548  7522]
 [ 1346  1388]]
