In [None]:
# Analiza i modyfikacja danych
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Ewaluacja
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import precision_score, classification_report
from sklearn.metrics import recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# machine learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import balanced_accuracy_score

# Candidate classifiers compared in every experiment of this notebook.
# The estimators that accept a random_state all get a fixed seed so that a
# fresh "Restart & Run All" trains the same tree / forest / network each time
# (previously only the MLP was seeded).
models = [
    DecisionTreeClassifier(class_weight='balanced', random_state=1),
    RandomForestClassifier(class_weight='balanced', random_state=1),
    KNeighborsClassifier(),
    GaussianNB(),
    MLPClassifier(random_state=1, max_iter=300),
]
# Display names used as bar labels; the order must match `models`.
models_names = ['Tree', 'Random forest', 'K Neighbours', 'Naive Bayes', 'MLP']


def read_grouped_data(path='data/one_hour_data.csv'):
    """Load the grouped data set from a CSV file and discard its first column.

    Parameters
    ----------
    path : str or path-like, default 'data/one_hour_data.csv'
        Location of the CSV file. The default keeps the behaviour of the
        original zero-argument call.

    Returns
    -------
    pandas.DataFrame
        The file contents without the first column (presumably an index
        column written when the file was saved - confirm against the
        producer of the file).
    """
    df = pd.read_csv(path)
    # `columns=` already names the axis, so no extra `axis` argument is needed.
    return df.drop(columns=df.columns[0])


def calculate_feeling_rate_distribution(df):
    """Print and plot the percentage share of every ``feeling_rate`` value.

    Prints a ``DATA:`` header followed by one ``<value> <percent>%`` line per
    distinct value (in order of first appearance), then draws a pie chart of
    the same percentages on a new figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``feeling_rate`` column.

    Notes
    -----
    Percentages are relative to the number of non-missing ``feeling_rate``
    entries. Missing values are skipped; previously a NaN in the column raised
    a KeyError and the denominator was taken from the first column of ``df``.
    """
    print('DATA:')
    ratings = df['feeling_rate'].dropna()
    total = len(ratings)
    # Count once up front instead of recomputing value_counts() per value.
    occurrences = ratings.value_counts()
    numbers = ratings.unique()

    data = []
    for i in numbers:
        value = round((occurrences[i] / total) * 100, 2)
        data.append(value)
        print(str(i) + ' ' + str(value) + '%')

    fig, ax = plt.subplots()
    ax.pie(data, labels=numbers, autopct='%1.1f%%')


def print_df_information(df):
    """Print a quick overview of ``df``.

    Emits, in order: the first rows (``head``), the column/dtype summary
    (``info``) and the descriptive statistics (``describe``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    """
    print(df.head())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only added a stray "None" line to the output.
    df.info()
    print(df.describe())


def change_txt_data(df):
    """Replace every object-dtype column of ``df`` by one-hot encoded columns.

    Prints the summary of the object columns before encoding and the head of
    the resulting frame afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without its object columns, plus one column per category. The
        new columns are named after the category values themselves, so equal
        values in two different source columns yield duplicate column names.
        A frame without object columns is returned unchanged.
    """
    categoricals = list(df.select_dtypes(include=['O']).columns)
    if not categoricals:
        # Nothing to encode: skip the describe/encoder calls, which cannot
        # operate on an empty column selection.
        print(df.head())
        return df

    print(df.describe(include=['O']))
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(df[categoricals])
    # Reuse the caller's index so concat pairs rows one-to-one; a default
    # RangeIndex would misalign (and introduce NaN rows) whenever `df` carries
    # a filtered or otherwise non-default index.
    train_ohe = pd.DataFrame(
        encoded,
        columns=np.hstack(encoder.categories_),
        index=df.index,
    )
    df = pd.concat((df, train_ohe), axis=1).drop(categoricals, axis=1)
    print(df.head())
    return df

def addlabels(x, y):
    """Annotate the current axes with the values of ``y``.

    For every position ``i`` in ``range(len(x))`` the value ``y[i]`` is
    written as horizontally centred text at coordinates ``(i, y[i])``.
    """
    for position in range(len(x)):
        value = y[position]
        plt.text(position, value, value, ha='center')


def print_balanced_accuracy(balanced_accuracy, balanced_method_name, y):
    """Show a bar chart with one labelled bar per entry of ``models_names``.

    Parameters
    ----------
    balanced_accuracy : sequence of numbers
        Bar heights, one per model, in the order of ``models_names``. Despite
        the parameter name, any metric can be passed.
    balanced_method_name : str
        Plot title.
    y : str
        Label of the y axis.
    """
    plt.bar(models_names, balanced_accuracy)
    addlabels(models_names, balanced_accuracy)

    # Title of the plot
    plt.title(balanced_method_name)

    # X and Y axis labels
    plt.xlabel("Models")
    plt.ylabel(y)
    # show() must be called; the bare attribute reference `plt.show` was a
    # no-op expression and never finalised the figure explicitly.
    plt.show()


In [None]:

# Load the grouped data set. Alternative preparation steps, currently disabled:
#   train_df = read_raw_data()
#   train_df = change_txt_data(train_df)
train_df = read_grouped_data()

# Target vector and feature matrix.
Y = train_df['feeling_rate'].values
X = train_df.drop(columns=['feeling_rate']).values

# Correlation heatmap over all columns.
sns.heatmap(data=train_df.corr(), annot=True)
plt.tight_layout()

calculate_feeling_rate_distribution(df=train_df)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=71830,
)



In [None]:
# Class distribution of the target, shown in a cell of its own.
calculate_feeling_rate_distribution(df=train_df)

In [None]:

# Baseline: train every model on the untouched training split and collect
# balanced accuracy, plain accuracy and macro F1 (each rounded to 2 decimals).
balanced, accuracy, f1 = [], [], []

for model in models:
    print(model)
    Y_pred = model.fit(X_train, Y_train).predict(X_test)

    # Balanced accuracy
    balanced_accurancy = balanced_accuracy_score(Y_test, Y_pred)
    print(balanced_accurancy)
    balanced.append(round(balanced_accurancy, 2))

    # Classification report as a dict, to pull out single numbers
    report = classification_report(Y_test, Y_pred, output_dict=True)
    accur, macro_f1 = report['accuracy'], report['macro avg']['f1-score']
    accuracy.append(round(accur, 2))
    f1.append(round(macro_f1, 2))

    # Confusion matrix
    cm = confusion_matrix(Y_test, Y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
    disp.plot()


In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in f1], 'F1', 'F1 [%]')

In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in accuracy], 'Accuracy', 'Accuracy [%]')

In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in balanced], 'Balanced Accuracy', 'Balanced Accuracy [%]')

In [None]:
# Imbalanced data: random over-sampling.
# Only the training split is resampled; the test split stays untouched.
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X_train, Y_train)

balanced_random_over_sampler = []

for model in models:
    print(model)
    Y_pred = model.fit(X_resampled, Y_resampled).predict(X_test)

    # Balanced accuracy
    balanced_accurancy = balanced_accuracy_score(Y_test, Y_pred)
    print(balanced_accurancy)
    balanced_random_over_sampler.append(round(balanced_accurancy, 2))

    # Classification report
    print(classification_report(Y_test, Y_pred))

    # Confusion matrix
    cm = confusion_matrix(Y_test, Y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
    disp.plot()


In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in balanced_random_over_sampler], 'RandomOverSampler', 'Balanced Accuracy [%]')

In [None]:
# SMOTE: over-sample the training split with synthetic minority samples.
# A fixed random_state (matching the RandomOverSampler cell) keeps the
# generated samples identical between runs.
X_resampled, Y_resampled = SMOTE(random_state=0).fit_resample(X_train, Y_train)
balanced_smote = []

for model in models:
    print(str(model))
    model.fit(X_resampled, Y_resampled)
    Y_pred = model.predict(X_test)

    # Balanced accuracy
    balanced_accurancy = balanced_accuracy_score(Y_test, Y_pred)
    balanced_smote.append(round(balanced_accurancy, 2))
    print(balanced_accurancy)

    # Classification report
    print(classification_report(Y_test, Y_pred))

    # Confusion matrix
    cm = confusion_matrix(Y_test, Y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=model.classes_)
    disp.plot()


In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in balanced_smote], 'SMOTE', 'Balanced Accuracy [%]')

In [None]:
# ADASYN: adaptive synthetic over-sampling of the training split.
# A fixed random_state (matching the RandomOverSampler cell) keeps the
# generated samples identical between runs.
X_resampled, Y_resampled = ADASYN(random_state=0).fit_resample(X_train, Y_train)
balanced_adasyn = []

for model in models:
    print(str(model))
    model.fit(X_resampled, Y_resampled)
    Y_pred = model.predict(X_test)

    # Balanced accuracy
    balanced_accurancy = balanced_accuracy_score(Y_test, Y_pred)
    balanced_adasyn.append(round(balanced_accurancy, 2))
    print(balanced_accurancy)

    # Classification report
    print(classification_report(Y_test, Y_pred))

    # Confusion matrix
    cm = confusion_matrix(Y_test, Y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=model.classes_)
    disp.plot()


In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in balanced_adasyn], 'ADASYN', 'Balanced Accuracy [%]')

In [None]:
# BalancedBaggingClassifier: bagging ensemble that balances each bootstrap
# sample itself, wrapped around every base model in turn.
balanced_bbc = []
for model in models:
    print(str(model))
    # `estimator` is the current name of the parameter formerly called
    # `base_estimator` in imbalanced-learn.
    bbc = BalancedBaggingClassifier(estimator=model,
                                    sampling_strategy='auto',
                                    replacement=False,
                                    random_state=0)

    # Fit on the original training split. The ensemble does its own balancing,
    # and using X_resampled here would silently depend on whichever resampling
    # cell happened to run last.
    bbc.fit(X_train, Y_train)
    Y_pred = bbc.predict(X_test)

    # Balanced accuracy
    balanced_accurancy = balanced_accuracy_score(Y_test, Y_pred)
    balanced_bbc.append(round(balanced_accurancy, 2))
    print(balanced_accurancy)

    # Classification report
    print(classification_report(Y_test, Y_pred))

    # Confusion matrix. Labels come from the fitted ensemble; the wrapped
    # `model` object is only a template here and any classes_ on it would be
    # left over from an earlier cell.
    cm = confusion_matrix(Y_test, Y_pred, labels=bbc.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=bbc.classes_)
    disp.plot()
    


In [None]:
# The collected scores are fractions in [0, 1]; convert them to whole percent
# so the bars agree with the "[%]" axis label.
print_balanced_accuracy([round(score * 100) for score in balanced_bbc], 'BalancedBaggingClassifier', 'Balanced Accuracy [%]')
