In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from text_analyzer.src.metrics import binary_classification as bin_metrics
from text_analyzer.src.metrics import multiclass_classification as multi_metrics
from text_analyzer.src.models.baselines import MulticlassClassifier

In [2]:
path = '../../static/datasets/modified/'
folders = {'bin' : 'bin_classification/', 'multi' : 'multi_classification/'}

In [3]:
#Dummy Classifier
from text_analyzer.src.models.baselines import DummyClassifier as Dummy_clf

dummy = Dummy_clf()
train_data = pd.read_csv(path + folders['bin'] + 'train_data.csv')
X_train, y_train = train_data.iloc[:, :-1], train_data.label
dummy.fit(X_train, y_train)

val_data = pd.read_csv(path + folders['bin'] + 'validation_data.csv')
X_val, y_val = val_data.iloc[:, :-1], val_data.label

y_pred = dummy.predict(X_val)
print(
    bin_metrics.accuracy_score(y_true=y_val, y_pred=y_pred),
    bin_metrics.recall_score(y_true=y_val, y_pred=y_pred),
    bin_metrics.precision_score(y_true=y_val, y_pred=y_pred),
)


dummy = Dummy_clf()
train_data = pd.read_csv(path + folders['multi'] + 'train_data.csv')
X_train, y_train = train_data.iloc[:, :-1], train_data.label
dummy.fit(X_train, y_train)

val_data = pd.read_csv(path + folders['multi'] + 'validation_data.csv')
X_val, y_val = val_data.iloc[:, :-1], val_data.label

y_pred = dummy.predict(X_val)
print(
    multi_metrics.accuracy_score(y_true=y_val, y_pred=y_pred),
    multi_metrics.recall_score(y_true=y_val, y_pred=y_pred),
    multi_metrics.precision_score(y_true=y_val, y_pred=y_pred)
)

0.52375 0.494475138121547 0.47480106100795755
0.69302870533099 0.549673427294603 0.549673427294603


In [4]:
#Naive Bayes
from text_analyzer.src.models.baselines import NaiveBayesClassifier as NB

nb = NB(mode='cat_features')
X_train, y_train = pd.read_csv(path + folders['bin'] + 'bayes_train.csv'), pd.read_csv(path + folders['bin'] + 'bayes_test.csv')
nb.fit(X_train, y_train)

X_val, y_val = X_train, y_train

y_pred = nb.predict(X_val)
print(
    bin_metrics.accuracy_score(y_true=y_val, y_pred=y_pred),
    bin_metrics.recall_score(y_true=y_val, y_pred=y_pred),
    bin_metrics.precision_score(y_true=y_val, y_pred=y_pred),
)

nb = NB(mode='cat_features')
X_train, y_train = pd.read_csv(path + folders['multi'] + 'bayes_train.csv'), pd.read_csv(path + folders['multi'] + 'bayes_test.csv')
nb.fit(X_train, y_train)

X_val, y_val = X_train, y_train

y_pred = nb.predict(X_val)
print(
    multi_metrics.accuracy_score(y_true=y_val, y_pred=y_pred),
    multi_metrics.recall_score(y_true=y_val, y_pred=y_pred),
    multi_metrics.precision_score(y_true=y_val, y_pred=y_pred),
)

0it [00:00, ?it/s]


KeyError: 1.0

In [3]:
from sklearn.linear_model import LogisticRegression

#Binary
logreg = LogisticRegression(C=2, penalty='l2', solver='saga')
train_data = pd.read_csv('../../static/embeddings/bin_average_train.csv')
X_train, y_train = train_data.iloc[:, :-1], train_data.label
logreg.fit(X_train, y_train)

val_data = pd.read_csv('../../static/embeddings/bin_average_val.csv')
X_val, y_val = val_data.iloc[:, :-1], val_data.label

y_proba = logreg.predict_proba(X_val)
print(bin_metrics.roc_auc_score(y_true=y_val, y_proba=y_proba))

#Multiclass
train_data = pd.read_csv('../../static/embeddings/multi_average_train.csv')
X_train, y_train = train_data.iloc[:, :-1], train_data.label

val_data = pd.read_csv('../../static/embeddings/multi_average_val.csv')
X_val, y_val = val_data.iloc[:, :-1], val_data.label

strategies = ["one-vs-all", "all-vs-all"]
for strategy in strategies:
    multi_clf = MulticlassClassifier(LogisticRegression, strategy)
    multi_clf.fit(X_train, y_train, target_name='label')
    y_pred = multi_clf.predict(X_val)
    print(
        multi_metrics.accuracy_score(y_true=y_val, y_pred=y_pred),
        multi_metrics.recall_score(y_true=y_val, y_pred=y_pred),
        multi_metrics.precision_score(y_true=y_val, y_pred=y_pred)
    )

0.4999999999999991
0.7923842999414177 0.6927877947295423 0.6927877947295423


2845it [00:00, 44977.65it/s]

0.7994141769185706 0.7060439560439561 0.7060439560439561





In [None]:
# %env CLEARML_WEB_HOST=https://app.clear.ml
# %env CLEARML_API_HOST=https://api.clear.ml
# %env CLEARML_FILES_HOST=https://files.clear.ml
# %env CLEARML_API_ACCESS_KEY=VT1X7EH5UU1REKI7T44X
# %env CLEARML_API_SECRET_KEY=NHZDqveny2UDyuTvhVADxwUOAgp2mVv3wyipHnKkCc42hiaJog

In [None]:
# from clearml import Task, Logger

In [None]:
# task = Task.init(
#     project_name='tonality analysis',
#     task_name='TestBaselines',
#     tags=['NaiveBayesClassifier', 'DummyClassifier'])

In [None]:
# task.upload_artifact(name='train_data_preprocessed_for_NB', artifact_object='../../../static/datasets/modified/bin_classification/bayes_train.csv')
# task.upload_artifact(name='test_data_preprocessed_for_NB', artifact_object='../../../static/datasets/modified/bin_classification/bayes_test.csv')

In [None]:
# task.close()