In [None]:
%load_ext autoreload
%autoreload 2
%pylab inline

import os 
from linalgo.client import LinalgoClient

In [None]:
# Read the API token from the environment so the secret never lives in the
# notebook or in version control. Export LINALGO_TOKEN before starting Jupyter;
# a missing variable raises KeyError here rather than failing later.
# NOTE(review): any token that was previously committed in this cell should be
# revoked and re-issued.
linalgo_client = LinalgoClient(token=os.environ["LINALGO_TOKEN"])

In [None]:
# Fetch the tasks returned by the client and print the id and name of each.
tasks = linalgo_client.get_tasks()
for task in tasks:
    print(f"id: {task.id}, name: {task.name}")
# NOTE: `task` stays bound to the last element of `tasks` once the loop ends,
# so any later cell that reads `task` sees that element.

In [None]:
# Print the id and title of every entity attached to the last listed task.
# `tasks` is indexed explicitly instead of reusing a `task` loop variable left
# over from another cell: that leftover binding changes with execution order,
# while `tasks[-1]` always matches what a fresh top-to-bottom run produces.
entities = tasks[-1].entities
for entity in entities:
    print(f"id: {entity['id']}, name: {entity['title']}")

In [None]:
# Id of the label to learn as a binary target.
# NOTE(review): presumably one of the entity ids printed earlier -- confirm.
label = 7

# Accumulate documents and binary labels from every task except the last one.
data, target = [], []
for task in tasks[:-1]:
    docs, labels = task.transform(target='binary', label=label)
    data += docs
    target += labels

In [None]:
# Report how many documents were collected into `data`.
print(f"number of docs: {len(data)}")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Hold out 33% of the documents for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.33,
    random_state=43,
)

# Bag-of-words counts -> TF-IDF weighting -> logistic regression.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
]
# `fit` returns the fitted pipeline itself, so construction and training chain.
text_clf = Pipeline(pipeline_steps).fit(X_train, y_train)

# Signed decision scores on the held-out documents, used for the ROC analysis.
y_score = text_clf.decision_function(X_test)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# ROC curve of the held-out decision scores and the area underneath it.
fpr, tpr, thres = roc_curve(y_true=y_test, y_score=y_score)
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# Draw the ROC curve against the chance diagonal using an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

In [None]:
from linalgo.annotate import Annotator

In [None]:
# Fetch the single task whose documents are annotated below and report how
# many documents it holds.
task_id = 46
task = linalgo_client.get_task(task_id)
print(f"# docs: {len(task.documents)}")

In [None]:
# Wrap the trained pipeline in an annotator for the chosen label and attach it
# to the task fetched above.
# NOTE(review): threshold=0 presumably is the cut-off applied to the model's
# decision score -- confirm against the Annotator implementation.
annotator = Annotator(
    name='rob v1',
    model=text_clf,
    annotation_type_id=label,
    threshold=0,
)
annotator.assign_task(task)

In [None]:
# Build one record per document: its text, a YES/NO verdict and the score.
r = []
for doc in task.documents:
    annotation = annotator._get_annotation(doc)
    # A type_id of -1 is reported as "NO"; any other value as "YES".
    verdict = "NO" if annotation.type_id == -1 else "YES"
    r.append({'doc': doc.content, 'label': verdict, 'score': annotation.score})

In [None]:
import pandas as pd
# Show full cell contents instead of truncating long document text.
# `None` is the supported way to disable truncation; the former `-1` sentinel
# was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Turn the list of per-document records in `r` into a DataFrame.
d = pd.DataFrame(r)

In [None]:
# Display only the rows whose score is greater than -0.5.
d[d.score > -0.5]