In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import pprint
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

from IPython import display
from matplotlib import pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# None lifts the column-width limit so long text cells are shown in full.
pd.set_option('display.max_colwidth', None)

In [2]:
# Column names are supplied explicitly through `names`
# (presumably the CSV has no header row -- confirm against the file).
csv_path = 'data/training.1600000.processed.noemoticon.csv'
column_names = ["sentiment", "id", "date", "flag", "user", "text"]
df = pd.read_csv(csv_path, names=column_names, encoding="ISO-8859-1")

# Encoding of the `sentiment` column:
#    4 -> positive
#    2 -> neutral
#    0 -> negative
#   -1 -> no label

In [3]:
# Keep only the label and the tweet text, then randomise the row order with a
# fixed seed, since the rows arrive sorted in the source file.
df = shuffle(df.loc[:, ['sentiment', 'text']], random_state=2137)

In [4]:
# Continue with the first 10,000 rows of the shuffled frame only.
df = df.iloc[:10_000]
df.describe()

Unnamed: 0,sentiment
count,10000.0
mean,1.9704
std,1.999881
min,0.0
25%,0.0
50%,0.0
75%,4.0
max,4.0


In [5]:
tweets = df['text']

# Count unigrams and bigrams of lower-cased, accent-stripped words.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents='unicode',
    input='content',
)

# fit_transform yields a sparse count matrix; .toarray() expands it into a
# dense ndarray holding every n-gram count, which is memory-heavy for a large
# vocabulary.
term_counts = vectorizer.fit_transform(tweets)
X = term_counts.toarray()

# Raw tweets and labels as plain arrays, in the same row order as X.
X_no_vectorized = tweets.to_numpy()
y = df['sentiment'].to_numpy()

In [6]:
# Hold out a test set. A fixed seed (the same value used for the shuffle
# above) keeps the split -- and every accuracy measured on it -- identical
# across kernel restarts.
#
# The raw tweets are split in the same call, so text_train / text_test are
# row-aligned with X_train / X_test. X_no_vectorized itself stays in pre-split
# order and therefore cannot be indexed with positions from X_train.
X_train, X_test, y_train, y_test, text_train, text_test = train_test_split(
    X, y, X_no_vectorized, random_state=2137
)

# Remove labels in training set: overwrite them with the "no label" marker (-1)
# so the training portion acts as an unlabelled pool.
y_train[:] = -1

print(f'{X_train.shape=} {X_test.shape=}')
print(f'{y_train.shape=} {y_test.shape=}')

X_train.shape=(7500, 91386) X_test.shape=(2500, 91386)
y_train.shape=(7500,) y_test.shape=(2500,)


In [7]:
# Active learner wrapping a random forest, with uncertainty sampling as the
# query strategy. No initial training data is passed here; labels are supplied
# later through learner.teach().
# The forest is seeded so that queries and accuracy figures are repeatable
# from run to run.
learner = ActiveLearner(
    estimator=RandomForestClassifier(random_state=2137),
    query_strategy=uncertainty_sampling,
)

In [None]:
def show_acc_plot(accuracy_collection):
    """Plot accuracy against the number of labelled queries.

    Parameters
    ----------
    accuracy_collection : sequence of float
        One accuracy value per teaching step, in the order they were recorded.
        An empty sequence produces an empty, labelled figure.

    Returns
    -------
    None
        The figure is drawn with ``plt.show()``.
    """
    scores = list(accuracy_collection)
    # Number the steps from 1 so the x axis reads as "labels provided so far".
    steps = range(1, len(scores) + 1)

    fig, ax = plt.subplots()
    ax.plot(steps, scores, marker='o')
    ax.set(
        xlabel='Labelled queries',
        ylabel='Accuracy',
        title='Accuracy per labelled query',
    )
    plt.show()

In [None]:
label_n = 10
valid_labels = (0, 2, 4)  # negative / neutral / positive
accuracy_history = []     # test accuracy after each taught label, in order

for _ in range(label_n):
    print(f'Yet unlabeled: {len(X_train)}, {len(y_train)}')

    # Ask the learner which pool instance it wants labelled next.
    query_idx, query_inst = learner.query(X_train)

    # NOTE(review): query_idx is a position in the X_train pool, whereas
    # X_no_vectorized is still in pre-split order (train_test_split shuffles
    # rows by default), so the tweet printed here is not guaranteed to be the
    # queried instance. Showing the right text requires a text array that was
    # split together with X and is pruned alongside X_train below.
    print(X_no_vectorized[query_idx])

    # Manually assign class. Keep asking until the answer is one of the
    # documented labels, so a typo neither aborts the session with a
    # ValueError nor teaches the model an unknown class.
    user_input = None
    while user_input not in valid_labels:
        try:
            user_input = int(input(f'Label, one of {valid_labels}: '))
        except ValueError:
            user_input = None

    # Teach with manually assigned class
    learner.teach(query_inst, np.array([user_input]))

    # Remove the labelled instance from the pool so it is not queried again.
    X_train = np.delete(X_train, query_idx, axis=0)
    y_train = np.delete(y_train, query_idx, axis=0)

    # Track accuracy on the held-out test set after every new label.
    accuracy = learner.score(X_test, y_test)
    accuracy_history.append(accuracy)
    print(f'Accuracy: {accuracy}')

Yet unlabeled: 7500, 7500
["really miss my vaporizer  For all u stoners, i highly recommend vapolution vaporizers. But don't leave them in the car in desert heat ;)"]
