In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import pprint
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from IPython import display
from matplotlib import pyplot as plt

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Disable pandas' column-width truncation so whole tweets are shown when a frame is displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the raw tweets from the CSV file (decoded as UTF-8) into a DataFrame.
df = pd.read_csv('data/tweets.csv', encoding="utf-8")

In [3]:
# Keep only the tweet text and add a `sentiment` column that starts at -1 for
# every row (the labelling loop later overwrites it with the entered class).
#
# `.assign` returns a fresh frame rather than a column-subset slice of the
# loaded one, so later `df.loc[..., 'sentiment'] = ...` writes do not trigger
# pandas' SettingWithCopyWarning.
df = df[['text']].assign(sentiment=-1)
df

Unnamed: 0,text,sentiment
0,@angel_shark77 @Shibtoken 🎗️UPCOMING LAUNCH SOON🎗️💥 @TeacherDoge THE NEW BSC 100X GEMS!🔥 UTILITY:🏅TEACHER DOGE SWAP🎖️TEACHER DOGE SNIPER BOT⭐ TG Global 🌐 : https://t.co/3h3DEmmbFo#Binance #BNB #ETH #BITCOIN #DOGE #MEMETECH #BABYDOGE #SHIBA #TEACHERDOGE,-1
1,"@Zetlydotio #Bitcoin could be on its way to $58K in the coming weeks as it breaks resistance levels from the symmetrical triangle and support range of $45.5k.Y’all follow @kimcaden9, she is an underrrated Bitcoiner, her tweets &amp; tips have been helpful so far.",-1
2,@PeteSessions ALARM! 🚨🚨 There is one share that will change your life forever! AND NASDAQ Listing is soon! BNXA is the paypal of #crypto and #bitcoin #btc #eth #ethereum #doge https://t.co/IqVlClE52h,-1
3,"Finally, some action. #BTC RR: 5.7#BItcoin #cryptotrading https://t.co/JCgKZKhvxP",-1
4,@cryptolissimo $CYLUM will be available for sale on the Cylum DAPPS &amp; Binance Smart Chain Network. @Cylumfinancehttps://t.co/5d96L3IwuG#Cylumfinance#BitcoinButton #bitcoin #BSCGems #binance #NFTs #Airdrops #mining #cryptocurrency #cryptocurrencies #BNBChain,-1
...,...,...
9994,YoBit Farming: Earn up to 427% APY: https://t.co/E2iUAWe5kr #bitcoin #etherum,-1
9995,13393) Current #Bitcoin Price is $47131 #BTC #CryptoIndicators Daily:RSI: 68.2MA(20): 42874MA(50): 41443MA(200): 48295Bollinger B. lower/upper: 36926/48821#Ethereum Price is $3407 #ETHRSI: 72.6MA(20): 2985MA(50): 2856MA(200): 3488BB: 2424/3546#VAIOT Price: $0.062,-1
9996,@WatcherGuru $CYLUM will be available for sale on the Cylum DAPPS &amp; Binance Smart Chain Network. @Cylumfinancehttps://t.co/TXVWFhFXQh#Cylumfinance#BitcoinButton #bitcoin #BSCGems #binance #NFTs #Airdrops #mining #cryptocurrency #cryptocurrencies #BNBChain,-1
9997,The only one who can save #Ukraine is#Russia #Kiev #Kyiv #Bitcoin,-1


In [4]:
# Bag-of-words features over lower-cased word unigrams and bigrams, with
# accents stripped to their ASCII form.
vectorizer = CountVectorizer(
    input='content', analyzer='word',
    lowercase=True, strip_accents='ascii',
    ngram_range=(1, 2),
)

# Fit the vocabulary on the tweets and materialise the count matrix as a dense
# NumPy array (one row per tweet, one column per n-gram).
X = vectorizer.fit_transform(df['text']).toarray()

In [5]:
# Display the dense feature matrix (NumPy elides most entries in the repr).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# Active learner wrapping a default random forest; it is created without any
# initial training data and picks instances to label via uncertainty sampling.
learner = ActiveLearner(estimator=RandomForestClassifier(), query_strategy=uncertainty_sampling)

In [7]:
def show_acc_plot(accuracies_list):
    """Render a line-and-marker plot of accuracy values in the cell output.

    Args:
        accuracies_list: Sequence of accuracy values; each value is plotted
            against its position in the sequence.

    Returns:
        None. The figure is pushed to the notebook via ``display.display`` and
        all open figures are closed afterwards.
    """
    # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
    # newer releases no longer accept the old names, so use whichever exists
    # and fall back to the default style if neither does.
    style_name = next(
        (
            candidate
            for candidate in ('seaborn-white', 'seaborn-v0_8-white')
            if candidate in plt.style.available
        ),
        'default',
    )
    steps = range(len(accuracies_list))

    with plt.style.context(style_name):
        plt.figure(figsize=(10, 3))
        plt.plot(steps, accuracies_list)
        plt.scatter(steps, accuracies_list)
        display.display(plt.gcf())
        # Close the figure so it is not rendered a second time and does not
        # accumulate in memory across repeated calls.
        plt.close('all')

In [8]:
# Accuracy history container; starts out empty.
accuracies = list()

In [None]:
# Number of queries to issue in one run of this cell.
label_n = 50

# `learner.query` returns positions within the shrinking pool `X`, not rows of
# `df`. `pool_positions[i]` records which positional row of `df` the i-th
# remaining pool row came from, so the tweet shown to the user and the row
# updated in `df` always match the instance the learner is taught with.
# The mapping persists between runs of this cell and is (re)built only while
# `X` still holds exactly one row per `df` row.
if 'pool_positions' not in globals() or len(pool_positions) != X.shape[0]:
    if X.shape[0] != len(df):
        raise RuntimeError(
            'X is out of sync with df; re-run the vectorizer cell to rebuild X '
            'before labelling.'
        )
    pool_positions = np.arange(len(df))

for _ in range(label_n):
    display.clear_output(wait=True)

    print(f'Yet unlabeled: {X.shape}')

    query_idx, query_inst = learner.query(
        X,
        random_tie_break=True,
        n_instances=1,
    )

    # Translate pool positions back to the corresponding `df` index labels.
    row_labels = df.index[pool_positions[query_idx]]
    display.display(df.loc[row_labels, 'text'])

    try:
        user_input = int(input())
    except ValueError:
        # Non-integer input: skip this query without teaching. Only ValueError
        # is caught so that interrupting the kernel still stops the loop.
        continue

    # Teach with manually assigned class
    learner.teach(np.array(query_inst), [user_input])
    df.loc[row_labels, 'sentiment'] = user_input

    # Remove labeled rows from the pool and from the position map in lockstep.
    X = np.delete(X, query_idx, axis=0)
    pool_positions = np.delete(pool_positions, query_idx)

Yet unlabeled: (9983, 81821)


1730    Farmerfun found #bitcoin in a User vault at this location! Join me playing #coinhuntworld, It's awesome! https://t.co/evBYMvYxKm #cryptocurrency #14810 https://t.co/gV18CHw1mf
Name: text, dtype: object