In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
df = pd.read_csv("../data/train_data-v1.csv")
labels = df["updated_label"].to_numpy()
embeddings = np.load("../data/embeddings.npy")

In [3]:
from sklearn.model_selection import train_test_split

train_emb, test_emb, train_label, test_label = train_test_split(embeddings, labels, test_size=0.15, random_state=42)

In [4]:
print("Training samples: ",train_emb.shape[0])
print("Testing samples: ",test_emb.shape[0])

Training samples:  204000
Testing samples:  36000


In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=15, weights='distance')

In [25]:
knn_classifier.fit(train_emb, train_label)

In [9]:
knn_classifier.score(test_emb, test_label)

0.8674166666666666

In [26]:
label = knn_classifier.predict(test_emb)

In [27]:
from sklearn.metrics import confusion_matrix, f1_score
confusion_matrix(test_label, label)

array([[ 9784,   611,  1480],
       [  481, 10081,  1366],
       [  476,   236, 11485]])

In [28]:
f1_score(test_label, label, average='micro')

np.float64(0.8708333333333333)

In [29]:
from sklearn.linear_model import LogisticRegression
lr_cls = LogisticRegression(verbose=True)

In [30]:
lr_cls.fit(train_emb, train_label)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         2307     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.09861D+00    |proj g|=  4.19255D-03


 This problem is unconstrained.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 2307     34     38      1     0     0   4.450D-05   9.687D-02
  F =   9.6871911534196498E-002

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [31]:
lr_cls.score(test_emb, test_label)

0.9923333333333333

In [32]:
res = lr_cls.predict(test_emb)

In [33]:
confusion_matrix(res, test_label)

array([[11768,    39,    28],
       [   37, 11791,     4],
       [   70,    98, 12165]])

In [34]:
import pickle
with open("../models/sentiment-classifier-lr.pkl", "wb") as f:
    pickle.dump(lr_cls, f)

In [35]:
df_test = pd.read_csv("../data/testdata.manual.2009.06.14.csv", names=["label", "id", "timestamp", "unk1", "user_id", "tweet"], encoding="utf-8", encoding_errors="ignore")


In [36]:
import re

regex_url_matching = "https?://\S+|www.\S+"
regex_userid_matching = "@\S+"
regex_extra_spaces = "\s{2,}"
regex_hashtag = "#\S+"
regex_non_chars_line = "^[\W_]+$"

def clean_row(row):
    row = re.sub(regex_url_matching, ' ', row)
    row = re.sub(regex_userid_matching, ' ', row)
    row = re.sub(regex_hashtag, " ", row)
    row = re.sub(regex_non_chars_line, " ", row)
    row = re.sub(regex_extra_spaces, " ", row)
    row = row.strip()   

    if not row:
        return pd.NA
    
    return row

In [37]:
df_test["tweet"] = df_test["tweet"].apply(lambda x: clean_row(x))

In [38]:
df_test.head()

Unnamed: 0,label,id,timestamp,unk1,user_id,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,I loooooooovvvvvveee my Kindle2. Not that the ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs is...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the ...it fucking rocks!!!"
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,You'll love your Kindle2. I've had mine for a ...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,Fair enough. But i have the Kindle2 and I thin...


In [41]:
sample = df.sample(n=20)
encoder = SentenceTransformer("thenlper/gte-base")

label_mapping = {
    0 : "negative",
    1 : "positive",
    2 : "neutral"
}

for _, row in sample.iterrows():
    s = row["tweet"]
    l = label_mapping[row["updated_label"]]
    emb = encoder.encode(s)
    pred = lr_cls.predict(emb.reshape(1, -1))
    pl = label_mapping[pred[0]]

    print(f"{s} | pred: {pl} | actual: {l} ")
    



is in chemistry and has found her text book thanks witworth haa, science crew haha | pred: positive | actual: positive 
I'm sorry to hear that. Hope she feels better soon! | pred: negative | actual: negative 
survived the SAT exam and I am soooo exhausted!!!!! | pred: neutral | actual: neutral 
i donno how to turn the flour into ambuyat! | pred: negative | actual: negative 
Do I see a Monday morning miss next week? | pred: neutral | actual: neutral 
Very curious about the house Sounds like you two are having a blast desiging it Have fun! XXXX | pred: positive | actual: positive 
Get 100 followers a day using Once you add everyone you are on the train or pay vip | pred: positive | actual: positive 
yeah! that's good thanks!! | pred: positive | actual: positive 
So glad I got to tweet with you!!! I'll let you know when I blog about my mozzarella sticks &amp; your breadcrumbs | pred: positive | actual: positive 
Oh, I know! I was watching with the happiest little face | pred: positive | a

In [43]:
predicted_labels = []
actual_labels = []
batch_size = 100
num_batches = df_test.shape[0]//batch_size

train_label_mapping = {
    0 : "negative",
    1 : "positive",
    2 : "neutral"
}

test_label_mapping = {
    "negative" : 0,
    "neutral" : 2,
    "positive": 4,
}

def train2test_labels(labels):
    nlabels = []
    for l in labels:
        nlabels.append(test_label_mapping[train_label_mapping[l]])
    return nlabels


for idx in range(num_batches):
    batch = df_test.iloc[idx*batch_size:batch_size*(idx+1)]
    sentences = batch["tweet"].to_list()
    emb = encoder.encode(sentences)
    preds = lr_cls.predict(emb)
    predicted_labels += train2test_labels(preds)
    actual_labels += batch["label"].to_list()

    # break
    print(f"done batch {idx}/{num_batches}")

done batch 0/4
done batch 1/4
done batch 2/4
done batch 3/4


In [44]:
f1_score(predicted_labels, actual_labels, average="weighted")

np.float64(0.6902715661977158)

In [45]:
confusion_matrix(predicted_labels, actual_labels)

array([[122,  18,  13],
       [ 21,   5,  12],
       [  7,  71, 131]])

## With PCA embeddings

In [102]:
from sklearn.decomposition import PCA

In [131]:
pca = PCA(n_components=35)
embeddings_pca = pca.fit_transform(train_emb)


In [132]:
embeddings_pca_test = pca.transform(test_emb)

In [133]:
lr_cls_pca = LogisticRegression(verbose=True)

In [134]:
lr_cls_pca.fit(embeddings_pca, train_label)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          108     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.09861D+00    |proj g|=  3.14796D-02


 This problem is unconstrained.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  108     21     25      1     0     0   8.838D-05   9.918D-02
  F =   9.9181999767212412E-002

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [135]:
lr_cls_pca.score(embeddings_pca_test, test_label)

0.9900555555555556

In [136]:
predicted_labels = []
actual_labels = []
batch_size = 100
num_batches = df_test.shape[0]//batch_size

train_label_mapping = {
    0 : "negative",
    1 : "positive",
    2 : "neutral"
}

test_label_mapping = {
    "negative" : 0,
    "neutral" : 2,
    "positive": 4,
}

def train2test_labels(labels):
    nlabels = []
    for l in labels:
        nlabels.append(test_label_mapping[train_label_mapping[l]])
    return nlabels


for idx in range(num_batches):
    batch = df_test.iloc[idx*batch_size:batch_size*(idx+1)]
    sentences = batch["tweet"].to_list()
    emb = encoder.encode(sentences)
    in_ = pca.transform(emb)
    preds = lr_cls_pca.predict(in_)
    predicted_labels += train2test_labels(preds)
    actual_labels += batch["label"].to_list()

    # break
    print(f"done batch {idx}/{num_batches}")

done batch 0/4
done batch 1/4
done batch 2/4
done batch 3/4


In [137]:
f1_score(predicted_labels, actual_labels, average="weighted")

np.float64(0.6925530656691768)

In [138]:
confusion_matrix(predicted_labels, actual_labels)

array([[123,  16,  13],
       [ 21,   5,  12],
       [  6,  73, 131]])

In [141]:
k = lr_cls_pca.predict_proba(in_)

In [147]:
k[0][0].item()

0.21606789884289782