In [1]:
# all we need to import, including model
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the four data splits from disk. Every split is stored three times:
# the tweet frame, the sentence-transformer frame and the TF-IDF frame.
# NOTE(review): pd.read_pickle unpickles arbitrary objects -- only point these
# paths at files from a trusted source.
train_full, train_embed, train_tfidf = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        r'./tweets-data/train.pkl',
        r'./sentence-transformers/train_emb.pkl',
        r'./tfidf/train_tfidf.pkl',
    )
)

dev_full, dev_embed, dev_tfidf = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        r'./tweets-data/dev.pkl',
        r'./sentence-transformers/dev_emb.pkl',
        r'./tfidf/dev_tfidf.pkl',
    )
)

test_full, test_embed, test_tfidf = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        r'./tweets-data/test.pkl',
        r'./sentence-transformers/test_emb.pkl',
        r'./tfidf/test_tfidf.pkl',
    )
)

unlabel_full, unlabel_embed, unlabel_tfidf = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        r'./tweets-data/unlabeled.pkl',
        r'./sentence-transformers/unlabeled_emb.pkl',
        r'./tfidf/unlabeled_tfidf.pkl',
    )
)
                               

In [3]:
# Pull the target column ('Sentiment') out of each embedding frame.
# `test_lable` keeps its original (misspelled) name so existing references hold.
train_label, dev_label, test_lable = (
    split['Sentiment'] for split in (train_embed, dev_embed, test_embed)
)

# 'Demographic' is only extracted for the train and dev splits.
train_group, dev_group = (
    split['Demographic'] for split in (train_embed, dev_embed)
)

In [30]:
# Turn the per-row values stored in the `TFIDF` column of each embedding frame
# into a plain DataFrame (one row per tweet). The column is called TFIDF even
# though these frames are loaded from ./sentence-transformers/.
# NOTE(review): re-running this cell rebuilds the frames from scratch, which
# discards the extra columns 384/385 that the TextBlob cell adds to them.
train_embed_tfidf, dev_embed_tfidf, test_embed_tfidf = (
    pd.DataFrame(split['TFIDF'].tolist())
    for split in (train_embed, dev_embed, test_embed)
)

In [5]:
# Copy the `text` column of each full frame into its own DataFrame with a
# fresh 0..n-1 index. Despite the `_tfidf` suffix these hold the text values,
# not TF-IDF vectors.
train_full_tfidf, dev_full_tfidf, test_full_tfidf = (
    pd.DataFrame(split['text'].tolist())
    for split in (train_full, dev_full, test_full)
)

In [6]:
# Choose best K in KNN --> finally, choose K=46
# Sweep n_neighbors over 1..99 and score every model on the dev split.
# K and Acc start with a 0 placeholder so that Acc[i] is the accuracy for K == i;
# the scores are recorded so the best K is computed instead of read off the log.
K = [0]
Acc = [0]
for i in range(1, 100):
    knn = KNeighborsClassifier(i)
    knn.fit(train_embed_tfidf, train_label)
    knn_pred = knn.predict(dev_embed_tfidf)
    dev_acc = accuracy_score(dev_label, knn_pred)
    K.append(i)
    Acc.append(dev_acc)
    print("i = ", i, ":", dev_acc)

# np.argmax returns the first maximum, i.e. the smallest K among ties.
best_k = K[int(np.argmax(Acc))]
print("best K:", best_k, "dev accuracy:", Acc[best_k])
    


i =  1 : 0.621
i =  2 : 0.6065
i =  3 : 0.64225
i =  4 : 0.64
i =  5 : 0.66275
i =  6 : 0.657
i =  7 : 0.67625
i =  8 : 0.6665
i =  9 : 0.676
i =  10 : 0.668
i =  11 : 0.68075
i =  12 : 0.6755
i =  13 : 0.67825
i =  14 : 0.67725
i =  15 : 0.67775
i =  16 : 0.68025
i =  17 : 0.68575
i =  18 : 0.6805
i =  19 : 0.685
i =  20 : 0.68025
i =  21 : 0.68425
i =  22 : 0.68675
i =  23 : 0.68475
i =  24 : 0.68575
i =  25 : 0.68325
i =  26 : 0.68425
i =  27 : 0.68325
i =  28 : 0.68375
i =  29 : 0.686
i =  30 : 0.6855
i =  31 : 0.69025
i =  32 : 0.691
i =  33 : 0.69075
i =  34 : 0.68725
i =  35 : 0.69075
i =  36 : 0.68725
i =  37 : 0.68875
i =  38 : 0.68975
i =  39 : 0.6905
i =  40 : 0.68725
i =  41 : 0.68725
i =  42 : 0.68825
i =  43 : 0.6875
i =  44 : 0.6875
i =  45 : 0.688
i =  46 : 0.6915
i =  47 : 0.68875
i =  48 : 0.68825
i =  49 : 0.68575
i =  50 : 0.685
i =  51 : 0.689
i =  52 : 0.68725
i =  53 : 0.69
i =  54 : 0.68625
i =  55 : 0.6885
i =  56 : 0.686
i =  57 : 0.6875
i =  58 : 0.6865
i =  

In [7]:
# Dev-set accuracy of a depth-1 decision tree (the "1-R" baseline) and of the
# three candidate classifiers, all trained on the expanded embedding features.
# Each estimator's fit() returns the estimator itself, so construction and
# training are chained.
baseline = DecisionTreeClassifier(max_depth=1).fit(train_embed_tfidf, train_label)
base_pred = baseline.predict(dev_embed_tfidf)
print("1-R baseline accuracy:", accuracy_score(dev_label, base_pred))

gnb = GaussianNB().fit(train_embed_tfidf, train_label)
gnb_pred = gnb.predict(dev_embed_tfidf)
print("GaussianNB accuracy", accuracy_score(dev_label, gnb_pred))

# K picked from the n_neighbors sweep on the dev split.
num_neigh = 46
knn = KNeighborsClassifier(n_neighbors=num_neigh).fit(train_embed_tfidf, train_label)
knn_pred = knn.predict(dev_embed_tfidf)
print("KNeighborsClassifier", accuracy_score(dev_label, knn_pred))

lr = LogisticRegression(max_iter=800, solver='lbfgs').fit(train_embed_tfidf, train_label)
lr_pred = lr.predict(dev_embed_tfidf)
print("LogisticRegression accuracy", accuracy_score(dev_label, lr_pred))

1-R baseline accuracy: 0.59925
GaussianNB accuracy 0.61475
KNeighborsClassifier 0.6915
LogisticRegression accuracy 0.69825


In [8]:
# add two new features to the embedded data
import re
from textblob import TextBlob


def add_sentiment_features(features, tweets):
    """Append TextBlob subjectivity / polarity of `tweets.text` to `features`.

    The values are written in place as columns 384 (subjectivity) and
    385 (polarity). Each tweet is parsed once, and the values are assigned by
    position rather than by index label, so a length mismatch raises instead
    of silently producing NaN rows.
    """
    sentiments = [TextBlob(text).sentiment for text in tweets.text]
    features[384] = [s.subjectivity for s in sentiments]
    features[385] = [s.polarity for s in sentiments]


# NOTE(review): 384/385 assume the existing feature columns are 0..383 -- confirm.
add_sentiment_features(train_embed_tfidf, train_full)
add_sentiment_features(dev_embed_tfidf, dev_full)
add_sentiment_features(test_embed_tfidf, test_full)

In [9]:
# Repeat the baseline / GaussianNB / KNN / LogisticRegression comparison on the
# feature frames after the two sentiment columns have been added.
# Each estimator's fit() returns the estimator itself, so construction and
# training are chained.
baseline = DecisionTreeClassifier(max_depth=1).fit(train_embed_tfidf, train_label)
base_pred = baseline.predict(dev_embed_tfidf)
print("1-R baseline accuracy:", accuracy_score(dev_label, base_pred))

gnb = GaussianNB().fit(train_embed_tfidf, train_label)
gnb_pred = gnb.predict(dev_embed_tfidf)
print("GaussianNB accuracy", accuracy_score(dev_label, gnb_pred))

# Same K as in the comparison without the sentiment columns.
num_neigh = 46
knn = KNeighborsClassifier(n_neighbors=num_neigh).fit(train_embed_tfidf, train_label)
knn_pred = knn.predict(dev_embed_tfidf)
print("K-NearestNeighborClassifier", accuracy_score(dev_label, knn_pred))

lr = LogisticRegression(max_iter=800, solver='lbfgs').fit(train_embed_tfidf, train_label)
lr_pred = lr.predict(dev_embed_tfidf)
print("LogisticRegression accuracy", accuracy_score(dev_label, lr_pred))

1-R baseline accuracy: 0.59925
GaussianNB accuracy 0.6165
K-NearestNeighborClassifier 0.68875
LogisticRegression accuracy 0.6995


In [10]:
# prepare data for selecting the N best features
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max on the training features only, then apply that
# same scaling to both train and dev (the chi2 scorer needs non-negative input).
scaler = MinMaxScaler().fit(train_embed_tfidf)
train_embed_pos, dev_embed_pos = (
    scaler.transform(split) for split in (train_embed_tfidf, dev_embed_tfidf)
)

In [11]:
# select N best features (chi2 and mutual_info_classif) -- this is the example of the result by using KNN model
# For every k, keep the k top-scoring columns under each criterion, train a KNN
# on the reduced train features and report accuracy on the dev split.
# Note: the `test_new_*` variables hold the *dev* features.
ks = [100, 200, 300, "all"]
num_neigh = 46

# mutual_info_classif adds random noise to continuous features; without a fixed
# seed the selected columns (and the accuracy) change from run to run.
seeded_mutual_info = partial(mutual_info_classif, random_state=0)

for k in ks:
    x2 = SelectKBest(chi2, k=k)
    train_new_chi2 = x2.fit_transform(train_embed_pos, train_label)
    test_new_chi2 = x2.transform(dev_embed_pos)
    xm = SelectKBest(seeded_mutual_info, k=k)
    train_new_mutual = xm.fit_transform(train_embed_pos, train_label)
    test_new_mutual = xm.transform(dev_embed_pos)
    knn = KNeighborsClassifier(num_neigh)
    knn.fit(train_new_chi2, train_label)
    knn_pred = knn.predict(test_new_chi2)
    knn1 = KNeighborsClassifier(num_neigh)
    knn1.fit(train_new_mutual, train_label)
    knn_pred1 = knn1.predict(test_new_mutual)
    print(k, "chi2", "KNN accuracy", accuracy_score(dev_label, knn_pred))
    print(k, "mutual", "KNN accuracy", accuracy_score(dev_label, knn_pred1))

100 chi2 KNN accuracy 0.68475
100 mutual KNN accuracy 0.6925
200 chi2 KNN accuracy 0.68525
200 mutual KNN accuracy 0.68875
300 chi2 KNN accuracy 0.686
300 mutual KNN accuracy 0.69375
all chi2 KNN accuracy 0.687
all mutual KNN accuracy 0.687


In [12]:
# select N best features (chi2 and mutual_info_classif) -- this is the example of the result by using GaussianNB model
# For every k, keep the k top-scoring columns under each criterion, train a
# GaussianNB on the reduced train features and report accuracy on the dev split.
# Note: the `test_new_*` variables hold the *dev* features.
ks = [100, 200, 300, "all"]

# mutual_info_classif adds random noise to continuous features; without a fixed
# seed the selected columns (and the accuracy) change from run to run.
seeded_mutual_info = partial(mutual_info_classif, random_state=0)

for k in ks:
    x2 = SelectKBest(chi2, k=k)
    train_new_chi2 = x2.fit_transform(train_embed_pos, train_label)
    test_new_chi2 = x2.transform(dev_embed_pos)
    xm = SelectKBest(seeded_mutual_info, k=k)
    train_new_mutual = xm.fit_transform(train_embed_pos, train_label)
    test_new_mutual = xm.transform(dev_embed_pos)
    gnb = GaussianNB()
    gnb.fit(train_new_chi2, train_label)
    gnb_pred = gnb.predict(test_new_chi2)
    gnb1 = GaussianNB()
    gnb1.fit(train_new_mutual, train_label)
    gnb_pred1 = gnb1.predict(test_new_mutual)
    print(k, "chi2", "GaussianNB accuracy", accuracy_score(dev_label, gnb_pred))
    print(k, "mutual", "GaussianNB accuracy", accuracy_score(dev_label, gnb_pred1))

100 chi2 GaussianNB accuracy 0.61875
100 mutual GaussianNB accuracy 0.62725
200 chi2 GaussianNB accuracy 0.62
200 mutual GaussianNB accuracy 0.62325
300 chi2 GaussianNB accuracy 0.6185
300 mutual GaussianNB accuracy 0.6155
all chi2 GaussianNB accuracy 0.6165
all mutual GaussianNB accuracy 0.6165


In [13]:
# select N best features (chi2 and mutual_info_classif) -- this is the example of the result by using LR model
# For every k, keep the k top-scoring columns under each criterion, train a
# LogisticRegression on the reduced train features and report dev accuracy.
# Note: the `test_new_*` variables hold the *dev* features.
ks = [100, 200, 300, "all"]

# mutual_info_classif adds random noise to continuous features; without a fixed
# seed the selected columns (and the accuracy) change from run to run.
seeded_mutual_info = partial(mutual_info_classif, random_state=0)

for k in ks:
    x2 = SelectKBest(chi2, k=k)
    train_new_chi2 = x2.fit_transform(train_embed_pos, train_label)
    test_new_chi2 = x2.transform(dev_embed_pos)
    xm = SelectKBest(seeded_mutual_info, k=k)
    train_new_mutual = xm.fit_transform(train_embed_pos, train_label)
    test_new_mutual = xm.transform(dev_embed_pos)
    lr = LogisticRegression(solver='lbfgs', max_iter = 800)
    lr.fit(train_new_chi2, train_label)
    lr_pred = lr.predict(test_new_chi2)
    lr1 = LogisticRegression(solver='lbfgs', max_iter = 800)
    lr1.fit(train_new_mutual, train_label)
    lr_pred1 = lr1.predict(test_new_mutual)
    print(k, "chi2", "LogisticRegression accuracy", accuracy_score(dev_label, lr_pred))
    print(k, "mutual", "LogisticRegression accuracy", accuracy_score(dev_label, lr_pred1))

100 chi2 LogisticRegression accuracy 0.683
100 mutual LogisticRegression accuracy 0.6845
200 chi2 LogisticRegression accuracy 0.691
200 mutual LogisticRegression accuracy 0.68675
300 chi2 LogisticRegression accuracy 0.6965
300 mutual LogisticRegression accuracy 0.69575
all chi2 LogisticRegression accuracy 0.6975
all mutual LogisticRegression accuracy 0.6975


In [14]:
# clean the data from text
def clean(text):
    """Strip Twitter-specific noise from a single tweet string.

    Removes, in order: the `_TWITTER-ENTITY_` placeholder, @-mentions,
    '#' characters (the hashtag word itself is kept), the retweet marker
    'RT' followed by whitespace, and http/https URLs.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with the tokens above removed; remaining whitespace is
        left untouched.
    """
    # Drop the placeholder first so the wider handle pattern below cannot
    # consume part of it (e.g. in "@_TWITTER-ENTITY_").
    text = re.sub(r'_TWITTER-ENTITY_', '', text)
    # Handles may contain underscores; without '_' in the class "@user_name"
    # would leave "_name" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b keeps the marker from matching inside ordinary words ("SMART move").
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    return text

In [15]:
# clean the data from full data
def clean_text(test):
    """Apply `clean` to every entry of the `text` column of `test`.

    The column is replaced on the frame that was passed in, and that same
    frame is returned. Assigning the whole column avoids the chained
    `test.text[k] = ...` write, which used a running counter as an index
    label and therefore only worked for a default 0..n-1 index.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with a string column named `text`.

    Returns
    -------
    pandas.DataFrame
        The same object as `test`, with `text` cleaned.
    """
    test['text'] = test['text'].map(clean)
    return test


In [16]:
# Build the unlabeled counterparts of the labeled feature/text frames:
# the `TFIDF` column of the unlabeled embedding frame becomes the feature
# frame, and the `text` column of the unlabeled full frame becomes the text frame.
unlabel_embed_tfidf = pd.DataFrame(unlabel_embed['TFIDF'].tolist())
unlabel_full_tfidf = pd.DataFrame(unlabel_full['text'].tolist())

In [25]:
# get best KNN accuracy after selecting the N best features
# Keeps the 300 columns with the highest mutual information and scores a
# 46-neighbour KNN on the dev split (`test_new_mutual` holds dev features).

num_neigh = 46

# mutual_info_classif is randomised; the seed makes the selected columns, and
# therefore this accuracy, repeatable across runs.
xm = SelectKBest(partial(mutual_info_classif, random_state=0), k=300)
train_new_mutual = xm.fit_transform(train_embed_pos, train_label)
test_new_mutual = xm.transform(dev_embed_pos)
knn = KNeighborsClassifier(num_neigh)
knn.fit(train_new_mutual, train_label)
knn_pred = knn.predict(test_new_mutual)
print("KNN accuracy", accuracy_score(dev_label, knn_pred))

KNN accuracy 0.6895


In [26]:
# get best GaussianNB accuracy after selecting the N best features
# Keeps the 100 columns with the highest mutual information and scores a
# GaussianNB on the dev split (`test_new_mutual` holds dev features).

# mutual_info_classif is randomised; the seed makes the selected columns, and
# therefore this accuracy, repeatable across runs.
xm = SelectKBest(partial(mutual_info_classif, random_state=0), k=100)
train_new_mutual = xm.fit_transform(train_embed_pos, train_label)
test_new_mutual = xm.transform(dev_embed_pos)
gnb = GaussianNB()
gnb.fit(train_new_mutual, train_label)
gnb_pred = gnb.predict(test_new_mutual)
print("GNB accuracy", accuracy_score(dev_label, gnb_pred))

GNB accuracy 0.628


In [31]:
# Self-training with a GaussianNB base learner: labeled train rows plus the
# unlabeled rows (marked with label -1), evaluated on the dev split.
from sklearn.semi_supervised import SelfTrainingClassifier
base = GaussianNB()

self_training_model = SelfTrainingClassifier(base, threshold=0.8)
# The unlabeled frame only has the original feature columns. Restrict train and
# dev to those columns so that, if the sentiment columns 384/385 are present,
# concat does not pad the unlabeled rows with NaN.
semi_cols = unlabel_embed_tfidf.columns
semi = pd.concat([train_embed_tfidf[semi_cols], unlabel_embed_tfidf], ignore_index=True)
# One -1 per unlabeled row, sized from the data instead of a hard-coded count.
unlabel_lable = pd.Series([-1] * len(unlabel_embed_tfidf))
semi_label = pd.concat([train_label, unlabel_lable], ignore_index=True)
self_training_model.fit(semi, semi_label)
gnb_pred = self_training_model.predict(dev_embed_tfidf[semi_cols])
print("GaussianNB semi supervised accuracy", accuracy_score(dev_label, gnb_pred))

GaussianNB semi supervised accuracy 0.588


In [32]:
# Self-training with a KNN base learner: labeled train rows plus the
# unlabeled rows (marked with label -1), evaluated on the dev split.
from sklearn.semi_supervised import SelfTrainingClassifier
num_neigh = 46
base = KNeighborsClassifier(num_neigh)

self_training_model = SelfTrainingClassifier(base, threshold=0.8)
# The unlabeled frame only has the original feature columns. Restrict train and
# dev to those columns so that, if the sentiment columns 384/385 are present,
# concat does not pad the unlabeled rows with NaN.
semi_cols = unlabel_embed_tfidf.columns
semi = pd.concat([train_embed_tfidf[semi_cols], unlabel_embed_tfidf], ignore_index=True)
# One -1 per unlabeled row, sized from the data instead of a hard-coded count.
unlabel_lable = pd.Series([-1] * len(unlabel_embed_tfidf))
semi_label = pd.concat([train_label, unlabel_lable], ignore_index=True)
self_training_model.fit(semi, semi_label)
knn_pred = self_training_model.predict(dev_embed_tfidf[semi_cols])
print("KNN semi supervised accuracy", accuracy_score(dev_label, knn_pred))

KNN semi supervised accuracy 0.664


In [33]:
# Self-training with a LogisticRegression base learner: labeled train rows plus
# the unlabeled rows (marked with label -1), evaluated on the dev split.
from sklearn.semi_supervised import SelfTrainingClassifier
base = LogisticRegression(solver='lbfgs', max_iter = 800)

self_training_model = SelfTrainingClassifier(base, threshold=0.8)
# The unlabeled frame only has the original feature columns. Restrict train and
# dev to those columns so that, if the sentiment columns 384/385 are present,
# concat does not pad the unlabeled rows with NaN.
semi_cols = unlabel_embed_tfidf.columns
semi = pd.concat([train_embed_tfidf[semi_cols], unlabel_embed_tfidf], ignore_index=True)
# One -1 per unlabeled row, sized from the data instead of a hard-coded count.
unlabel_lable = pd.Series([-1] * len(unlabel_embed_tfidf))
semi_label = pd.concat([train_label, unlabel_lable], ignore_index=True)
self_training_model.fit(semi, semi_label)
lr_pred = self_training_model.predict(dev_embed_tfidf[semi_cols])
print("Logistic Regression semi supervised accuracy", accuracy_score(dev_label, lr_pred))

Logistic Regression semi supervised accuracy 0.70075


In [38]:
# for the mix of different methods
# Stack GaussianNB, KNN and LogisticRegression under a LogisticRegression
# meta-learner. The base estimators are constructed here explicitly instead of
# reusing whatever `gnb` / `knn` objects earlier cells left in the kernel, so
# the cell gives the same model regardless of which cells ran before it.
from sklearn.ensemble import StackingClassifier
m1 = GaussianNB()
m2 = KNeighborsClassifier(46)  # K chosen from the dev-set sweep
m3 = LogisticRegression(solver='lbfgs', max_iter = 1000)
estimators = [("m1", m1), ("m2", m2), ("m3", m3)]
final = LogisticRegression(solver = "lbfgs", random_state = 1)
clf = StackingClassifier(estimators = estimators, final_estimator = final)
# Last expression of the cell: fits the stack and displays its dev accuracy.
clf.fit(train_embed_tfidf, train_label).score(dev_embed_tfidf, dev_label)

0.7065

In [39]:
# Predict the test split with the stacked classifier and write the predictions
# to submission.csv as a single "Category" column (the default row index is
# written as well, as the first unnamed column).
kaggle = pd.DataFrame(clf.predict(test_embed_tfidf), columns=["Category"])
kaggle.to_csv("submission.csv")