In [2]:
import pandas as pd
from gensim.models import KeyedVectors
from gensim.utils import tokenize
import gensim
from tqdm import tqdm
import numpy as np
import re

In [3]:
# Both splits share one TSV layout; `target` and `id_str` go through the `str`
# converter so labels and tweet ids are kept as strings.
read_options = {'sep': '\t', 'converters': {'target': str, 'id_str': str}}
tweets_train = pd.read_csv('../../data/tweets_train2.tsv', **read_options)
tweets_test = pd.read_csv('../../data/tweets_test2.tsv', **read_options)

In [4]:
# Word vectors used for every embedding lookup in this notebook.
vectors_path = '../../data/language_models/wiki.multi.pl.vec'
vector_model = KeyedVectors.load_word2vec_format(vectors_path)

In [5]:
def clean_text(text):
  """Remove URLs from `text` and return its lowercased tokens joined by single spaces.

  Args:
    text: raw tweet text (a string).

  Returns:
    A single string made of the tokens produced by `tokenize(..., lowercase=True)`
    on the URL-free text, separated by one space each.
  """
  # URL matcher: scheme://..., www..., or domain.tld/... including balanced parentheses.
  url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
  without_urls = re.sub(url_pattern, '', text)
  return ' '.join(tokenize(without_urls, lowercase=True))

def filter_text(model, text):
  """Tell whether `text` contains at least one token known to `model`.

  Args:
    model: any object supporting `word in model` (here the loaded word vectors).
    text: text to tokenize (lowercased during tokenization).

  Returns:
    True if at least one token is present in `model`, otherwise False.
  """
  # `any` stops at the first known token instead of materialising the full list.
  return any(word in model for word in tokenize(text, lowercase=True))

In [7]:
def _clean_and_filter(tweets):
  """Add a `clean_text` column and keep only tweets with at least one known word.

  Args:
    tweets: DataFrame with a `full_text` column.

  Returns:
    A new, independent DataFrame (not a view of `tweets`) holding the rows whose
    cleaned text has at least one token present in `vector_model`.
  """
  # Column-wise apply: avoids building a Series per row (axis=1) and still
  # returns a Series when the frame is empty.
  cleaned = tweets.assign(clean_text=tweets['full_text'].apply(clean_text))
  has_known_word = (
    cleaned['clean_text']
    .apply(lambda text: filter_text(vector_model, text))
    .astype(bool)  # keeps the mask boolean even for an empty frame
  )
  # .copy() so that later cells can add columns without SettingWithCopyWarning.
  return cleaned[has_known_word].copy()


tweets_train = _clean_and_filter(tweets_train)
tweets_test = _clean_and_filter(tweets_test)

In [8]:
def prepare(model, tweets, targets):
  """Turn tweets into averaged, length-normalised word-vector embeddings.

  Each tweet is tokenized (lowercased); every token found in `model` is looked
  up, scaled to unit L2 norm, and the resulting vectors are averaged.

  Args:
    model: word-vector model supporting `word in model` and `model.get_vector(word)`.
    tweets: iterable of tweet texts.
    targets: iterable of labels, aligned with `tweets`.

  Returns:
    A pair `(embeddings, targets_final)` of numpy arrays. Tweets without any
    usable token are skipped, so both arrays can be shorter than the inputs,
    but they always stay aligned with each other.
  """
  embeddings = []
  targets_final = []
  # Give tqdm a total when the input is sized so it can render a real progress bar.
  total = len(tweets) if hasattr(tweets, '__len__') else None
  for tweet, target in tqdm(zip(tweets, targets), total=total):
    unit_vectors = []
    for word in tokenize(tweet, lowercase=True):
      if word not in model:
        continue
      vec = model.get_vector(word)
      norm = np.linalg.norm(vec)
      # An all-zero vector cannot be normalised; dividing would inject NaNs
      # into the tweet embedding, so skip it.
      if norm > 0:
        unit_vectors.append(vec / norm)
    if unit_vectors:
      embeddings.append(np.mean(unit_vectors, axis=0))
      targets_final.append(target)

  return np.asarray(embeddings), np.asarray(targets_final)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Learn the label <-> integer mapping from the training labels only, and encode
# the whole column in one vectorised call instead of one transform() per row.
le_train = LabelEncoder()
targets_train = pd.Series(
  le_train.fit_transform(tweets_train["target"]),
  index=tweets_train.index,
)

X_train, y_train = prepare(vector_model, tweets_train['clean_text'], targets_train)

# The test labels must use the SAME mapping as the training labels. A second
# encoder fitted on the test split would assign different integers whenever the
# two splits do not contain exactly the same set of classes. `le_test` is kept
# as an alias because later cells call `le_test.inverse_transform`.
# transform() raises ValueError if the test split contains a label never seen
# in training, instead of silently mis-encoding it.
le_test = le_train
targets_test = pd.Series(
  le_test.transform(tweets_test["target"]),
  index=tweets_test.index,
)

X_test, y_test = prepare(vector_model, tweets_test['clean_text'], targets_test)

2000it [00:00, 4149.41it/s]
499it [00:00, 3944.70it/s]


In [14]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(embeddings, targets, test_size=0.2, stratify=targets, random_state=42)

In [11]:
import xgboost as xgb

# Wrap each split's feature matrix and integer labels in an xgboost DMatrix.
dtrain, dtest = (
  xgb.DMatrix(features, label=labels)
  for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [14]:
# Booster configuration: 4-class soft-probability objective, AUC reported per round.
param = {
  'max_depth': 5,
  'eta': 0.4,
  'objective': 'multi:softprob',
  'num_class': 4,
  'nthread': 4,
  'eval_metric': 'auc',
}
# Both sets are evaluated after every boosting round (eval first, then train).
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 100
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

[0]	eval-auc:0.66014	train-auc:0.85798
[1]	eval-auc:0.69002	train-auc:0.90772
[2]	eval-auc:0.71144	train-auc:0.94096
[3]	eval-auc:0.71597	train-auc:0.95887
[4]	eval-auc:0.71504	train-auc:0.97179
[5]	eval-auc:0.72220	train-auc:0.97995
[6]	eval-auc:0.72559	train-auc:0.98596
[7]	eval-auc:0.72227	train-auc:0.99067
[8]	eval-auc:0.72263	train-auc:0.99361
[9]	eval-auc:0.73205	train-auc:0.99543
[10]	eval-auc:0.73432	train-auc:0.99708
[11]	eval-auc:0.73120	train-auc:0.99827
[12]	eval-auc:0.73327	train-auc:0.99883
[13]	eval-auc:0.73786	train-auc:0.99941
[14]	eval-auc:0.73777	train-auc:0.99963
[15]	eval-auc:0.73852	train-auc:0.99978
[16]	eval-auc:0.74137	train-auc:0.99984
[17]	eval-auc:0.74157	train-auc:0.99992
[18]	eval-auc:0.74131	train-auc:0.99992
[19]	eval-auc:0.74177	train-auc:0.99994
[20]	eval-auc:0.74137	train-auc:0.99996
[21]	eval-auc:0.74213	train-auc:0.99998
[22]	eval-auc:0.74222	train-auc:0.99998
[23]	eval-auc:0.74385	train-auc:0.99998
[24]	eval-auc:0.74497	train-auc:0.99999
[25]	eval-

In [15]:
from sklearn.metrics import classification_report

# The booster returns one score per class for every row; the predicted class
# is the column with the highest score.
class_scores = bst.predict(dtest)
y_hat = class_scores.argmax(axis=1)

report = classification_report(y_test, y_hat)
print(report)

              precision    recall  f1-score   support

           0       0.55      0.49      0.52       111
           1       0.58      0.64      0.61       165
           2       0.39      0.60      0.47       115
           3       0.67      0.27      0.38       108

    accuracy                           0.52       499
   macro avg       0.55      0.50      0.50       499
weighted avg       0.55      0.52      0.51       499



In [20]:
# Translate predicted class indices back into the original label strings.
predicted_labels = le_test.inverse_transform(y_hat)
tweets_test['prediction'] = predicted_labels

# Initialise the MW / JP / KS columns with the 'NA' placeholder.
for annotator in ('MW', 'JP', 'KS'):
  tweets_test[annotator] = 'NA'

In [78]:
annotators_df = pd.read_csv(
  '../../data/original_data.tsv',
  sep='\t',
  converters={'target': str, 'id': str},
)

# Replace missing values in the MW / JP / KS columns with the literal string 'NA'.
for annotator in ('MW', 'JP', 'KS'):
  missing = annotators_df[annotator].isna()
  annotators_df.loc[missing, annotator] = 'NA'

# Every distinct tweet text present in the annotation file.
text_set = set(annotators_df['full_text'])

def combine_columns(row, df, target_col):
  """Look up `target_col` in `df` for the row whose `full_text` matches `row`.

  The lookup relies only on the `df` argument (not on any module-level cache),
  so it works for whatever frame is passed in.

  Args:
    row: mapping / Series with a `full_text` entry.
    df: DataFrame with a `full_text` column and a `target_col` column.
    target_col: name of the column to read from `df`.

  Returns:
    The `target_col` value of the first row of `df` whose `full_text` equals
    `row['full_text']`, or the string 'NA' when there is no such row.
  """
  matches = df.loc[df['full_text'] == row['full_text'], target_col]
  if matches.empty:
    return 'NA'
  return matches.iloc[0]

In [80]:
# Fill MW / JP / KS from the annotation file, matching rows on the tweet text.
for annotator in ('MW', 'JP', 'KS'):
  tweets_test[annotator] = tweets_test.apply(
    lambda row: combine_columns(row, annotators_df, annotator),
    axis=1,
  )

In [82]:
# Export gold label, model prediction and the MW / JP / KS columns side by side.
comparison_columns = ['id_str', 'full_text', 'target', 'prediction', 'MW', 'JP', 'KS']
tweets_test.to_csv(
  '../../data/target_comparison.tsv',
  sep='\t',
  index=False,
  columns=comparison_columns,
)

# PCA

In [69]:
from sklearn.decomposition import PCA

# Fit a PCA on the tweet embeddings and project them onto the retained components.
# A float n_components asks PCA to keep as many components as are needed to
# reach that share of explained variance (see the counts recorded below).
# NOTE(review): `embeddings` is not assigned by any visible cell (it only exists
# as a local inside `prepare`) — confirm where it comes from; on a fresh kernel
# this cell presumably raises NameError.
pca = PCA(.95)
pca.fit(embeddings)
pc = pca.transform(embeddings)
# transform test
# Displayed as the cell output: number of components actually kept.
pca.n_components_
# pca.explained_variance_ratio_

# Component counts observed for different explained-variance thresholds:
# .98 -> 213 components
# .95 -> 173 components
# .90 -> 144 components
# .80 -> 105 components

197

In [70]:
# `train_test_split` is only imported in a commented-out cell of this notebook,
# so import it here to keep this cell runnable on a fresh kernel.
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the PCA-projected embeddings with a fixed seed.
# NOTE(review): `targets` is not assigned by any visible cell — confirm that it
# holds the labels aligned row-by-row with `pc`.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(pc, targets, test_size=0.2, stratify=targets, random_state=42)

# These rebind `dtrain` / `dtest`, replacing the matrices of the non-PCA experiment.
dtrain = xgb.DMatrix(X_train_pca, label=y_train_pca)
dtest = xgb.DMatrix(X_test_pca, label=y_test_pca)

In [72]:
# Booster configuration for the PCA features: 4-class soft-probability
# objective, AUC reported per round.
param = {
  'max_depth': 4,
  'eta': 0.3,
  'objective': 'multi:softprob',
  'num_class': 4,
  'nthread': 4,
  'eval_metric': 'auc',
}
# Both sets are evaluated after every boosting round (eval first, then train).
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 200
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

[0]	eval-auc:0.68508	train-auc:0.79545
[1]	eval-auc:0.70653	train-auc:0.83425
[2]	eval-auc:0.70715	train-auc:0.85075
[3]	eval-auc:0.70753	train-auc:0.87615
[4]	eval-auc:0.71748	train-auc:0.89817
[5]	eval-auc:0.71522	train-auc:0.91128
[6]	eval-auc:0.71982	train-auc:0.92539
[7]	eval-auc:0.72236	train-auc:0.93521
[8]	eval-auc:0.72171	train-auc:0.94535
[9]	eval-auc:0.72205	train-auc:0.95535
[10]	eval-auc:0.72137	train-auc:0.96278
[11]	eval-auc:0.72152	train-auc:0.96811
[12]	eval-auc:0.72284	train-auc:0.97276
[13]	eval-auc:0.72459	train-auc:0.97666
[14]	eval-auc:0.72522	train-auc:0.98141
[15]	eval-auc:0.72734	train-auc:0.98477
[16]	eval-auc:0.72419	train-auc:0.98745
[17]	eval-auc:0.72763	train-auc:0.98962
[18]	eval-auc:0.72738	train-auc:0.99122
[19]	eval-auc:0.72640	train-auc:0.99259
[20]	eval-auc:0.72790	train-auc:0.99382
[21]	eval-auc:0.72920	train-auc:0.99513
[22]	eval-auc:0.73054	train-auc:0.99603
[23]	eval-auc:0.72972	train-auc:0.99675
[24]	eval-auc:0.72881	train-auc:0.99722
[25]	eval-

In [73]:
from sklearn.metrics import classification_report

# Per-class scores from the booster trained on the PCA features.
y_hat = bst.predict(dtest)

# Predicted class = index of the highest score in each row.
y_hat = np.argmax(y_hat, axis=1)
# Score against the labels of the PCA split (`dtest` was built from
# X_test_pca / y_test_pca), not the `y_test` of the earlier non-PCA experiment,
# which belongs to a different set of rows.
report = classification_report(y_test_pca, y_hat)
print(report)

              precision    recall  f1-score   support

           0       0.48      0.40      0.44       105
           1       0.52      0.61      0.56       170
           2       0.50      0.55      0.52       156
           3       0.25      0.14      0.18        69

    accuracy                           0.48       500
   macro avg       0.44      0.43      0.43       500
weighted avg       0.47      0.48      0.47       500

