In [1]:
import numpy as np
import statistics

from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from tqdm import tqdm
import pickle
import json

In [2]:
# Cached token -> embedding-vector mapping (value is None for tokens without
# an embedding); the file is written by the "Get vocabulary vectors" cells.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load a pickle that was generated locally by this notebook.
with open('unique_word_vectors.pickle', 'rb') as f:
    unique_word_vectors = pickle.load(f)

# Read the ratings with an explicit UTF-8 encoding: without it open() falls
# back to the locale default, which can fail on or mis-decode non-ASCII tweet
# text on some platforms.
with open('agg_tweet_ratings.json', encoding='utf-8') as f:
    agg_tweet_ratings = json.load(f)['data']

In [3]:
# Every experiment below is repeated once per seed (the seed drives both the
# train/test split and the seeded estimators); results are reported as the
# mean and standard deviation over these ten runs.
random_seeds = [128, 101, 77, 34, 255, 67, 195, 3, 222, 234]

# Get vocabulary vectors

In [None]:
import gensim.downloader

In [None]:
# Load the 'glove-twitter-100' embeddings through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use - confirm
# before running offline.
glove_vectors = gensim.downloader.load('glove-twitter-100')

In [None]:
# Read the ratings with an explicit UTF-8 encoding: without it open() falls
# back to the locale default, which can fail on or mis-decode non-ASCII tweet
# text on some platforms.
with open('agg_tweet_ratings.json', encoding='utf-8') as f:
    agg_tweet_ratings = json.load(f)['data']

# Map every distinct token found in the tweets to its GloVe vector.
# Tokens missing from the embedding vocabulary are stored as None so that
# later cells can tell "seen but no embedding" apart from "never seen".
unique_word_vectors = {}
for tweet in agg_tweet_ratings:
    for word in word_tokenize(tweet['clean_text']):
        if word not in unique_word_vectors:
            unique_word_vectors[word] = glove_vectors[word] if word in glove_vectors else None

In [None]:
import pickle
# Persist the token -> vector (or None) mapping so that the modelling cells
# can reload it from disk instead of loading the embeddings again.
with open('unique_word_vectors.pickle', 'wb') as f:
    pickle.dump(unique_word_vectors, f)

In [None]:
# (number of tokens that received an embedding, number of distinct tokens)
sum(vector is not None for vector in unique_word_vectors.values()), len(unique_word_vectors)

# Classifying Relevance

In [4]:
# Build one feature vector per tweet by summing the embeddings of its tokens;
# tweets in which no token has an embedding are left out entirely.
X = []
y = []
for tweet in agg_tweet_ratings:
    known_vectors = [
        vector
        for vector in (unique_word_vectors[token] for token in word_tokenize(tweet['clean_text']))
        if vector is not None
    ]
    if not known_vectors:
        continue
    X.append(np.sum(known_vectors, axis=0))
    y.append(tweet['relevance_rating'])
len(X)

6939

In [5]:
def run_models(seed):
    """Train the relevance classifiers for one seed and report macro-F1.

    Works on the module-level ``X`` and ``y``. The data is split once
    (stratified on ``y``) and an MLP, a logistic regression and an SVM are
    fitted on the training part, in that order.

    Args:
        seed: Random state used for the split and for every estimator.

    Returns:
        dict mapping 'mlp_f1', 'log_reg_f1' and 'svm_f1' to the macro-averaged
        F1 score of the corresponding model on the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=seed)

    # (result key, unfitted estimator) pairs, fitted in this order.
    candidates = (
        ('mlp_f1', MLPClassifier(random_state=seed, max_iter=500)),
        ('log_reg_f1', LogisticRegression(random_state=seed, solver='liblinear', max_iter=300)),
        ('svm_f1', svm.SVC(random_state=seed)),
    )

    scores = {}
    for score_key, estimator in candidates:
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        scores[score_key] = f1_score(y_test, predictions, average='macro')
    return scores

In [6]:
# One score dict per seed, in the order of random_seeds.
all_scores = [run_models(seed) for seed in tqdm(random_seeds)]

100%|██████████████████████████████████████████| 10/10 [01:10<00:00,  7.05s/it]


In [7]:
# Report mean and sample standard deviation across seeds for each model.
for metric in ('log_reg_f1', 'mlp_f1', 'svm_f1'):
    per_seed = [run[metric] for run in all_scores]
    mean_value = sum(per_seed) / len(per_seed)
    spread = statistics.stdev(per_seed)
    print(metric, round(mean_value, 3), round(spread, 3))


log_reg_f1 0.766 0.011
mlp_f1 0.756 0.013
svm_f1 0.754 0.011


# Classifying Xenophobia (3-class)

In [4]:
def colapse_to_three_categories(r):
    """Bucket a signed rating into one of three ordinal classes.

    Args:
        r: Numeric rating.

    Returns:
        0 when ``r`` is negative, 2 when ``r`` is positive, and 1 otherwise.
    """
    if r > 0:
        return 2
    if r < 0:
        return 0
    return 1

In [5]:
# Features/labels for the 3-class task: only tweets with a truthy relevance
# rating are used, and tweets without any embedded token are skipped.
X = []
y = []
for tweet in agg_tweet_ratings:
    if not tweet['relevance_rating']:
        continue
    known_vectors = [
        vector
        for vector in (unique_word_vectors[token] for token in word_tokenize(tweet['clean_text']))
        if vector is not None
    ]
    if known_vectors:
        X.append(np.sum(known_vectors, axis=0))
        y.append(colapse_to_three_categories(tweet['xm_rating']))
len(X)

3741

In [6]:
def run_models(seed):
    """Fit and score the 3-class logistic regression for one seed.

    Works on the module-level ``X`` and ``y``; the split is stratified on
    ``y``.

    Args:
        seed: Random state used for the split and for the classifier.

    Returns:
        dict with the single key 'log_reg_acc', holding the fitted
        classifier's ``score`` on the held-out part.
    """
    split = train_test_split(X, y, stratify=y, random_state=seed)
    X_train, X_test, y_train, y_test = split

    classifier = LogisticRegression(
        random_state=seed,
        solver='newton-cholesky',
        max_iter=300,
    )
    classifier.fit(X_train, y_train)

    return {'log_reg_acc': classifier.score(X_test, y_test)}

In [7]:
# One score dict per seed, in the order of random_seeds.
all_scores = [run_models(seed) for seed in tqdm(random_seeds)]

100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 19.22it/s]


In [8]:
# Mean and sample standard deviation of the per-seed scores.
per_seed = [run['log_reg_acc'] for run in all_scores]
mean_value = sum(per_seed) / len(per_seed)
spread = statistics.stdev(per_seed)
print('log_reg_acc', round(mean_value, 3), round(spread, 3))

log_reg_acc 0.811 0.007


# Classifying Xenophobia (7-class)

In [9]:
# Features and targets for the 7-class task: only tweets with a truthy
# relevance rating are used, and tweets without any embedded token are skipped.
#   y - rating rounded to an integer (classification target)
#   z - rating as stored (regression target)
# NOTE(review): Python's round() sends exact halves to the nearest even
# integer (0.5 -> 0, 1.5 -> 2); confirm that is the intended binning.
X = []
y = []
z = []
for tweet in agg_tweet_ratings:
    if not tweet['relevance_rating']:
        continue
    known_vectors = [
        vector
        for vector in (unique_word_vectors[token] for token in word_tokenize(tweet['clean_text']))
        if vector is not None
    ]
    if known_vectors:
        rating = tweet['xm_rating']
        X.append(np.sum(known_vectors, axis=0))
        y.append(round(rating))
        z.append(rating)
len(X)

3741

In [10]:
def run_models(seed):
    """Train and score the 7-class xenophobia models for one seed.

    Works on the module-level ``X`` (features), ``y`` (rounded integer
    ratings) and ``z`` (unrounded ratings). All three are split in a single
    ``train_test_split`` call, so the rows of ``y`` and ``z`` are guaranteed
    to stay aligned with ``X`` instead of relying on two separate calls
    happening to shuffle identically.

    Args:
        seed: Random state used for the split and for the Ridge 'sag' solver.

    Returns:
        dict with the keys 'lin_reg_acc', 'rigde_reg_acc', 'knn_reg_acc' and
        'knn_clf_acc'. Despite the ``_acc`` suffix, the three regressor
        entries hold the estimator's ``score`` on the raw ratings (R^2 for
        scikit-learn regressors); only 'knn_clf_acc' is a classification
        accuracy on the rounded ratings.
    """
    # One split call keeps features, class labels and raw ratings in lockstep.
    X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
        X, y, z, random_state=seed)

    scores = {}

    lin_reg = LinearRegression().fit(X_train, z_train)
    scores['lin_reg_acc'] = lin_reg.score(X_test, z_test)

    ridge_reg = Ridge(alpha=1.0, solver='sag', random_state=seed, max_iter=300).fit(X_train, z_train)
    # The key spelling ('rigde') is kept because the reporting cell reads it.
    scores['rigde_reg_acc'] = ridge_reg.score(X_test, z_test)

    knn_reg = KNeighborsRegressor(n_neighbors=7)
    knn_reg.fit(X_train, z_train)
    scores['knn_reg_acc'] = knn_reg.score(X_test, z_test)

    knn_clf = KNeighborsClassifier(n_neighbors=7)
    knn_clf.fit(X_train, y_train)
    scores['knn_clf_acc'] = knn_clf.score(X_test, y_test)

    return scores

In [11]:
# One score dict per seed, in the order of random_seeds.
all_scores = [run_models(seed) for seed in tqdm(random_seeds)]

100%|██████████████████████████████████████████| 10/10 [00:07<00:00,  1.34it/s]


In [12]:
# Report mean and sample standard deviation across seeds for each model.
for metric in ('lin_reg_acc', 'rigde_reg_acc', 'knn_reg_acc', 'knn_clf_acc'):
    per_seed = [run[metric] for run in all_scores]
    mean_value = sum(per_seed) / len(per_seed)
    spread = statistics.stdev(per_seed)
    print(metric, round(mean_value, 3), round(spread, 3))


lin_reg_acc 0.416 0.017
rigde_reg_acc 0.416 0.017
knn_reg_acc 0.426 0.02
knn_clf_acc 0.445 0.01
