In [64]:
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer 
# --- Column keys -----------------------------------------------------------
# Named columns of the training frame (read with a header row).
tweet = 'tweet'
sentiment = 'subtask_a'
# Positional columns of the header-less test frame.
tweet_index, subtask_a_index = 0, 1

# --- word2vec hyper-parameters ----------------------------------------------
num_features = 100    # dimensionality of each word vector
min_word_count = 10   # words seen fewer times than this are ignored
num_workers = 4       # parallel training threads
context = 3           # context window size
downsampling = 0.001  # down-sampling threshold for frequent words

# Tag embedded in the names of the generated feature files.
name = f'num_features_{num_features}_min_word_count_{min_word_count}'

In [65]:
import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
# Alias kept for backwards compatibility; nothing in the visible cells uses it.
# ``LabeledSentence`` is the deprecated name of ``TaggedDocument`` and is not
# present in newer gensim releases, so fall back instead of raising
# AttributeError when the attribute is missing.
LabeledSentence = getattr(
    gensim.models.doc2vec, 'LabeledSentence', gensim.models.doc2vec.TaggedDocument
)

In [66]:
def load_data_from_file(path_name, train):
    """Read a tab-separated file located under ``./data/start-kit/``.

    Parameters
    ----------
    path_name : str
        Path of the file relative to ``./data/start-kit/``.
    train : bool
        When truthy the first line is used as the header row; otherwise the
        file is read without a header and columns get integer labels.

    Returns
    -------
    pandas.DataFrame
    """
    header_row = 0 if train else None
    return pd.read_csv('./data/start-kit/' + path_name, sep='\t', header=header_row)


def update_tweet_column(df, f, test=False):
    """Apply ``f`` to every value of the tweet column and return ``df``.

    The column is addressed with the module-level ``tweet_index`` key when
    ``test`` is truthy and with the module-level ``tweet`` key otherwise.
    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    if test:
        column = tweet_index
    else:
        column = tweet
    transformed = df[column].apply(f)
    df[column] = transformed
    return df

    

In [67]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance; tweet_cleaner calls tok.tokenize() on each
# cleaned string to split it into tokens before re-joining them.
tok = WordPunctTokenizer()


# @mentions: '@' followed by letters, digits or underscores.
pat1 = r'@[A-Za-z0-9_]+'
# http:// and https:// URLs, up to the next space.
pat2 = r'https?://[^ ]+'
# Either of the two patterns above.
combined_pat = r'|'.join((pat1, pat2))
# Bare "www." links, up to the next space. The dot is escaped so that only a
# literal "www." prefix matches; unescaped, "www" followed by ANY character
# matched and swallowed ordinary text such as "awwwwww".
www_pat = r'www\.[^ ]+'
# Contracted negations and their expanded replacements.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# Matches any key of negations_dic as a whole word. Keys are escaped so that a
# key containing a regex metacharacter cannot change the pattern.
neg_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, negations_dic)) + r')\b')



def tweet_cleaner(text):
    """Normalise one raw tweet into a lower-case, letters-only string.

    Steps, in order:
      1. extract the text content with BeautifulSoup (``lxml`` parser);
      2. drop a leading byte-order mark and turn U+FFFD into ``?``;
      3. remove matches of ``combined_pat`` and ``www_pat``;
      4. lower-case the text and expand the contractions in ``negations_dic``;
      5. replace every non-letter character with a space;
      6. tokenize with ``tok``, keep tokens longer than one character and
         join them with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; may be the empty string.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # The previous implementation called ``souped.decode("utf-8-sig")`` inside
    # a bare ``except:``. ``str`` has no ``decode`` on Python 3, so that path
    # always raised and was silently skipped. Do the equivalent clean-up
    # explicitly on the text instead of swallowing every exception.
    bom_removed = souped.lstrip(u"\ufeff").replace(u"\ufffd", "?")
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Replacing non-letters with spaces leaves runs of unnecessary whitespace;
    # tokenizing and re-joining collapses them. Single-character tokens are
    # dropped in the same pass.
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()



In [68]:
def avg_sentence_vector(tweet, model, num_features):
    """Average the vectors of all in-vocabulary words of a tweet.

    Parameters
    ----------
    tweet : str
        Text that is split on whitespace into words.
    model : object
        Object exposing word vectors through ``model.wv`` (supports
        ``word in model.wv`` and ``model.wv[word]``).
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the words found in ``model.wv``;
        an all-zero vector of length ``num_features`` when none is found.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in tweet.split():
        if word in model.wv:
            nwords += 1
            # Read the vector from model.wv, the same object used for the
            # membership test above; indexing the model itself (model[word])
            # is a deprecated shortcut that newer gensim releases removed.
            featureVec = np.add(featureVec, model.wv[word])

    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [69]:
from gensim.models import word2vec
def train_model(df, file_name):
    """Train a word2vec model on the tweets in ``df`` and save it to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``tweet`` column holds strings; each is split on
        whitespace to form one training sentence.
    file_name : str
        Path prefix of the saved model. The suffix
        ``'<num_features>features_<min_word_count>minwords_<context>context'``
        is appended to it.

    Returns
    -------
    The trained model, after ``init_sims(replace=True)`` has been applied.
    """
    import inspect
    import os

    # Resolve the output location first so that a missing directory cannot
    # make ``model.save`` fail after training has already been paid for.
    model_name = f'{num_features}features_{min_word_count}minwords_{context}context'
    model_path = file_name + model_name
    os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)

    tweets = df[tweet].map(lambda text: text.split())
    print("Training model....")

    # The dimensionality keyword is called ``size`` in older gensim releases
    # and ``vector_size`` in newer ones; use whichever the installed class
    # accepts.
    init_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
    dim_kwarg = 'vector_size' if 'vector_size' in init_params else 'size'
    model = word2vec.Word2Vec(tweets,
                              workers=num_workers,
                              min_count=min_word_count,
                              window=context,
                              sample=downsampling,
                              **{dim_kwarg: num_features})

    # To make the model memory efficient
    model.init_sims(replace=True)

    # Saving the model for later use. Can be loaded using Word2Vec.load()
    model.save(model_path)

    print('finished')

    return model


def load_and_preprocess_training_data(file_name):
    """Load the training file and clean its tweet column.

    Parameters
    ----------
    file_name : str
        Path handed to ``load_data_from_file`` (read with a header row).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the tweet column (cleaned with ``tweet_cleaner``),
        ``'id'``, the sentiment column cast to ``str``, ``'pre_clean_len'``
        (tweet length before cleaning) and ``'post_clean_len'`` (after).
    """
    raw = load_data_from_file(file_name, True)
    # The earlier ``df.set_index('id')`` call discarded its result, so the
    # index was never changed; 'id' stays a regular column, as before.
    # Work on an explicit copy so the column assignments below do not write
    # into a slice of ``raw`` (SettingWithCopyWarning).
    df = raw[[tweet, 'id', sentiment]].copy()
    df['pre_clean_len'] = df[tweet].map(len)
    df[sentiment] = df[sentiment].astype(str)
    df = update_tweet_column(df, tweet_cleaner)
    df['post_clean_len'] = df[tweet].map(len)

    return df
    
    
# Load and clean the training set, then fit and save the word2vec model on it.
# (A mid-cell ``train_data.head()`` was removed: only a cell's last expression
# is displayed, so it had no effect.)
train_data = load_and_preprocess_training_data('training-v1/offenseval-training-v1.tsv')
model = train_model(train_data, './word2vec/')


Training model....
finished


In [70]:
# Preview the first rows of the cleaned training frame.
train_data.head()

Unnamed: 0,tweet,id,subtask_a,pre_clean_len,post_clean_len
0,she should ask few native americans what their...,86426,OFF,71,62
1,go home you re drunk maga trump url,90194,OFF,67,35
2,amazon is investigating chinese employees who ...,16820,NOT,182,176
3,someone should vetaken this piece of shit to v...,62688,OFF,65,52
4,obama wanted liberals illegals to move into re...,43605,NOT,72,54


In [71]:
def save_train_data(df):
    """Vectorise the training tweets and write them to a CSV file.

    Each tweet is turned into one vector with ``avg_sentence_vector`` using
    the module-level ``model`` and ``num_features``. The result is written to
    ``./feature_vectors/train-word2vec-<name>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the tweet column and an ``'id'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, one column per vector component, indexed by the
        values of ``'id'`` (index name ``'id'``).
    """
    import os

    df = df[[tweet, 'id']]
    vectors = [avg_sentence_vector(text, model, num_features) for text in df[tweet].values]
    vectorised_df = pd.DataFrame(vectors)
    # Label the rows positionally with the ids. The previous
    # ``set_index(df['id'], 'id')`` passed 'id' as the positional ``drop``
    # argument, which recent pandas rejects (keyword-only) with a TypeError.
    vectorised_df.index = pd.Index(df['id'].values, name='id')

    out_path = f'./feature_vectors/train-word2vec-{name}.csv'
    # Create the output directory if needed so to_csv cannot fail on it.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    vectorised_df.to_csv(out_path)
    return vectorised_df
# Vectorise the training tweets, write the CSV, and preview the resulting frame.
train = save_train_data(train_data)
train.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86426,-0.025808,-0.053397,-0.071959,0.007813,-0.037019,-0.011515,-0.060435,-0.014528,-0.054141,0.156113,...,-0.000262,-0.043734,-0.103292,0.095441,-0.086137,-0.167084,-0.083201,0.106072,-0.063322,-0.043647
90194,0.030471,-0.072112,-0.053726,0.001796,-0.02543,-0.022651,-0.013624,0.012562,-0.008981,0.176921,...,0.085329,-0.036498,-0.041904,0.09442,-0.085861,-0.13201,-0.074978,0.103005,-0.056452,-0.002916
16820,0.003218,-0.037586,-0.056901,0.001754,-0.045029,-0.012066,-0.062627,-0.022517,-0.027748,0.190147,...,0.024784,-0.036471,-0.061951,0.132558,-0.080589,-0.177264,-0.099771,0.093564,-0.097557,-0.007
62688,0.006023,-0.052959,-0.089783,0.018669,-0.036615,0.002618,-0.062124,-0.020938,-0.020009,0.149017,...,0.00743,-0.006138,-0.098574,0.106535,-0.075268,-0.162966,-0.085685,0.091813,-0.074811,-0.019161
43605,0.016144,-0.07713,-0.083577,0.022928,-0.066626,-0.0164,-0.051755,-0.026513,-0.034025,0.176835,...,0.043138,-0.005813,-0.103573,0.142955,-0.08836,-0.174962,-0.103088,0.109847,-0.079658,-0.000437


In [72]:
def process_test_data():
    """Load, clean and vectorise the trial data, then write it to a CSV file.

    Reads ``trial-data/offenseval-trial.txt`` through ``load_data_from_file``
    (no header row), cleans the column at ``tweet_index`` with
    ``tweet_cleaner``, turns each tweet into one vector with
    ``avg_sentence_vector`` using the module-level ``model`` and
    ``num_features``, and writes the result to
    ``./feature_vectors/test-word2vec-<name>.csv`` with the index column
    labelled ``'id'``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, one column per vector component, with the default
        integer index.
    """
    import os

    raw = load_data_from_file('trial-data/offenseval-trial.txt', False)
    # Copy the selected column: update_tweet_column overwrites it in place,
    # and writing into a slice of ``raw`` raises SettingWithCopyWarning.
    df = raw[[tweet_index]].copy()
    df = update_tweet_column(df, tweet_cleaner, True)
    vectors = [avg_sentence_vector(text, model, num_features) for text in df[tweet_index].values]
    vectorised_df = pd.DataFrame(vectors)

    out_path = f'./feature_vectors/test-word2vec-{name}.csv'
    # Create the output directory if needed so to_csv cannot fail on it.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    vectorised_df.to_csv(out_path, index_label='id')
    return vectorised_df
# Vectorise the trial tweets, write the CSV, and preview the resulting frame.
test = process_test_data()
test.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.011055,-0.069653,-0.074623,0.022289,-0.054083,-0.00301,-0.043216,-0.038179,-0.041985,0.146613,...,0.01244,-0.017667,-0.095614,0.110406,-0.078415,-0.152514,-0.102871,0.098589,-0.056031,-0.019192
1,-0.021337,-0.048995,-0.082546,0.005377,-0.054076,-0.034848,-0.066061,-0.02594,-0.070016,0.142591,...,0.00613,-0.018716,-0.117522,0.120904,-0.095736,-0.171728,-0.070688,0.111781,-0.067309,-0.028586
2,-0.022788,-0.054937,-0.07375,0.006791,-0.042893,-0.006603,-0.072625,-0.018558,-0.045568,0.194205,...,0.008616,-0.048817,-0.089716,0.125778,-0.095211,-0.200785,-0.111419,0.117071,-0.089749,-0.028576
3,0.007014,-0.063645,-0.096343,0.021724,-0.04115,-0.016515,-0.045172,-0.023174,-0.022182,0.16748,...,0.031759,-0.012882,-0.094411,0.131319,-0.09734,-0.172573,-0.100675,0.120517,-0.049792,-0.010523
4,-0.014689,-0.054121,-0.068959,0.003368,-0.074294,-0.034042,-0.079448,-0.035391,-0.075968,0.200693,...,0.017985,-0.031521,-0.111532,0.162854,-0.105122,-0.205485,-0.101574,0.112916,-0.101488,-0.017188
