In [1]:
import numpy as np
import pandas as pd
import re
import string

In [2]:
data = pd.read_csv('labeled_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


#data preprocessing

In [3]:
def clean(text):
    text = str(text).lower() #convert to lowercase
    text = re.sub('', '', text) #remove empty spaces
    text = re.sub('https?://\S+|www\.\S+', '', text) #remove urls
    text = re.sub('<.*?>+', '', text) #remove html tages
    text = text.translate(str.maketrans('','', string.punctuation)) #remove punctuation
    text = re.sub('\n', '', text) #remove newlines
    text = re.sub('\w*\d\w*', '', text) # remove words containing digits
    return text

In [4]:
data["tweet"] = data['tweet'].apply(clean)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,rt mayasolovely as a woman you shouldnt compl...
1,1,3,0,3,0,1,rt boy dats coldtyga dwn bad for cuffin dat ...
2,2,3,0,3,0,1,rt urkindofbrand dawg rt you ever fuck a bit...
3,3,3,0,2,1,1,rt cganderson vivabased she look like a tranny
4,4,6,0,6,0,1,rt shenikaroberts the shit you hear about me ...


## GloVe Embedding

In [6]:
def load_glove_embedding(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [13]:
def get_text_embedding(text, embeddings_index, embedding_dim=50):
    tokens = clean(text).split()
    embedding_matrix = np.zeros((len(tokens), embedding_dim))

    for i, token in enumerate(tokens):
        if token in embeddings_index:
            embedding_matrix[i] = embeddings_index[token]
            print(f"Vector for '{token}':\n{embedding_matrix[i]}")
        else:
            print(f"Word '{token}' not found in the GloVe embeddings.")
    return np.mean(embedding_matrix, axis=0)

In [8]:
glove_file_path = 'glove.6B.50d.txt'

In [14]:
embeddings_index = load_glove_embedding(glove_file_path)

In [15]:
text_embedding = get_text_embedding(data, embeddings_index)

Vector for 'unnamed':
[ 0.70696002  0.85861999  0.40237001  0.50555998  0.51581001  0.14504001
 -0.59467     0.92992997  0.047511   -0.89498001 -0.15103     0.20306
  0.61654001  0.52529001 -0.0824     -0.49447    -0.63704997  0.37593001
  0.14463     0.59539998 -0.073085    0.07505     0.49399999  0.74923998
  0.20867001 -1.65009999 -0.31593001  0.48789999 -0.83216     0.56468999
  1.62129998 -1.45239997  0.83184999 -0.21170001  0.1204      0.0024316
 -0.60121    -0.82165003 -0.29438999 -0.33428001  0.43020999  0.024244
  0.33001     0.10939     0.50326997 -1.15840006 -0.73136002  1.19500005
  0.67730999 -0.28321001]
Vector for 'count':
[ 0.42736    -0.1601      0.31310001 -0.30712     0.80713999  0.65256
  0.69137001  1.03779995 -0.10361    -0.2262     -0.15936001 -0.31953999
 -0.25317001 -1.12370002  1.13139999 -0.94373    -0.22373    -1.05799997
 -0.36901999 -0.90653002  0.11563    -1.32389998  0.52449     0.22635999
 -0.076797   -1.41480005 -1.12349999 -0.82325     0.2881     -0.2

In [17]:
print("Text embedding:")
print(text_embedding)

Text embedding:
[-3.90612085e-04  1.58034647e-01  3.50292978e-02 -1.82952092e-01
  2.63807309e-01  2.42080633e-02 -1.14983762e-01 -1.26777751e-01
 -1.61332394e-01  1.00452860e-01 -9.60680472e-02  2.66233001e-01
 -1.77654704e-01  2.18174044e-02  2.05872695e-01  6.19041888e-03
 -7.43728890e-02  8.74194795e-02 -1.06314512e-01 -2.35328901e-01
 -2.46352441e-01  1.77225464e-01  2.95069205e-01  1.74787250e-01
  2.03961742e-01 -8.54275664e-01 -4.36268821e-01  2.05245065e-01
  2.97088106e-01 -4.24369911e-01  1.62910175e+00  1.90930903e-01
 -8.39779262e-02  1.29159956e-01 -4.54921052e-02 -1.83910060e-02
  1.69773966e-01 -1.24969898e-01  1.34921146e-01 -1.64536678e-01
  9.44162739e-02  6.46568132e-02 -1.74598062e-01  1.77967557e-01
  1.42745097e-01 -4.91231148e-02  1.60888600e-02 -2.75993871e-01
  1.16726683e-01  2.54611367e-01]
