In [14]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:

# Load the sampled tweet dataset.
df = pd.read_csv('../data/df_sample25.csv')

# Append one manually constructed row to the loaded data.
new_row = pd.DataFrame({
    'TweetID': [1344796664837637121],
    'LangID': [1],
    'TopicID': [1],
    'HateLabel': [2],
    'TweetText': ['I hate all people in this word!, I am really angry kill kill kill, death!']
})
df = pd.concat([df, new_row], ignore_index=True)

# Replace missing tweet text with empty strings so later string processing never
# receives NaN. The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection is chained assignment, which raises
# a FutureWarning on recent pandas and leaves `df` unchanged under Copy-on-Write.
df['TweetText'] = df['TweetText'].fillna('')

In [12]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec
import numpy as np

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache of the NLTK resources shared by all preprocessing() calls.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        # Build both objects before storing them so a failure (for example a
        # missing NLTK corpus) leaves the cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESSING_RESOURCES['stop_words'] = stop_words
        _PREPROCESSING_RESOURCES['lemmatizer'] = lemmatizer
    return _PREPROCESSING_RESOURCES['stop_words'], _PREPROCESSING_RESOURCES['lemmatizer']


def preprocessing(sentence):
    """Clean a raw text and return it as a space-separated string of lemmas.

    Steps, in order: strip surrounding whitespace, drop the typographic
    apostrophe, lowercase, remove digits, remove ASCII punctuation, tokenize,
    remove English stop words, then lemmatize each token as a verb and as a noun.

    Args:
        sentence: The text to clean. ``None`` and NaN are treated as empty text.

    Returns:
        The cleaned text; an empty string when nothing is left after cleaning.
    """
    # Missing values (None, or the float NaN pandas uses for empty cells) have
    # no text to clean.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''

    # Basic cleaning
    sentence = sentence.strip()
    # The typographic apostrophe is not part of string.punctuation, so it is
    # removed explicitly.
    sentence = sentence.replace('’', '')
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Advanced cleaning: delete all punctuation characters in a single pass.
    sentence = sentence.translate(_PUNCTUATION_TABLE)

    # Nothing left to tokenize: skip the NLTK work entirely.
    if not sentence.strip():
        return ''

    stop_words, lemmatizer = _get_preprocessing_resources()

    # NOTE(review): punctuation is stripped before stop-word filtering, so
    # contractions such as "don't" become "dont" and may no longer match the
    # stop-word list - confirm this is intended.
    tokens = [word for word in word_tokenize(sentence) if word not in stop_words]

    # Lemmatize as a verb first, then lemmatize that result as a noun.
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos="v"), pos="n")
        for word in tokens
    ]
    return ' '.join(lemmas)

def tokenizer(df):
    """Split every entry of ``df["Cleaned_text"]`` into a list of word tokens.

    Args:
        df: Table-like object with a ``"Cleaned_text"`` column of strings.

    Returns:
        A list with one token list per row, in row order.
    """
    token_lists = []
    for cleaned_text in df["Cleaned_text"]:
        token_lists.append(text_to_word_sequence(cleaned_text))
    return token_lists

def vectorizer(X_train):
    """Train a Word2Vec model on the tokenized sentences.

    Args:
        X_train: Sentences to train on, each given as a list of word tokens.

    Returns:
        The Word2Vec model built with 100-dimensional vectors and a window of 3.
    """
    training_options = {"vector_size": 100, "window": 3}
    return Word2Vec(sentences=X_train, **training_options)

def embed_sentence(word2vec, sentence):
    """Convert a tokenized sentence into a matrix of word vectors.

    Words that are not in the model's vocabulary are skipped.

    Args:
        word2vec: Trained model exposing its word vectors through ``word2vec.wv``.
        sentence: Iterable of word tokens.

    Returns:
        A 2-D array with one row per in-vocabulary word, in sentence order.
        When no word is in the vocabulary the array has shape
        ``(0, word2vec.wv.vector_size)``, so every sentence yields the same
        number of columns.
    """
    keyed_vectors = word2vec.wv
    known_vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors]

    if not known_vectors:
        # np.array([]) would be 1-D with shape (0,), unlike the 2-D result of
        # every other sentence; keep the column dimension explicit instead.
        # float32 is assumed to match the model's vector dtype - TODO confirm.
        return np.empty((0, keyed_vectors.vector_size), dtype=np.float32)

    return np.array(known_vectors)

def embedding(word2vec, sentences):
    """Embed each tokenized sentence with ``embed_sentence``.

    Args:
        word2vec: Model handed unchanged to ``embed_sentence`` for every sentence.
        sentences: Iterable of tokenized sentences.

    Returns:
        A list holding one ``embed_sentence`` result per sentence, in input order.
    """
    return [embed_sentence(word2vec, tokens) for tokens in sentences]

In [15]:

# Clean every tweet and store the result in a new 'Cleaned_text' column.
df['Cleaned_text'] = df['TweetText'].apply(preprocessing)

# Split each cleaned tweet into a list of word tokens.
X_train = tokenizer(df)

# Train a Word2Vec model on the tokenized tweets.
word2vec = vectorizer(X_train)

# Embed the training sentences: one array of word vectors per tweet.
X_train_embed = embedding(word2vec, X_train)

# Pad the embedded training sentences at the end ('post') to a length of 200,
# stored as float32.
# NOTE(review): sentences longer than 200 are cut using pad_sequences' default
# `truncating` setting - confirm that is the intended side.
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=200)