In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string
import re
import os
import sys

# import tensorflow
# import keras
# import h5py
# import progressbar

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter
from wordcloud import WordCloud

from pickle import load
import numpy as np
import argparse

# Root of the project. NOTE(review): absolute, machine-specific path; every other
# PATH_* constant below is derived from it.
PATH_CURRENT = '/home/jupyter/meme_hateful_detection'
# Presumably the folder with the raw meme images; not referenced by the cells of this notebook.
PATH_TRAIN_MEMES = f'{PATH_CURRENT}/data/raw/facebook_memes'
# Folder containing the train/dev/test .jsonl annotation files loaded by this notebook.
PATH_MEMES_DATASET = f'{PATH_CURRENT}/data/raw/datasets/hateful_memes/defaults/annotations'
# Folder where this notebook pickles the cleaned frames.
PATH_INTERIM = f'{PATH_CURRENT}/data/interim'

import warnings
# Suppresses every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

In [None]:
# Text pre-processing hyper-parameters.
MAX_NB_WORDS = 3_000_000     # upper bound on the number of words for the tokenizer
MAX_SEQUENCE_LENGTH = 200    # length of each entry (sentence) once padding is included
VALIDATION_SPLIT = 0.2       # share of the data kept for validation (not used in training)
EMBEDDING_DIM = 100          # dimensionality of the word vectors (word2vec/GloVe)
# Despite the "_DIR" suffix, this is the path of the GloVe vectors file matching EMBEDDING_DIM.
GLOVE_DIR = f"glove/glove.6B.{EMBEDDING_DIM}d.txt"

In [None]:
# Complete English pipeline with every component enabled.
spacy_nlp = spacy.load('en_core_web_sm')

# Stop-word set bundled with spaCy's English language data.
stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Lighter pipeline (dependency parser and NER disabled); this is the one used by the
# tokenising / cleaning helpers of this notebook.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)
# spaCy documents `max_length` as a limit on the number of *characters* of a text,
# so MAX_NB_WORDS acts here as a character budget for the concatenated corpus.
nlp.max_length = MAX_NB_WORDS

#  1. Exploratory Data Analysis

## Context
This notebook explores the Hateful Memes dataset: each record describes a meme (its image file and the text written on it), and the hateful memes are identified by the column "label".

## Data files
* train.jsonl - the training set
* dev.jsonl - the development set
* test.jsonl - the test set

### Columns
* id:  Meme id
* img: Meme image file
* text: A string representing the text in the meme image
* label: Class of the meme: 0 = not hateful, 1 = hateful (the two values this notebook filters on).

In [None]:
# Development split: one JSON record per line of dev.jsonl.
df_memes_dev = pd.read_json(
    f'{PATH_MEMES_DATASET}/dev.jsonl',
    orient='records',
    lines=True,
)
df_memes_dev.head()

In [None]:
# Training split: one JSON record per line of train.jsonl.
df_memes_train = pd.read_json(
    f'{PATH_MEMES_DATASET}/train.jsonl',
    orient='records',
    lines=True,
)
df_memes_train.head()

In [None]:
# Test split: one JSON record per line of test.jsonl.
df_memes_test = pd.read_json(
    f'{PATH_MEMES_DATASET}/test.jsonl',
    orient='records',
    lines=True,
)
df_memes_test.head()

In [None]:
# Partition the training annotations by class: label 0 -> normal memes, label 1 -> hateful memes.
df_memes_normal = df_memes_train.loc[df_memes_train['label'].eq(0)]
df_memes_hateful = df_memes_train.loc[df_memes_train['label'].eq(1)]

# 2. Word Exploratory Data Analysis

In [None]:
# Count the rows of the test split whose `text` is missing (True) versus present (False).
df_memes_test['text'].isna().value_counts()

In [None]:
# Number of non-null values in each column of the test split.
df_memes_test.count()

In [None]:
def series_to_str(series_column):
    '''
    Concatenate the values of a series into a single cleaned string.

    The values are joined with one space between them and the result is passed
    through `clean_text` before being returned.
    '''
    return clean_text(' '.join(series_column))
    
def clean_text(str_text_raw):
    '''Return the given text converted to lower case.'''
    return str_text_raw.lower()
  
def clean_image_path(str_image):
    '''
    Return the image path with every occurrence of 'img/' removed
    (e.g. 'img/01235.png' becomes '01235.png').
    '''
    # Splitting on the marker and gluing the pieces back together drops each occurrence.
    return ''.join(str_image.split('img/'))

In [None]:
def string_to_token(string, str_pickle = None):
    '''
    Tokenize a text with the spaCy pipeline `nlp` and return one row per kept token.

    A token is discarded when it is punctuation, whitespace, a stop word, a digit,
    URL-like, number-like, or both e-mail-like and out-of-vocabulary.

    Parameters
    * string: The text to tokenize.
    * str_pickle: Optional name (without extension). When given, the resulting frame is
      also written to '../data/pickles/{str_pickle}.pkl'; that folder must already exist.

    Returns a DataFrame with the columns
    * text: The original text of the token.
    * lemme: The lemma of the token (token.lemma_).
    * orth: The hash value of the token text (token.orth).
    * is_alpha: Does the token consist of alphabetic characters?
    * is_digit: Does the token consist of digits? (Always False here, digits are filtered out.)
    * is_title: Is the token in titlecase?
    * language: Language of the token (token.lang_).
    * tag: Fine-grained part-of-speech. The complete list is in:
      https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html, also using: spacy.explain("RB")
    * part_of_speech: Coarse-grained part-of-speech (token.pos_).
    * has_vector: A boolean value indicating whether a word vector is associated with the token.
    * vector_norm: The L2 norm of the token's vector representation.
    * is_oov: Is the token out-of-vocabulary?
    * text_to_singular: The lemma for plural nouns (tags NNS / NNPS), the original text otherwise.
    '''
    doc = nlp(string)
    l_token = [
        [token.text, token.lemma_, token.orth, token.is_alpha, token.is_digit, token.is_title, token.lang_,
         token.tag_, token.pos_, token.has_vector, token.vector_norm, token.is_oov]
        for token in doc
        # `&` binds tighter than `|`, so the e-mail test only applies to out-of-vocabulary
        # tokens. The parentheses make the evaluation order that was always in effect explicit.
        # NOTE(review): confirm that keeping in-vocabulary e-mail-like tokens is intended.
        if not (token.is_punct | token.is_space | token.is_stop | token.is_digit | token.like_url
                | token.like_num | (token.like_email & token.is_oov))
    ]
    df_token = pd.DataFrame(l_token, columns=['text', 'lemme', 'orth', 'is_alpha', 'is_digit', 'is_title', 'language',
                                              'tag', 'part_of_speech', 'has_vector', 'vector_norm', 'is_oov'])
    # Convert plural text to singular
    df_token['text_to_singular'] = np.where(df_token['tag'].isin(['NNPS', 'NNS']), df_token['lemme'], df_token['text'])
    if str_pickle is not None:
        df_token.to_pickle(f'../data/pickles/{str_pickle}.pkl')
    return df_token

def list_to_bow(l_words):
    '''
    Build the word-frequency table of a list of words, most frequent first.

    The list is also vectorised with CountVectorizer, but only to report the shape of the
    resulting bag-of-words matrix; the returned frequencies come from a Counter over
    `l_words`.

    Parameters
    * l_words: list of words (strings).

    Returns a dict {word: count} ordered by descending count.
    '''
    # CountVectorizer's constructor only takes keyword configuration; the documents are
    # handed to fit_transform. Passing `l_words` positionally (as before) binds it to the
    # `input` option on old scikit-learn releases and raises TypeError on recent ones.
    cv = CountVectorizer()
    count_vector = cv.fit_transform(l_words)
    word_freq = Counter(l_words)
    print(f'Bag of words size: {count_vector.shape}\nUnique words size: {len(word_freq)}')
    return dict(word_freq.most_common())

def create_wordcloud(dict_words, b_plot=False):
    '''
    Build a word cloud from a {word: frequency} mapping and optionally draw it.

    Parameters
    * dict_words: mapping of word -> frequency, passed to generate_from_frequencies.
    * b_plot: when true, the cloud is also rendered in a 20x8 matplotlib figure.

    Returns the generated WordCloud object.
    '''
    wordcloud = WordCloud(width = 1000, height = 500, normalize_plurals=True).generate_from_frequencies(dict_words)
    if b_plot:
        plt.figure(figsize=(20,8))
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.title("Most lemma words", fontsize=25)
        # The original referenced `plt.show` without calling it, which does nothing.
        plt.show()
    return wordcloud

def apply_cleaning(string):
    '''
    Lower-case a sentence, run it through the spaCy pipeline `nlp` and return the text of
    the kept tokens joined by single spaces.

    A token is discarded when it is punctuation, whitespace, a stop word, a digit,
    URL-like, number-like, or both e-mail-like and out-of-vocabulary.
    '''
    kept_words = []
    for token in nlp(clean_text(string)):
        is_noise = (
            token.is_punct | token.is_space | token.is_stop | token.is_digit
            | token.like_url | token.like_num
            # `&` binds tighter than `|`: the e-mail test is combined with is_oov first.
            | (token.like_email & token.is_oov)
        )
        if not is_noise:
            kept_words.append(token.text)
    return ' '.join(kept_words)

def apply_lemma(string):
    '''
    Lower-case a sentence, run it through the spaCy pipeline `nlp` and return the lemmas
    of the kept tokens joined by single spaces.

    A token is discarded when it is punctuation, whitespace, a stop word, a digit,
    URL-like, number-like, or both e-mail-like and out-of-vocabulary.
    '''
    kept_lemmas = []
    for token in nlp(clean_text(string)):
        is_noise = (
            token.is_punct | token.is_space | token.is_stop | token.is_digit
            | token.like_url | token.like_num
            # `&` binds tighter than `|`: the e-mail test is combined with is_oov first.
            | (token.like_email & token.is_oov)
        )
        if not is_noise:
            kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [None]:
# Concatenate (and lower-case) the meme texts: whole training split, then each class.
str_text_total_clean = series_to_str(df_memes_train['text'])
str_text_total_normal = series_to_str(df_memes_normal['text'])
str_text_total_hateful = series_to_str(df_memes_hateful['text'])

# The reported sizes are character counts of the concatenated strings.
print(f'Total bow with lenght: {len(str_text_total_clean)}')
print(f'Total normal memes with lenght: {len(str_text_total_normal)}')
print(f'Total hateful memes with lenght: {len(str_text_total_hateful)}')

In [None]:
# Word cloud of the lemmas found in the whole training split.
df_token = string_to_token(str_text_total_clean)
dict_word_freq_lemme = list_to_bow(list(df_token['lemme']))
# The cloud is generated once; the original built it with create_wordcloud and then
# immediately rebuilt the identical cloud by hand.
wordcloud = create_wordcloud(dict_word_freq_lemme)
plt.figure(figsize=(20,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.title("Most lemma words", fontsize=25)
# Call plt.show(): the bare `plt.show` used before only referenced the function.
plt.show()

In [None]:
# Word cloud of the lemmas found in the normal (label 0) training memes.
df_token = string_to_token(str_text_total_normal)
dict_word_freq_lemme = list_to_bow(list(df_token['lemme']))
# create_wordcloud builds the cloud with the same settings that were spelled out inline here.
wordcloud = create_wordcloud(dict_word_freq_lemme)
plt.figure(figsize=(20,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.title("Most lemma words", fontsize=25)
# Call plt.show(): the bare `plt.show` used before only referenced the function.
plt.show()

In [None]:
# Word cloud of the lemmas found in the hateful (label 1) training memes.
df_token = string_to_token(str_text_total_hateful)
dict_word_freq_lemme = list_to_bow(list(df_token['lemme']))
# create_wordcloud builds the cloud with the same settings that were spelled out inline here.
wordcloud = create_wordcloud(dict_word_freq_lemme)
plt.figure(figsize=(20,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.title("Most lemma words", fontsize=25)
# Call plt.show(): the bare `plt.show` used before only referenced the function.
plt.show()

In [None]:
# For every split: drop 'img/' from the image path, then add two derived text columns,
# `text_clean` (kept tokens) and `text_lemma` (lemmas of the kept tokens).
# apply_cleaning and apply_lemma each run the spaCy pipeline, so every text is parsed twice.

# Development split
df_memes_dev['img'] = df_memes_dev['img'].apply(clean_image_path)
df_memes_dev['text_clean'] = df_memes_dev['text'].apply(apply_cleaning)
df_memes_dev['text_lemma'] = df_memes_dev['text'].apply(apply_lemma)

# Training split
df_memes_train['img'] = df_memes_train['img'].apply(clean_image_path)
df_memes_train['text_clean'] = df_memes_train['text'].apply(apply_cleaning)
df_memes_train['text_lemma'] = df_memes_train['text'].apply(apply_lemma)

# Test split
df_memes_test['img'] = df_memes_test['img'].apply(clean_image_path)
df_memes_test['text_clean'] = df_memes_test['text'].apply(apply_cleaning)
df_memes_test['text_lemma'] = df_memes_test['text'].apply(apply_lemma)

df_memes_dev.head()

In [None]:
# Persist the dev, train and test frames as pickle files under PATH_INTERIM
# (the folder must already exist).
df_memes_dev.to_pickle(f'{PATH_INTERIM}/df_memes_dev_clean.pkl') # development split
df_memes_train.to_pickle(f'{PATH_INTERIM}/df_memes_train_clean.pkl') # training split
df_memes_test.to_pickle(f'{PATH_INTERIM}/df_memes_test_clean.pkl') # test split