In [None]:
import pandas as pd
import numpy as np

#  1. Exploratory Data Analysis

## Context
As said above, we work on a dataset of memes (an image plus the text it contains) and their meta information, a proportion of which are hateful; these can be identified by the column "label".

## Data files
train.jsonl - the training set
dev.jsonl - the development set
test.jsonl - the test set

### Columns
* id:  Meme id
* img: Meme image file
* text: A string representing the text in the meme image
* label: Binary label of the meme: 1 if the meme is hateful, 0 otherwise.

In [None]:
# Read the development split (JSON Lines: one JSON record per line) and preview its first rows.
pd_memes_dev = pd.read_json(path_or_buf='../data/dev.jsonl', lines=True)
pd_memes_dev.head(5)

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime


In [None]:
# Partition the dev split by label: 0 -> normal memes, 1 -> hateful memes.
pd_memes_normal = pd_memes_dev.loc[pd_memes_dev['label'].eq(0)]
pd_memes_hateful = pd_memes_dev.loc[pd_memes_dev['label'].eq(1)]

# 2. Word Exploratory Data Analysis

In [None]:
# Count how many memes have a missing 'text' value (True) versus a present one (False).
pd.isna(pd_memes_dev['text']).value_counts()

False    500
Name: text, dtype: int64

In [None]:
def series_to_str(series_column):
    '''Concatenate the values of a series into a single space-separated string.

    Parameters
    ----------
    series_column : pandas.Series or iterable
        The values to concatenate, in iteration order.

    Returns
    -------
    str
        All non-missing values joined by a single space; an empty string
        when there is nothing to join.

    Notes
    -----
    Missing entries (None / NaN / pd.NA) are skipped and non-string values
    are converted with ``str`` so that a single bad cell does not make
    ``str.join`` raise ``TypeError``.
    '''
    return ' '.join(
        str(value) for value in series_column if not pd.isna(value)
    )
    
# def clean_punctuation(str_text_raw):
#     '''This function replaces some of the troublesome punctuation characters in a given text with spaces'''
#     return(re.sub('[$\(\)/|{|\}#~\[\]^#;:!?¿]', ' ', str_text_raw))

# def clean_unicode(str_text_raw):
#     '''This function removes '&amp;' entities and replaces non-ASCII characters with spaces'''
#     str_text = re.sub('&amp;', '', str_text_raw)
#     return(re.sub(r'[^\x00-\x7F]+',' ', str_text))
                      
# def clean_dot_words(str_text_raw):
#     '''This function replace the dots between words'''
#     return(re.sub(r'(\w+)\.+(\w+)', r'\1 \2',str_text_raw))

# def clean_text(str_text_raw):
#     '''This function lowercases and cleans a given raw text string'''
#     str_text = str_text_raw.lower()
#     str_text = clean_dot_words(clean_punctuation(clean_unicode(clean_url(str_text))))
#     return(str_text)

In [None]:
# Concatenate the meme texts of the whole dev split and of each label subset.
str_text_total = series_to_str(pd_memes_dev['text'])
str_text_normal = series_to_str(pd_memes_normal['text'])
str_text_hateful = series_to_str(pd_memes_hateful['text'])

# Report the length (in characters) of each concatenated string, one per line.
print(
    f'Total bow with lenght: {len(str_text_total)}',
    f'Total normal memes with lenght: {len(str_text_normal)}',
    f'Total hateful memes with lenght: {len(str_text_hateful)}',
    sep='\n',
)

Total bow with lenght: 27644
Total normal memes with lenght: 13610
Total hateful memes with lenght: 14033


In [None]:
# Build a word cloud from the lemma frequencies, save it to disk and display it.
# NOTE(review): list_to_bow, pd_token, WordCloud and plt are not defined in the
# visible cells — confirm they are defined/imported earlier in the notebook.
dict_word_freq_lemma = list_to_bow(list(pd_token['lemma']))
wordcloud = WordCloud(width = 1000, height = 500, normalize_plurals=True).generate_from_frequencies(dict_word_freq_lemma)
plt.figure(figsize=(20,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.title("Most frequent words'lemmas", fontsize=25)
# Save before showing so the file is written from the fully drawn figure.
plt.savefig("images/output_images/lemmasWC.png")
# plt.show must be *called*; the bare attribute `plt.show` is a no-op expression.
plt.show()

'white people is this a shooting range bravery at its finest your order comes to $37.50 and your white privilege discount brings the total to $37.50 it is time.. to send these parasites back to the desert mississippi wind chime knowing white people , that\'s probably the baby father life hack #23 how to get stoned with no weed you\'ve heard of elf on a shelf, now get ready for cooooooooooooon!!!! when you get to choose your own mental illness dat ass ahmarbkrich*d look! it says it right here! we can fuck goats! enough is enough children are more important than freaks removes polish with chemicals nobody bats an eye removes polish with chemicals everybody loses his shit when you have an 80 point iq but it\'s the highest in your village my sense of humor is so dark it picks cotton if this offends you you might be lack toes intolerant yes, we know trump said "mexico will pay for the wall" but now mexicans are the wall. that boy good!!!! they don\'t wash their ass with water and they eat t

In [None]:
def clean_urls(column):
    '''
    Lower-case every text in a column and strip the URLs it contains.

    Parameters
    ----------
    column : pandas.Series
        Series of strings (e.g. a DataFrame text column).

    Returns
    -------
    pandas.Series
        A new Series, aligned with ``column``, where each text is in
        lowercase and every ``http://`` / ``https://`` URL (up to the next
        whitespace) has been removed. Missing values are propagated as
        missing instead of raising.
    '''
    # Raw string: '\S' is a regex class, not a Python string escape.
    url_pattern = r'http[s]?://\S+'
    # The vectorised .str accessor replaces the two chained .apply(lambda ...)
    # calls and does not need the `re` module in scope.
    return column.str.lower().str.replace(url_pattern, '', regex=True)



# Overwrite the 'text' column with its lower-cased, URL-free version (see clean_urls).
# NOTE(review): `text_columns` is not defined in the visible cells — confirm it is
# created earlier in the notebook; this assignment mutates that frame in place.
text_columns['text'] = clean_urls(text_columns['text'])