# Milestone 1: preprocessing
___

In [1]:
import pandas as pd
import re
import emoji
import nltk
from nltk.corpus import stopwords
from collections import Counter
import stanza

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/henry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/henry/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
data = pd.read_csv('../data/edos_labelled_individual_annotations.csv')
print("Columns:", list(data.columns))
print("Shape:", data.shape)
data.head()

Columns: ['rewire_id', 'text', 'annotator', 'label_sexist', 'label_category', 'label_vector', 'split']
Shape: (60000, 7)


Unnamed: 0,rewire_id,text,annotator,label_sexist,label_category,label_vector,split
0,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,17,sexist,2. derogation,2.2 aggressive and emotive attacks,train
1,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,2,sexist,2. derogation,2.2 aggressive and emotive attacks,train
2,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,6,not sexist,none,none,train
3,sexism2022_english-1,"What do you guys think about female ""incels""? ...",17,not sexist,none,none,train
4,sexism2022_english-1,"What do you guys think about female ""incels""? ...",15,not sexist,none,none,train


The dataset contains a more fine-grained sexism detection, but we're working only with the `label_sexist`.

In [4]:
data = data.drop(columns=['label_category', 'label_vector'])
data.head()

Unnamed: 0,rewire_id,text,annotator,label_sexist,split
0,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,17,sexist,train
1,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,2,sexist,train
2,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,6,not sexist,train
3,sexism2022_english-1,"What do you guys think about female ""incels""? ...",17,not sexist,train
4,sexism2022_english-1,"What do you guys think about female ""incels""? ...",15,not sexist,train


### Exploratory analysis

In [5]:
print(f"There are: {len(data['annotator'].unique())} different annotators.")
print("Annotator IDs:", sorted(data['annotator'].unique()))

There are: 19 different annotators.
Annotator IDs: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18)]


Each of the 20000 unique comments was annotated by 3 different annotators. In 4444 cases, annotators reached a 2/3 agreement rather than full 3/3.

In [6]:
print(f"There are: {len(data['rewire_id'].unique())} different comments annotated in total.")
print("Minimum number of annotations for a comment:", data['rewire_id'].value_counts().min())
print("Maximum number of annotations for a comment:", data['rewire_id'].value_counts().max())

There are: 20000 different comments annotated in total.
Minimum number of annotations for a comment: 3
Maximum number of annotations for a comment: 3


In [7]:
unique_label_counts = data.groupby('rewire_id')['label_sexist'].nunique() # 1 (3/3 agreement) or 2 (2/3 agreement)
agreement_2_3_count = (unique_label_counts != 1).sum() # number of comments where annotators have 2/3 agreement

print("Number of 'rewire_id' entries (comments) with 2/3 agreement among annotators:", agreement_2_3_count)
print(unique_label_counts)

Number of 'rewire_id' entries (comments) with 2/3 agreement among annotators: 4444
rewire_id
sexism2022_english-0       2
sexism2022_english-1       1
sexism2022_english-10      2
sexism2022_english-100     1
sexism2022_english-1000    1
                          ..
sexism2022_english-9995    1
sexism2022_english-9996    2
sexism2022_english-9997    1
sexism2022_english-9998    2
sexism2022_english-9999    2
Name: label_sexist, Length: 20000, dtype: int64


### Aggregated dataset

**Important note:** From now on in the pre-processing phase, we'll focus solely on the aggregated dataset. Processing the same comments 3 times and storing them in CoNLL-U format is unnecessary (additionally, we only need a single class label for our binary classification task later on). According to the paper describing the dataset, cases with 2/3 annotator agreement were resolved by the dataset creators. (If the aggregated dataset wasn’t available, we would likely rely on majority voting.)

We'll keep in mind that 2/3 agreements exist and revisit this aspect during error analysis in milestone 2.

In [8]:
data_agg = pd.read_csv('../data/edos_labelled_aggregated.csv')
data_agg = data_agg.drop(columns=['label_category', 'label_vector'])
print("Columns:", list(data_agg.columns))
print("Shape:", data_agg.shape)
data_agg

Columns: ['rewire_id', 'text', 'label_sexist', 'split']
Shape: (20000, 4)


Unnamed: 0,rewire_id,text,label_sexist,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,dev
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,train
3,sexism2022_english-13021,woman?,not sexist,train
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,dev
...,...,...,...,...
19995,sexism2022_english-5228,girls really get fucked almost every weekend ?,not sexist,train
19996,sexism2022_english-10140,The hatred for moslems are Reasonable and Just...,not sexist,train
19997,sexism2022_english-9726,Now this is a woman who gets it. 👆,not sexist,train
19998,sexism2022_english-13365,“American Idol” finalist [USER] said nothing i...,not sexist,train


First, the train/test/dev split. Exploratory analysis will proceed on the combined training + validation sets.

In [9]:
# split the data_agg into train, dev and test sets based on the 'split' column
data_agg_train = data_agg[data_agg['split'] == 'train']
data_agg_dev = data_agg[data_agg['split'] == 'dev']
data_agg_test = data_agg[data_agg['split'] == 'test']

print("Number of training samples:", data_agg_train.shape[0]) 
print("Number of validation samples:", data_agg_dev.shape[0]) 
print("Number of test samples:", data_agg_test.shape[0]) 

assert data_agg_train.shape[0] + data_agg_dev.shape[0] + data_agg_test.shape[0] == data_agg.shape[0]

# drop 'split' column
data_agg_train = data_agg_train.drop('split', axis=1)
data_agg_dev = data_agg_dev.drop('split', axis=1)
data_agg_test = data_agg_test.drop('split', axis=1)

Number of training samples: 14000
Number of validation samples: 2000
Number of test samples: 4000


In [10]:
data_agg_train_val = pd.concat([data_agg_train, data_agg_dev], axis=0, ignore_index=True)
print("Shape of the training + validation set:", data_agg_train_val.shape)

Shape of the training + validation set: (16000, 3)


Conclusions drawn using regular expressions:

- `[URL]`, `[USER]` are placeholders used by dataset authors instead of actual URLs and real usernames
- female related nouns and pronouns are more frequent than male
- hashtags `#` often used
- huge amount of profanities

In [11]:
def count_patterns(pattern, data, top_k=20):
    return Counter(match for text in data.text for match in re.findall(pattern, text)).most_common(top_k)

In [12]:
count_patterns(r'\[[A-Z]+\]', data_agg_train_val) # catching: [USER], [URL]
# count_patterns(r'\b(she|her|wom[ae]n|female|girl|lady)\b', data_agg_train_val) # female related nouns, pronouns etc.
# count_patterns(r'\b(he|him|his|m[ae]n|male|boy|guy|dude)\b', data_agg_train_val) # male related nouns, pronouns etc.
# count_patterns(r'#\w+', data_agg_train_val) # hashtag
# count_patterns(r'\b(fuck|shit|damn|asshole|bitch|slut)\b', data_agg_train_val) # profanities

[('[URL]', 2004), ('[USER]', 1066), ('[DJT]', 1), ('[K]', 1)]

Emojis

One approach to identify emojis could be by looking for patterns that include `:` (e.g. we're targeting `:)`, `:(`, `:D`). However, modern text data often represents emojis differently, such as `ðŸ˜‰` for the 😉 (winking face) emoji. Python's `emoji` library is specifically designed for recognizing emojis in text and can replace them with descriptive names (e.g. `:winking_face:`), which is usefull for our application.

In [13]:
print("Example of the comment with emoji: \n", data_agg_train_val.iloc[0]['text'])
print(f"Info about emoji: {data_agg_train_val.iloc[0]['text'][-1]} \n {emoji.EMOJI_DATA[data_agg_train_val.iloc[0]['text'][-1]]}")

Example of the comment with emoji: 
 Then, she's a keeper. 😉
Info about emoji: 😉 
 {'en': ':winking_face:', 'status': 2, 'E': 0.6, 'alias': [':wink:']}


In [14]:
def count_emojis(data, top_k=10):
    return Counter([char for text in data.text for char in text if char in emoji.EMOJI_DATA]).most_common(top_k)

count_emojis(data_agg_train_val)

[('😂', 184),
 ('🤔', 28),
 ('🤣', 26),
 ('😁', 20),
 ('😊', 19),
 ('😄', 18),
 ('❤', 13),
 ('👍', 13),
 ('😰', 12),
 ('🙄', 11)]

In [15]:
def replace_emojis_with_description(text):
    return emoji.demojize(text)

data_agg_train_val['text'] = data_agg_train_val['text'].apply(replace_emojis_with_description)
data_agg_train_val.head()

Unnamed: 0,rewire_id,text,label_sexist
0,sexism2022_english-16993,"Then, she's a keeper. :winking_face:",not sexist
1,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist
2,sexism2022_english-13021,woman?,not sexist
3,sexism2022_english-14998,Unlicensed day care worker reportedly tells co...,not sexist
4,sexism2022_english-7228,[USER] Leg day is easy. Hot girls who wear min...,sexist


In [16]:
count_patterns(r':(.*?):', data_agg_train_val, top_k=10)

[('face_with_tears_of_joy', 183),
 ('thinking_face', 25),
 ('rolling_on_the_floor_laughing', 24),
 ('beaming_face_with_smiling_eyes', 20),
 ('grinning_face_with_smiling_eyes', 18),
 ('smiling_face_with_smiling_eyes', 18),
 ('United_States', 14),
 ('thumbs_up', 13),
 ('anxious_face_with_sweat', 12),
 ('face_with_rolling_eyes', 11)]

### Text normalization

What are the most common words in our concatenated text?

Examining the list of the most frequent words after tokenization and punctuation removal (`[]` were also highly frequent), we found that, alongside expected stopwords (`a`, `the` etc.), there is notable frequency of female-related nouns/pronouns (`her`, `she`, `women`).

In [17]:
words = [word for text in data_agg_train_val['text'] for word in nltk.word_tokenize(text)]
words = [word for word in words if re.match(r'\w', word)] # exclude punctuation
print("Total number of words found after tokenization and punctuation removal:", len(words))
print("Top 20 most common words:", Counter(words).most_common(20))

Total number of words found after tokenization and punctuation removal: 377897
Top 20 most common words: [('the', 10148), ('a', 9499), ('to', 9329), ('I', 6935), ('and', 6905), ('you', 5517), ('is', 5361), ('of', 5279), ('her', 4632), ('that', 4290), ('she', 3975), ('in', 3803), ('it', 3607), ('women', 3346), ('for', 3159), ("n't", 3149), ('are', 2940), ('with', 2674), ('be', 2449), ('on', 2412)]


Out of curiosity, let’s find the context surrounding the first few occurrences of some specific token of interest.

In [18]:
def find_contexts(token, data, context_size=50, limit=10):
    """
    Finds the first 'limit' occurrences of a given 'token' in dataframe 'data'
    with a 'text' column, capturing 'context_size' characters around each occurrence.
    Context: token position +- 'context_size' chars around.
    """
    contexts = []

    for text in data['text']:
        for match in re.finditer(rf'\b{token}\b', text):
            start, end = match.span()
            # get the context around the match
            context = text[max(0, start - context_size):min(len(text), end + context_size)] # +- context_size chars
            contexts.append(context)
            
            if len(contexts) >= limit:
                return contexts[:limit]

    return contexts[:limit]

find_contexts("women", data_agg_train_val)
# find_contexts("she", data_agg_train_val)
# find_contexts("yourself", data_agg_train_val)

['eed to stop referring to all men as kings and all women as queens, this shit is getting out of hand.',
 ' like 90% of people i know IRL it just so happens women are disgusted by me-and i dont even do it IRL ?',
 "or Assault and based on his claim that he's raped women, he should be investigated. His bragging could be",
 ' you have few options. You drop a few points with women doing that. A guy with options doesn’t give a shi',
 'The she-god mentality of women is going down the drain. Men make the world work ',
 "nt pussy. Sexbots will be realistic very soon and women's days of pussy rule will be over. &#x200B;",
 ' Europe (France, Germany etc.) is easier and that women do approach men. Here they never approach, even c',
 'Some hard core women haters use mgtow as an excuse, not a lifestyle.',
 'that does not reflect poorly on men nor well upon women. It just means she is extraordinary, each sex has',
 'Yeah, well, plenty of men have been hit by women without ever being able to hit bac

##### Stopword removal

`she`, `herself`, as well as `he`, `himself` etc. should be excluded from stopwords, because we assume that those pronouns provide relevant information for our task and preserving them will be beneficial.

In [19]:
stopwords_set = set(stopwords.words('english'))
stopwords_set = stopwords_set - {'she', 'she\'s', 'herself', 'her', 'hers', 'he', 'himself', 'him', 'his', 'yourself', 'yourselves', 'your', 'yours'}
print(sorted(stopwords_set))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'here', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them'

After stopword filtering and analyzing the most commonly occuring words, we can observe female-related nouns/pronounces, `URL`, `USER` tokens, as well as some profanities.

In [20]:
words = [word.lower() for word in words if word.lower() not in stopwords_set]
print("Total number of words found after tokenization, punctuation removal and stopword filtering:", len(words))
print("Top 20 most common words:", Counter(words).most_common(20))

Total number of words found after tokenization, punctuation removal and stopword filtering: 213597
Top 20 most common words: [('she', 4884), ('her', 4749), ('women', 3580), ("n't", 3165), ('url', 2006), ('like', 1942), ('your', 1657), ('he', 1512), ('get', 1369), ('woman', 1363), ('would', 1253), ('men', 1236), ('user', 1073), ('one', 936), ('girls', 875), ('girl', 875), ('fuck', 874), ('his', 846), ('want', 799), ('think', 784)]


##### Lemmatization

The process of reducing words to their base or dictionary form (lemma). We prefer lemmatization to stemming because it considers the word's context, making the reduction process more accurate. 

Let's just show one example...

In [21]:
nlp = stanza.Pipeline('en', processors='tokenize,lemma,pos') 

2024-11-09 20:30:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 37.8MB/s]                    
2024-11-09 20:30:37 INFO: Downloaded file to /home/henry/stanza_resources/resources.json
2024-11-09 20:30:37 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-09 20:30:37 INFO: Using device: cuda
2024-11-09 20:30:37 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-09 20:30:38 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-09 

In [22]:
text = data_agg_train_val.iloc[1]['text']
doc = nlp(text) # process the text with the pipeline

for sentence in doc.sentences[:1]:
    for word in sentence.words[:5]: # show the 1st 5 words of the 1st sentence
        print('\t'.join([word.text, word.lemma, word.upos, word.feats if word.feats else '']))
    print() 

This	this	PRON	Number=Sing|PronType=Dem
is	be	AUX	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
like	like	ADP	
the	the	DET	Definite=Def|PronType=Art
Metallica	Metallica	PROPN	Number=Sing



### Exporting in the CoNLL-U format


In this part, we are converting the dataset into CoNLL-U format, which is commonly used for linguistic datasets. The CoNLL-U format lets us organize each token in a sentence with additional information, like its lemma, part-of-speech tag, and syntactic dependencies. Using Stanza’s NLP pipeline, we first preprocess every row in the dataset and then annotate it to include these features. In the function sentence_to_conllu_format, we turn each sentence into CoNLL-U structure and add some metadata for the label_sexist.

After every row is converted, we use write_to_conllu function to save each dataset split (train, dev, test) as its own CoNLL-U file. This format is flexible and compatible with multiple NLP tools, which means we can use it in different steps of NLP processing. It also keeps detailed info for each token, which can be very useful for later model training and analysis.

This code was executed on Google Colab using the GPU instance A100

In [23]:
import re
import logging
import pandas as pd
import stanza
from stanza.utils.conll import CoNLL
from tqdm import tqdm
import emoji
import nltk
from nltk.corpus import stopwords

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Initialize custom stopwords
stopwords_set = set(stopwords.words('english'))
stopwords_set -= {'she', 'she\'s', 'herself', 'her', 'hers', 'he', 'himself', 'him', 'his',
                  'yourself', 'yourselves', 'your', 'yours'}
logger.info("Customized stopwords list: %s", sorted(stopwords_set))

# Download and initialize Stanza English pipeline
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,lemma,pos,depparse')

# Load dataset
df = pd.read_csv('../data/edos_labelled_aggregated.csv')


class TextProcessingPipeline:
    """Pipeline for processing text and converting it to CoNLL-U format."""

    @staticmethod
    def replace_emojis_with_description(text):
        """Replace emojis in text with their descriptions."""
        return emoji.demojize(text)

    @staticmethod
    def remove_user_mentions_and_urls(text):
        """Remove user mentions and URLs and from text."""
        text = re.sub(r'\[[A-Z]+\]', '', text)
        return text

    @staticmethod
    def clean_text(text):
        """Clean text by converting to lowercase"""
        text = text.lower()
        text = re.sub(r'[^\w\s.,!?\'"]+', '', text)
        return text

    @staticmethod
    def remove_stopwords(text):
        """Remove stopwords from text."""
        words = nltk.word_tokenize(text)
        filtered_words = [word for word in words if word not in stopwords_set]
        return ' '.join(filtered_words)

    def preprocess_text(self, text):
        """Apply all preprocessing steps to text."""
        text = self.replace_emojis_with_description(text)
        text = self.remove_user_mentions_and_urls(text)
        text = self.clean_text(text)
        text = self.remove_stopwords(text)
        return text

    def sentence_to_conllu_format(self, row):
        """Convert a sentence row to CoNLL-U format."""
        text = self.preprocess_text(row['text'])
        sentence_id = row['rewire_id']
        label_sexist = row['label_sexist']

        doc = nlp(text)

        conllu_format = [f"# sent_id = {sentence_id}",
                         f"# label_sexist = {label_sexist}"]

        for sentence in CoNLL.convert_dict(doc.to_dict()):
            for token in sentence:
                conllu_format.append('\t'.join(str(field) for field in token))

        return '\n'.join(conllu_format)

    def write_to_conllu(self, df, output_file):
        """Write the dataframe to a CoNLL-U formatted file."""
        total_rows = len(df)
        with open(output_file, 'w') as f:
            for _, row in tqdm(df.iterrows(), total=total_rows, desc="Processing rows", ncols=100, leave=True):
                conllu_sentence = self.sentence_to_conllu_format(row)
                f.write(conllu_sentence + '\n\n')
        logger.info("File saved: %s", output_file)


# Instantiating the pipeline
pipeline = TextProcessingPipeline()

# Splitting the data based on 'split' column
data_agg_train = df[df['split'] == 'train'].drop('split', axis=1)
data_agg_dev = df[df['split'] == 'dev'].drop('split', axis=1)
data_agg_test = df[df['split'] == 'test'].drop('split', axis=1)

# Logging the number of samples in each split
logger.info("Number of training samples: %d", data_agg_train.shape[0])
logger.info("Number of validation samples: %d", data_agg_dev.shape[0])
logger.info("Number of test samples: %d", data_agg_test.shape[0])

# Writing each split to a separate CoNLL-U file
pipeline.write_to_conllu(data_agg_train, 'train_sexism_dataset_conllu.conllu')
pipeline.write_to_conllu(data_agg_dev, 'dev_sexism_dataset_conllu.conllu')
pipeline.write_to_conllu(data_agg_test, 'test_sexism_dataset_conllu.conllu')

logger.info(
    "Preprocessed datasets saved as 'train_sexism_dataset_conllu.conllu', 'dev_sexism_dataset_conllu.conllu', and 'test_sexism_dataset_conllu.conllu'.")


[nltk_data] Downloading package punkt to /home/henry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/henry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-11-09 20:30:39,090 - INFO - Customized stopwords list: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'here', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 