# Milestone 1: preprocessing
___

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [11]:
# Tokenizer models required by nltk.word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword corpus required by stopwords.words('english') in the stopword-removal
# section; without it a fresh environment fails there with a LookupError.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sofija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Sofija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Read the individual-annotations file and take a first look at its columns and size.
annotations_path = '../data/edos_labelled_individual_annotations.csv'
data = pd.read_csv(annotations_path)
print("Columns:", data.columns.tolist())
print("Shape:", data.shape)
data.head()

Columns: ['rewire_id', 'text', 'annotator', 'label_sexist', 'label_category', 'label_vector', 'split']
Shape: (60000, 7)


Unnamed: 0,rewire_id,text,annotator,label_sexist,label_category,label_vector,split
0,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,17,sexist,2. derogation,2.2 aggressive and emotive attacks,train
1,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,2,sexist,2. derogation,2.2 aggressive and emotive attacks,train
2,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,6,not sexist,none,none,train
3,sexism2022_english-1,"What do you guys think about female ""incels""? ...",17,not sexist,none,none,train
4,sexism2022_english-1,"What do you guys think about female ""incels""? ...",15,not sexist,none,none,train


The dataset also contains more fine-grained sexism labels (`label_category`, `label_vector`), but we work only with `label_sexist`.

In [3]:
# Keep only `label_sexist`; the fine-grained label columns are not used here.
# errors='ignore' makes this cell safe to re-run: `data` is rebound from itself, so a
# second execution would otherwise raise KeyError because the columns are already gone.
data = data.drop(columns=['label_category', 'label_vector'], errors='ignore')
data.head()

Unnamed: 0,rewire_id,text,annotator,label_sexist,split
0,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,17,sexist,train
1,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,2,sexist,train
2,sexism2022_english-0,[USER] I wonder what keeps that witch looking ...,6,not sexist,train
3,sexism2022_english-1,"What do you guys think about female ""incels""? ...",17,not sexist,train
4,sexism2022_english-1,"What do you guys think about female ""incels""? ...",15,not sexist,train


### Exploratory analysis

In [4]:
# Distinct annotator IDs, computed once and reused for both the count and the listing.
annotator_ids = data['annotator'].unique()
print(f"There are: {len(annotator_ids)} different annotators.")
print("Annotator IDs:", sorted(annotator_ids))

There are: 19 different annotators.
Annotator IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]


Each of the 20000 unique comments was annotated by 3 different annotators. In 4444 cases, annotators reached a 2/3 agreement rather than full 3/3.

In [5]:
# Count the distinct comments and how many annotation rows each of them has.
comment_ids = data['rewire_id']
annotations_per_comment = comment_ids.value_counts()
print(f"There are: {len(comment_ids.unique())} different comments annotated in total.")
print("Minimum number of annotations for a comment:", annotations_per_comment.min())
print("Maximum number of annotations for a comment:", annotations_per_comment.max())

There are: 20000 different comments annotated in total.
Minimum number of annotations for a comment: 3
Maximum number of annotations for a comment: 3


In [6]:
# Number of distinct labels each comment received:
# 1 -> all annotators agree (3/3), 2 -> the annotators are split (2/3).
labels_by_comment = data.groupby('rewire_id')['label_sexist']
unique_label_counts = labels_by_comment.nunique()

# Comments whose annotators did not all give the same label.
agreement_2_3_count = unique_label_counts.ne(1).sum()

print("Number of 'rewire_id' entries (comments) with 2/3 agreement among annotators:", agreement_2_3_count)
print(unique_label_counts)

Number of 'rewire_id' entries (comments) with 2/3 agreement among annotators: 4444
rewire_id
sexism2022_english-0       2
sexism2022_english-1       1
sexism2022_english-10      2
sexism2022_english-100     1
sexism2022_english-1000    1
                          ..
sexism2022_english-9995    1
sexism2022_english-9996    2
sexism2022_english-9997    1
sexism2022_english-9998    2
sexism2022_english-9999    2
Name: label_sexist, Length: 20000, dtype: int64


### Aggregated dataset

In [7]:
# Load the aggregated file and discard the fine-grained label columns in one step.
data_agg = (
    pd.read_csv('../data/edos_labelled_aggregated.csv')
    .drop(columns=['label_category', 'label_vector'])
)
print("Columns:", data_agg.columns.tolist())
print("Shape:", data_agg.shape)
data_agg

Columns: ['rewire_id', 'text', 'label_sexist', 'split']
Shape: (20000, 4)


Unnamed: 0,rewire_id,text,label_sexist,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,dev
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,train
3,sexism2022_english-13021,woman?,not sexist,train
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,dev
...,...,...,...,...
19995,sexism2022_english-5228,girls really get fucked almost every weekend ?,not sexist,train
19996,sexism2022_english-10140,The hatred for moslems are Reasonable and Just...,not sexist,train
19997,sexism2022_english-9726,Now this is a woman who gets it. 👆,not sexist,train
19998,sexism2022_english-13365,“American Idol” finalist [USER] said nothing i...,not sexist,train


Conclusions drawn using regular expressions:

- `[URL]`, `[USER]` are placeholders used by dataset authors instead of actual URLs and real usernames
- female-related nouns and pronouns are more frequent than male-related ones
- hashtags (`#`) are often used
- there is a large number of profanities

In [8]:
def count_patterns(pattern, data):
    """
    Tallies every match of the regular expression 'pattern' over the texts in 'data.text'.

    Returns a list of (match, count) pairs ordered from most to least frequent,
    as produced by Counter.most_common().
    """
    tally = Counter()
    for document in data.text:
        # re.findall yields all non-overlapping matches found in this document
        tally.update(re.findall(pattern, document))
    return tally.most_common()

In [33]:
# Only one pattern is counted per run; uncomment one of the alternatives below
# (and comment out the active call) to inspect a different pattern.
# Patterns are passed to re.findall as written, i.e. they are case-sensitive.
count_patterns(r'\[[A-Z]+\]', data_agg) # catching: [USER], [URL]
# count_patterns(r'\b(she|her|wom[ae]n|female|girl|lady)\b', data_agg) # female related nouns, pronouns etc.
# count_patterns(r'\b(he|him|his|m[ae]n|male|boy|guy|dude)\b', data_agg) # male related nouns, pronouns etc.
# count_patterns(r'#\w+', data_agg) # hashtag
# count_patterns(r'\b(fuck|shit|damn|asshole|bitch|slut)\b', data_agg) # profanities

[('[URL]', 2478), ('[USER]', 1355), ('[K]', 1), ('[DJT]', 1)]

### Text normalization

What are the most common words in our concatenated text?

Examining the list of the most frequent words after tokenization and punctuation removal (`[` and `]` were also highly frequent), we found that, alongside the expected stopwords (`a`, `the` etc.), there is a notable frequency of female-related nouns/pronouns (`her`, `she`, `women`).

In [25]:
# Tokenize every comment, then keep only the tokens that begin with a word character
# (this is what removes the pure-punctuation tokens).
tokens_per_text = data_agg['text'].map(nltk.word_tokenize)
words = [token for tokens in tokens_per_text for token in tokens if re.match(r'\w', token)]
word_frequencies = Counter(words)
print("Total number of words found after tokenization and punctuation removal:", len(words))
print("Top 20 most common words:", word_frequencies.most_common(20))

Total number of words found after tokenization and punctuation removal: 470476
Top 20 most common words: [('the', 12658), ('a', 11891), ('to', 11636), ('I', 8603), ('and', 8579), ('you', 6842), ('is', 6714), ('of', 6547), ('her', 5799), ('that', 5306), ('she', 4908), ('in', 4742), ('it', 4509), ('women', 4146), ("n't", 3965), ('for', 3918), ('are', 3710), ('with', 3373), ('be', 3029), ('on', 3011)]


Out of curiosity, let’s find the context surrounding the first few occurrences of some specific token of interest.

In [38]:
def find_contexts(token, data, context_size=50, limit=10):
    """
    Finds the first 'limit' occurrences of a given 'token' in dataframe 'data'
    with a 'text' column, capturing 'context_size' characters around each occurrence.
    Context: token position +- 'context_size' chars around.

    The token is matched literally (regex metacharacters are escaped) and only as a
    whole token, i.e. when it is not directly preceded or followed by a word character.

    Parameters:
        token: string to search for, e.g. "women" or "[USER]".
        data: object whose data['text'] yields the texts to scan, in order.
        context_size: number of characters kept on each side of the match.
        limit: maximum number of contexts to return.

    Returns:
        List of at most 'limit' context strings, in order of occurrence.
    """
    if limit <= 0:
        return []

    # re.escape: a token such as "[USER]" must not be read as a character class.
    # Lookarounds instead of \b: \b only behaves as a token boundary when the token
    # itself starts and ends with a word character, so it would miss a "[USER]"
    # that is surrounded by spaces. For plain words both forms match the same spans.
    token_pattern = re.compile(rf'(?<!\w){re.escape(token)}(?!\w)')
    contexts = []

    for text in data['text']:
        for match in token_pattern.finditer(text):
            start, end = match.span()
            # get the context around the match, clamped to the bounds of the text
            context = text[max(0, start - context_size):min(len(text), end + context_size)] # +- context_size chars
            contexts.append(context)

            # stop scanning as soon as enough contexts have been collected
            if len(contexts) >= limit:
                return contexts

    return contexts

# Show the first contexts (default limit and context size) for "women";
# swap in the commented call to inspect "she" instead.
find_contexts("women", data_agg)
# find_contexts("she", data_agg)

['e begins to hit the wall and some hotter, younger women enters the picture, it’s time for impulse control',
 'eed to stop referring to all men as kings and all women as queens, this shit is getting out of hand.',
 ' like 90% of people i know IRL it just so happens women are disgusted by me-and i dont even do it IRL ?',
 "or Assault and based on his claim that he's raped women, he should be investigated. His bragging could be",
 'Yes, normal women want to be dominated. Social scientists call it f',
 'cientists call it female hypergamy. And it is why women make bad bosses for men, generally speaking.',
 ' you have few options. You drop a few points with women doing that. A guy with options doesn’t give a shi',
 'The she-god mentality of women is going down the drain. Men make the world work ',
 "nt pussy. Sexbots will be realistic very soon and women's days of pussy rule will be over. &#x200B;",
 ' Europe (France, Germany etc.) is easier and that women do approach men. Here they never 

##### Stopword removal

`she`, `she's` and `herself` are removed from the stopword list (i.e. kept in the text), because we assume that these pronouns provide relevant information for our task and that preserving them will be beneficial.

In [61]:
# Start from NLTK's English stopword list, but take these pronouns out of it
# so that they survive the stopword filtering.
kept_pronouns = {'she', 'she\'s', 'herself'}
stopwords_set = set(stopwords.words('english')).difference(kept_pronouns)
stopwords_set

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 '

After stopword filtering, the most frequently occurring words include female-related nouns/pronouns, the `URL` and `USER` placeholder tokens, as well as some profanities.

In [63]:
# Lowercase first, so the comparison against the (lowercase) stopword set also
# removes capitalised stopwords, then keep everything that is not a stopword.
lowercased_words = (word.lower() for word in words)
words = [word for word in lowercased_words if word not in stopwords_set]
word_frequencies = Counter(words)
print("Total number of words found after tokenization, punctuation removal and stopword filtering:", len(words))
print("Top 20 most common words:", word_frequencies.most_common(20))

Total number of words found after tokenization, punctuation removal and stopword filtering: 253781
Top 20 most common words: [('she', 6047), ('women', 4427), ("n't", 3982), ('url', 2480), ('like', 2424), ('get', 1713), ('woman', 1698), ('would', 1546), ('men', 1503), ('user', 1362), ('one', 1151), ('girls', 1126), ('girl', 1090), ('fuck', 1068), ('want', 995), ('think', 947), ('shit', 940), ('female', 935), ('people', 934), ('know', 924)]


##### Lemmatization

### Exporting in the CoNLL format
