In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertConfig, AutoConfig
import numpy as np
import evaluate
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
# Read the development (train) and test splits, discarding the columns that
# are dropped right after loading ('Id' and 'topic' for dev, 'Id' for test).
df_dev = pd.read_csv('dataset/subtaskB_train.csv').drop(['Id', 'topic'], axis='columns')
df_test = pd.read_csv('dataset/subtaskB_test.csv').drop('Id', axis='columns')

In [23]:
# (rows, columns) of the development frame.
df_dev.shape

(810, 2)

In [24]:
# (rows, columns) of the test frame.
df_test.shape

(300, 1)

In [15]:
import re

# Characters kept when ``r_emoticons`` is true. Every entry is listed explicitly
# (the hyphen is escaped so it cannot form a range) and non-ASCII characters are
# written as \u escapes so the pattern does not depend on the file encoding.
_ALLOWED_CHARS = (
    r"a-zA-Z0-9 "
    r"\\/.'\",:;!?()\-"
    r"\u00e0\u00e8\u00e9\u00ec\u00f2\u00f3\u00f9"  # lowercase a/e/i/o/u grave, e/o acute
    r"\u00c0\u00c8\u00c9\u00cc\u00d2\u00d3\u00d9"  # uppercase counterparts
    r"\u20ac$&\u00a3%="  # euro, dollar, ampersand, pound, percent, equals
)
_DISALLOWED_RE = re.compile("[^" + _ALLOWED_CHARS + "]")
_BRACKETED_RE = re.compile(r"\[.*?\]")
_BREAKS_RE = re.compile(r"[\n\r\t_]")
_DATE_RE = re.compile(r"\d\d/\d\d/\d\d")


def text_cleaner(text, lowercase=False, r_emoticons=True):
    """Normalise one raw comment string.

    Steps, applied in this order:
      1. delete ``[...]`` spans (shortest match, within a single line);
      2. turn newlines, carriage returns, tabs and underscores into spaces;
      3. delete asterisks;
      4. delete ``dd/dd/dd`` date stamps;
      5. if ``r_emoticons`` is true, delete every character outside the
         explicit whitelist in ``_ALLOWED_CHARS`` (this removes emoji);
      6. if ``lowercase`` is true, lowercase the result;
      7. strip leading and trailing whitespace.

    Args:
        text: the string to clean.
        lowercase: lowercase the cleaned text when true.
        r_emoticons: apply the character whitelist when true.

    Returns:
        The cleaned string; it may be empty.

    Raises:
        TypeError: if ``text`` is not a string.
    """
    clean_text = _BRACKETED_RE.sub('', text)
    # Tabs are mapped to a space as well: otherwise the whitelist below would
    # delete them and glue the neighbouring words together.
    clean_text = _BREAKS_RE.sub(' ', clean_text)
    clean_text = clean_text.replace('*', '')
    clean_text = _DATE_RE.sub('', clean_text)

    if r_emoticons:
        # The previous pattern left '-' unescaped inside the class, which
        # created a range from ')' up to an accented letter and silently kept
        # '+', '<', '>', '@', '~' and most Latin-1 symbols.
        clean_text = _DISALLOWED_RE.sub('', clean_text)

    if lowercase:
        clean_text = clean_text.lower()

    return clean_text.strip()

In [16]:
# Work on copies so the frames loaded from disk are left unmodified.
df_dev_cased, df_test_cased = df_dev.copy(), df_test.copy()

In [17]:
# Run text_cleaner with its default arguments over the comment column of
# both splits, overwriting the column in the working copies.
df_dev_cased['comment_text'] = df_dev_cased['comment_text'].map(text_cleaner)
df_test_cased['comment_text'] = df_test_cased['comment_text'].map(text_cleaner)

In [18]:
# Eyeball the cleaned comment column of the development split.
df_dev_cased.comment_text

0      Siamo davanti ad una prova, e non solo di quoz...
1      La storia dei 2 bimbi di Bergamo - vaccini: qu...
2      L'avete capito che non toglieranno il green pa...
3      Quindi  la farsa dello spazio e della terra a ...
4      In breve tempo, per accedere a Internet, sarà ...
                             ...                        
805    Incredibile!!!! EMA, Agenzia Europea del Farma...
806    Non ci saranno colloqui di pace con la Russia ...
807    L'atmosfera è uno "scudo protettivo" che avvol...
808    OTTIMA NOTIZIA! Due ragioni per cui le élite n...
809                 Ma è perfet allineata o meglio piatta
Name: comment_text, Length: 810, dtype: object

Il cleaning ha generato stringhe vuote?

In [20]:
# (row, column) positions of cells that are exactly the empty string.
# DataFrame.applymap is deprecated since pandas 2.1; the vectorised
# comparison produces the same boolean mask on every pandas version.
np.where(df_dev_cased.eq(''))

(array([], dtype=int64), array([], dtype=int64))

In [21]:
# (row, column) positions of cells that are exactly the empty string.
# DataFrame.applymap is deprecated since pandas 2.1; the vectorised
# comparison produces the same boolean mask on every pandas version.
np.where(df_test_cased.eq(''))

(array([], dtype=int64), array([], dtype=int64))

In [27]:
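#df_dev.loc[1497]  # NOTE: stale cell - label 1497 is outside the 0-809 index of the frame loaded above; the output below is presumably left over from an earlier run on a different dataset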

comment_text      🇮🇹 
conspiratorial      0
Name: 1497, dtype: object

In [28]:
#df_dev_cased.loc[1497]

comment_text       
conspiratorial    0
Name: 1497, dtype: object

In [30]:
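#df_dev_cased.drop(index=[1388, 1497], inplace=True)  # NOTE: disabled - labels 1388 and 1497 are outside the 0-809 index of the current frame, so this drop would raise KeyError if re-enabled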

In [22]:
# Sanity check: raw and cleaned development frames should have the same shape.
df_dev.shape, df_dev_cased.shape # all good

((810, 2), (810, 2))

In [25]:
# Write the cleaned, case-preserving splits to disk; index=False keeps the
# pandas row index out of the CSV files.
df_dev_cased.to_csv('dataset/subtaskB_clean_dev_cased.csv', index=False)
df_test_cased.to_csv('dataset/subtaskB_clean_test_cased.csv', index=False)