In [None]:
!pip freeze

In [None]:
from googletrans import Translator
from tqdm.auto import tqdm
import pickle
import pandas as pd

In [None]:
# Load the raw comments CSV (comma-separated, Latin-1 encoded) and preview it
dataset = pd.read_csv(
    'CleanTranslateDataset/final_dataset.csv',
    sep=',',
    encoding='ISO-8859-1',
)
dataset

In [None]:
# Single googletrans client, created once and reused by detect_translate for every comment
translator = Translator()

"""
This function autodetects the language of a comment and translates it to english
"""
def detect_translate(comment):
    """Translate one comment to English, letting the translator detect the source language.

    Parameters
    ----------
    comment
        Text to translate (presumably a str; it is passed to the translator as-is).

    Returns
    -------
    str
        The translated text, or the sentinel string "ERROR" if the
        translation call raised an exception.
    """
    try:
        return translator.translate(comment, dest='en').text
    except Exception:
        # Best-effort: a failed comment is tagged with a sentinel so the caller can
        # filter it out later. Only Exception is caught, so KeyboardInterrupt and
        # SystemExit still propagate and a long run can be stopped.
        return "ERROR"

"""
This function iterates over all the rows of a dataset and translate each comment from its language to english
"""
def translate_dataset(df):
    """Return a copy of ``df`` whose 'comment' column has been translated to English.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'comment' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every 'comment' value has been replaced by
        the result of ``detect_translate``.
    """
    df = df.copy()

    # Build the translated column as a list and assign it back in one step.
    # Writing into the row yielded by iterrows() is not reliable: the row can be
    # a copy, in which case the translation never reaches the frame.
    df['comment'] = [
        detect_translate(comment)
        for comment in tqdm(df['comment'], total=len(df))
    ]

    return df

In [None]:
# Translate every comment to English. detect_translate is called once per row,
# so this cell can take a while on a large dataset.
translated_dataset = translate_dataset(dataset)
#translated_dataset.to_csv("final_dataset_translated.csv", index=False)

In [None]:
# Inspect the translated dataset
translated_dataset

In [None]:
# Save the complete translated dataset to a pickle file so the data can be accessed later.
# The code is currently disabled: it is wrapped in a string literal, so running this
# cell writes nothing to disk.
"""with open('final_dataset_translated.pickle', 'wb') as handle:
    pickle.dump(translated_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)"""

In [None]:
"""# Reading it once it's already stored
with open('final_dataset_translated.pickle', 'rb') as handle:
    translated_dataset = pickle.load(handle)
    
print(all(translated_dataset == read_translated_dataset))
translated_dataset"""

Shuffling the data

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# A fixed random_state makes the shuffled order reproducible across kernel restarts.
translated_dataset = translated_dataset.sample(frac=1, random_state=42)
translated_dataset

Remove all the rows in which any column is exactly the string "ERROR" (the placeholder stored when a translation fails)

In [None]:
# Flag rows where any cell equals the "ERROR" sentinel, then keep only the unflagged rows
has_error = translated_dataset.eq('ERROR').any(axis=1)
translated_dataset = translated_dataset.loc[~has_error]

In [None]:
# Inspect the dataset after dropping the rows tagged "ERROR"
translated_dataset

In [None]:
#translated_dataset.to_csv("final_dataset_translated_shuffle.csv", index=False)

At this point in the preprocessing, I used an external tool (Google Sheets), relying on its filtering functions and formulas, to finish cleaning the dataset. I also labeled 1000 comments: 1 means "non-offensive comment" and 0 means "offensive comment".

In [None]:
# Load the dataset again after part of it has been labeled by hand
labeled_dataset = pd.read_csv(
    'CleanTranslateDataset/final_dataset_labeled.csv',
    sep=',',
    encoding='ISO-8859-1',
)

In [None]:
# Coerce the sentiment column to numbers; values that cannot be parsed become NaN
numeric_col = pd.to_numeric(labeled_dataset['sentiment'], errors='coerce')

# Find the last row whose sentiment parsed as a number. The row-wise notna() mask is
# the whole filter: a column's dtype is a single value for the entire column, so it
# cannot select rows (and it would empty the result if the column parsed as integers).
last_float_row = labeled_dataset[numeric_col.notna()].tail(1)

# Print the last row that contains a numeric value in the 'sentiment' column
print(f"The last row that contains a float in the sentiment column is:\n{last_float_row}")

In [None]:
# Inspect the labeled dataset as loaded
labeled_dataset

In [None]:
# Create the v1 of the full cleaned dataset that we will then use for training the model.
# Only the first 1420 rows are kept (presumably the rows labeled so far; confirm against
# the last-numeric-row check above).
# .copy() makes an independent frame, so later column assignments do not write into a
# slice of labeled_dataset and do not trigger SettingWithCopyWarning.
cleaned_dataset = labeled_dataset.iloc[:1420].copy()

In [None]:
# Look at the last 13 rows of the slice to check where the labeled data ends
cleaned_dataset.tail(13)

In [None]:
# Cast the labels to integers. .assign builds a new frame instead of writing into a
# column of what may be a slice of labeled_dataset (avoids SettingWithCopyWarning).
# astype(int) raises if any of the kept rows still holds a missing or non-numeric label.
cleaned_dataset = cleaned_dataset.assign(
    sentiment=cleaned_dataset['sentiment'].astype(int)
)
cleaned_dataset.tail(13)

Here I realised that there is an error: some values are different from 0 or 1. Let's check how many there are.

In [None]:
# Some labels turned out to be neither 0 nor 1; list the offending rows
cleaned_dataset_check = cleaned_dataset[~cleaned_dataset['sentiment'].isin([0, 1])]
cleaned_dataset_check

In [None]:
# After reading the comment, relabel the only value other than 0 or 1 (a 2) as 1 (non-offensive).
# replace + assign produce a new frame rather than writing in place into what may be
# a slice of labeled_dataset.
cleaned_dataset = cleaned_dataset.assign(
    sentiment=cleaned_dataset['sentiment'].replace(2, 1)
)
cleaned_dataset.tail(13)

In [None]:
# Re-run the check: any rows listed here still carry a label other than 0 or 1
is_zero = cleaned_dataset['sentiment'] == 0
is_one = cleaned_dataset['sentiment'] == 1
cleaned_dataset_check = cleaned_dataset[~(is_zero | is_one)]
cleaned_dataset_check

In [None]:
# NOTE(review): this displays labeled_dataset, while the fixes above were applied to
# cleaned_dataset. Confirm which frame was meant to be inspected here.
labeled_dataset

Finally, we can save the dataset in order to use it to train our model

In [None]:
# Write the cleaned, labeled rows to CSV; index=False leaves the DataFrame index out of the file
cleaned_dataset.to_csv("cleaned_dataset.csv", index=False)