# Cleaning of Text Data
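El siguiente notebook presenta el proceso de limpieza de los datos de texto: se eliminan las filas con valores nulos, las oraciones demasiado largas y las oraciones que no están en inglés.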

In [4]:
# CSV file that is loaded with pd.read_csv as the input of the cleaning step.
DATA_SRC_PATH = '../data/text/cleaned/reduced_final.csv'
# CSV file the cleaned dataset is written to by the export step.
DATA_DST_PATH = '../data/text/cleaned/reduced_final_eng_clean.csv'

In [5]:
import pandas as pd
import langid

In [6]:
def remove_long_sentences(df: pd.DataFrame, max_len=300) -> pd.DataFrame:
    """
    Removes long strings from dataframe.

    The input dataframe is not modified: no helper column is added to it, and
    the returned dataframe has exactly the same columns as the input.

    :param df: The dataframe that has a column named 'text' which contains the strings to filter
    :param max_len: The length of the biggest string allowed (inclusive)
    :return: A new dataframe holding only the rows whose 'text' length is at most max_len
    """
    print('-'*20, '\nRemoving long sentences')
    # Filter with a standalone boolean mask instead of a temporary 'len' column:
    # the previous version called df.drop(...) without using its result, so the
    # helper column leaked into the output (and into the caller's dataframe).
    is_short_enough = df['text'].map(len) <= max_len
    return df[is_short_enough]

In [7]:
def get_lang(sentence: str):
    """
    Classifies a sentence with langid and returns the predicted language.

    :param sentence: The text to classify
    :return: The first element of the result of langid.classify (the language label)
    """
    prediction = langid.classify(sentence)
    return prediction[0]

def remove_non_lang_sentences(df: pd.DataFrame, lang='en') -> pd.DataFrame:
    """
    Keeps only the rows whose text is classified as the requested language.

    The input dataframe is not modified: no helper column is added to it, and
    the returned dataframe has exactly the same columns as the input.

    :param df: The dataframe that has a column named 'text' which contains the strings to classify
    :param lang: The language label (as returned by get_lang) of the rows to keep
    :return: A new dataframe holding only the rows whose 'text' is classified as lang
    """
    # The message is fixed and always says "eng", whatever value lang has.
    print('-'*20, '\nRemoving non eng sentences')
    # Filter with a standalone boolean mask instead of a temporary 'lang' column:
    # the previous version called df.drop(...) without using its result, so the
    # helper column leaked into the output (and into the caller's dataframe).
    is_target_lang = df['text'].map(get_lang) == lang
    return df[is_target_lang]

In [8]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Runs the full cleaning pipeline on a dataframe of text samples.

    Steps, in order: drop the rows that contain missing values, remove the
    long sentences (remove_long_sentences with its default settings) and
    remove the sentences that are not in the target language
    (remove_non_lang_sentences with its default settings).

    :param df: The dataframe that has a column named 'text' which contains the strings to clean
    :return: The cleaned dataframe
    """
    print('-'*20, '\nDelete nans')
    # Rebind instead of dropna(inplace=True) so the caller's dataframe is not
    # silently modified as a side effect of calling this function.
    df = df.dropna()
    df = remove_long_sentences(df)
    df = remove_non_lang_sentences(df)
    return df

In [9]:
# Load the source CSV, then rebind `data` to the result of the cleaning pipeline.
# NOTE(review): if clean_data is interrupted or raises, `data` is not rebound and
# still refers to the frame loaded by read_csv, so exporting it afterwards would
# not write the fully cleaned dataset.
data = pd.read_csv(DATA_SRC_PATH)
data = clean_data(data)

-------------------- 
Delete nans
-------------------- 
Removing long sentences
-------------------- 
Removing non eng sentences


KeyboardInterrupt: 

## Export Dataset

In [None]:
from pathlib import Path

# Create the destination directory if it does not exist yet, then write `data`
# as CSV without the index column.
filepath = Path(DATA_DST_PATH)
filepath.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(filepath, index=False)