# Initial configuration

In [1]:
import os

# Root folder that holds every processing stage of the news data set
base_path = "../assets/data/all-news"

# Absolute location of each per-stage sub-folder
splitted_path, lemmatized_path, transaction_path = (
    os.path.abspath(os.path.join(base_path, stage))
    for stage in ("splitted", "lemmatized", "transaction")
)

In [2]:
# Names of the splitted files, in ascending lexicographic order
splitted_files = sorted(os.listdir(splitted_path))

## Text Cleaner Functions

In [3]:
from unidecode import unidecode
import contractions
import inflect
import string
import re

def to_lower(sentence: str):
    """Return ``sentence`` converted to lower case."""
    lowered = sentence.lower()
    return lowered

def number_to_text(sentence: str):
    """Spell out every run of digits in ``sentence`` as English words.

    Each run of digits is converted with ``inflect``'s ``number_to_words``.
    Two adjustments keep the spelled-out number usable as separate tokens:

    * hyphens produced by ``inflect`` ("twenty-one") are replaced by spaces,
      because punctuation stripping would otherwise delete the hyphen and
      fuse the parts into a single pseudo-word ("twentyone");
    * a space is inserted between the words and any letter directly next to
      the digits, so "5g" becomes "five g" instead of "fiveg".

    Args:
        sentence: Text that may contain digits.

    Returns:
        The text with every digit run replaced by its English wording.
    """
    p = inflect.engine()

    def spell_out(match):
        # Hyphenated compounds are split so later punctuation removal cannot glue them
        words = p.number_to_words(match.group()).replace("-", " ")

        # Pad only where the digits touch a letter; elsewhere spacing is left untouched
        start, end = match.span()
        lead = " " if start > 0 and sentence[start - 1].isalpha() else ""
        trail = " " if end < len(sentence) and sentence[end].isalpha() else ""
        return lead + words + trail

    # Every match of \d+ consists solely of digits, so no extra isdigit() guard is needed
    return re.sub(r'\d+', spell_out, sentence)

def remove_numbers(sentence: str):
    """Return ``sentence`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', sentence)

def remove_punctuation(sentence: str, keep_final:bool=False):
    """Delete the characters of ``string.punctuation`` from ``sentence``.

    Args:
        sentence: Text to clean.
        keep_final: When true, the period (".") is left in place and only the
            remaining punctuation characters are deleted.

    Returns:
        The text without the targeted punctuation characters.
    """
    unwanted = string.punctuation.replace(".", "") if keep_final else string.punctuation

    # Mapping an ordinal to None makes str.translate drop that character
    deletion_table = {ord(mark): None for mark in unwanted}
    return sentence.translate(deletion_table)

def remove_whitespaces(sentence: str):
    """Collapse every whitespace run to one space and trim both ends."""
    pieces = sentence.split()
    return " ".join(pieces)

def expand_contractions(sentence: str):
    """Return ``sentence`` after processing it with ``contractions.fix``."""
    expanded = contractions.fix(sentence)
    return expanded

def special_characters_to_ascii(sentence: str):
    """Return the ``unidecode`` transliteration of ``sentence``."""
    ascii_text = unidecode(sentence)
    return ascii_text

def drop_links(sentence: str):
    """Delete every span that starts with "http" and runs up to the next whitespace."""
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", sentence)

In [4]:
def standardize(sentence:str, convert_numbers:bool=True, keep_final:bool=False):
    """Run the full text-cleaning pipeline over one sentence.

    Steps, in order: drop links, transliterate to ASCII, convert and/or remove
    numbers, expand contractions, remove punctuation, lower-case, and collapse
    whitespace.

    The ASCII transliteration runs before the punctuation step on purpose:
    ``remove_punctuation`` only knows the ASCII characters of
    ``string.punctuation``, so typographic quotes, dashes and ellipses must be
    turned into their ASCII counterparts first or they would survive cleaning.

    Args:
        sentence: Raw text to clean.
        convert_numbers: When true, digit runs are spelled out before any
            remaining digits are removed; when false, digits are only removed.
        keep_final: Forwarded to ``remove_punctuation``; when true, periods
            are kept.

    Returns:
        The cleaned, lower-cased, single-spaced text.
    """
    # Drop urls
    sentence = drop_links(sentence)

    # Convert special characters first so later steps see plain ASCII punctuation
    sentence = special_characters_to_ascii(sentence)

    # Treat number converting to text or/and removing
    if convert_numbers:
        sentence = number_to_text(sentence)
    sentence = remove_numbers(sentence)  # in convert mode, drops whatever digits remain

    # Expand contractions
    sentence = expand_contractions(sentence)

    # Remove punctuation
    sentence = remove_punctuation(sentence, keep_final)

    # Normalizing case
    sentence = to_lower(sentence)

    # Remove extra whitespaces
    sentence = remove_whitespaces(sentence)

    return sentence

## Remove Stopwords and Lemmatize

In [5]:
import spacy
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Stop words base list
stop_words = stopwords.words('english')

# Initialize the spaCy English model with the parser and NER components disabled
# (for efficiency). The model is loaded by its full package name because the "en"
# shortcut is rejected by spaCy >= 3.0.
# NOTE(review): assumes the old "en" shortcut pointed at en_core_web_sm — confirm.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

def tokenize(sentence: str):
    """Split ``sentence`` into whitespace-separated tokens.

    Splitting on any whitespace run (rather than on a single space) means an
    empty sentence yields an empty list and repeated or surrounding spaces
    never produce empty-string tokens.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of non-empty tokens, in order of appearance.
    """
    return sentence.split()

def remove_stopwords(tokens, stop_words=stop_words):
    """Keep only the tokens that are not stop words and are longer than 3 characters.

    Args:
        tokens: Iterable of word strings.
        stop_words: Collection of words to discard; defaults to the
            module-level ``stop_words``.

    Returns:
        A list with the surviving tokens in their original order.
    """
    kept = []
    for word in tokens:
        if word in stop_words or len(word) <= 3:
            continue
        kept.append(word)
    return kept

def lemmatization(tokens, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Return the lemma of every token, skipping the "-PRON-" placeholder lemma.

    The tokens are joined with single spaces and processed by the
    module-level ``nlp`` pipeline. ``allowed_postags`` is accepted but is not
    used for filtering at the moment.

    Annotation reference: https://spacy.io/api/annotation
    """
    parsed = nlp(" ".join(tokens))

    lemmas = []
    for token in parsed:
        lemma = token.lemma_
        if lemma == "-PRON-":
            continue
        lemmas.append(lemma)
    return lemmas

## Other auxiliary functions

In [6]:
import pandas as pd

def generate_id(df: pd.DataFrame, source: str = None):
    """Add an ``id`` column of the form ``"<source stem>/<row index>"`` to ``df``.

    Args:
        df: Frame to label; it is modified in place.
        source: Name of the file the rows come from. Its extension is stripped
            with ``os.path.splitext``, whatever its length. When omitted, the
            notebook-level variable ``file`` is used, which keeps existing
            ``generate_id(df)`` calls working.

    Returns:
        The same ``df`` object, now carrying the ``id`` column.

    Raises:
        NameError: If ``source`` is omitted and no global ``file`` exists.
    """
    # Legacy call style: fall back to the loop variable of the processing cell
    name = file if source is None else source
    prefix = os.path.splitext(name)[0]

    df['id'] = ['{}/{}'.format(prefix, idx) for idx in df.index]
    return df

In [7]:
def save_dataframe(df: pd.DataFrame, filename):
    """Write ``df`` to ``filename`` as CSV, appending when the file already has content.

    The header row is written only when the file is missing or empty, so
    repeated calls build one well-formed CSV. The parent directory is created
    when it does not exist yet.

    Args:
        df: Frame to persist (the index is not written).
        filename: Destination CSV path.
    """
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # An existing but empty file still needs its header row
    has_content = os.path.exists(filename) and os.path.getsize(filename) > 0

    df.to_csv(
        filename,
        index=False,
        mode='a' if has_content else 'w',
        header=not has_content,
    )

# Transactional

In [11]:
import pandas as pd
from tqdm import tqdm

# Number of articles read from the input CSV per chunk
CHUNKSIZE = 1_000

# NOTE(review): only the first splitted file is processed; widen the slice for a full run
for file in splitted_files[:1]:
    filename = os.path.join(splitted_path, file)
    transactional_fname = os.path.join(transaction_path, file)
    full_fname = os.path.join(lemmatized_path, file)

    # Skip files whose two outputs were already produced
    if os.path.exists(transactional_fname) and os.path.exists(full_fname):
        print("File already saved:", file)
        continue

    # Chunks are appended to temporary ".part" files that are renamed to their final
    # names only after the whole input was processed. An interrupted run therefore
    # never leaves a truncated output that the check above would treat as complete.
    transactional_tmp = transactional_fname + ".part"
    full_tmp = full_fname + ".part"
    for stale in (transactional_tmp, full_tmp):
        if os.path.exists(stale):
            os.remove(stale)  # leftover of an interrupted run; appending would duplicate rows

    # Read data frame in chunks
    chunks = pd.read_csv(filename, chunksize=CHUNKSIZE)

    print("Handling the file:", file)
    for df in chunks:

        # Fill na with empty
        df["title"] = df["title"].fillna("")
        df["article"] = df["article"].fillna("")

        # Generate id
        df = generate_id(df)

        # Generating transactional data: one list element per "."-separated piece
        df["article"] = df["article"].str.split(".")

        # Add title as a single transaction
        df['text'] = df.apply(lambda row: [row['title']] + row['article'], axis=1)
        df = df.drop(labels=['title', 'article', 'year_month'], axis=1)

        # Expand text column into one row per list element
        df = df.explode("text").reset_index(drop=True)

        # Clean text columns
        df['text'] = df['text'].apply(standardize)

        # Tokenize text
        df["tokens"] = df['text'].apply(tokenize)

        # Lemmatize
        df["tokens"] = df["tokens"].apply(lemmatization)

        # Removing stop words
        df["tokens"] = df["tokens"].apply(remove_stopwords)

        # Drop empty transactions; copy so the assignment below does not write to a slice
        df = df[df["tokens"].apply(len) > 0].copy()

        # Put in text again
        df["text"] = df["tokens"].apply(lambda x: " ".join(x))
        df = df.drop(labels=['tokens'], axis=1)

        # Save transactional data
        save_dataframe(df, transactional_tmp)

        # Rebuild one text per article and save it
        # NOTE(review): groupby drops rows whose "date" is NaN by default — confirm that is intended
        df = df.groupby(["date", "id"], as_index=False).agg({"text": lambda x: " ".join(list(x))})
        save_dataframe(df, full_tmp)

        # Clean df for memory saving
        del df

    # Publish the finished outputs under their final names
    for tmp_name, final_name in ((transactional_tmp, transactional_fname), (full_tmp, full_fname)):
        if os.path.exists(tmp_name):
            os.replace(tmp_name, final_name)

Handling the file: 2016-01.csv


In [13]:
# Play an MP3 with the mpg123 command-line player — presumably an audible
# "processing finished" notification (TODO confirm). The absolute path is
# specific to the author's machine.
!mpg123 /home/heladio/Downloads/piseiro.mp3

High Performance MPEG 1.0/2.0/2.5 Audio Player for Layers 1, 2 and 3
	version 1.25.13; written and copyright by Michael Hipp and others
	free software (LGPL) without any warranty but with best wishes
[?25l 
Directory: /home/heladio/Downloads/

Terminal control enabled, press 'h' for listing of keys and functions.

Playing MPEG stream 1 of 1: piseiro.mp3 ...

MPEG 1.0 L III cbr256 44100 j-s

Title:                                   Artist: @XandAviao                     
Comment:                                 Album:  CD Setembro 2020               
Year:    2020                            Genre:  Forró                         

[0:03] Decoding of piseiro.mp3 finished.
[?25h 