# Initial configuration

In [1]:
import os

base_path = "../assets/data/nips"

raw_file = os.path.join(os.path.join(base_path, "papers.csv.zip"))
transaction_file = os.path.abspath(os.path.join(base_path, "transaction/papers.csv"))

## Text Cleaner Functions

In [2]:
from unidecode import unidecode
import contractions
import inflect
import string
import re

def to_lower(sentence: str):
    return sentence.lower()

def number_to_text(sentence: str):
    p = inflect.engine()
    numbers = re.finditer(r'\d+', sentence)
    for x in reversed([x for x in numbers]):
        number = sentence[x.start():x.end()]

        if number.isdigit():
            word = p.number_to_words(number)
            sentence = sentence[:x.start()] + word + sentence[x.end():]
    return sentence

def remove_numbers(sentence: str):
    return re.sub(r'\d+', '', sentence)

def remove_punctuation(sentence: str, keep_final:bool=False):
    PUNCTUATION = string.punctuation
    if keep_final:
        PUNCTUATION = PUNCTUATION.replace(".", "")
            
    translator = str.maketrans(PUNCTUATION, " "*len(PUNCTUATION))
    return sentence.translate(translator)

def remove_whitespaces(sentence: str):
    return " ".join(sentence.split())

def expand_contractions(sentence: str):
    return contractions.fix(sentence)

def special_characters_to_ascii(sentence: str):
    return unidecode(sentence)

def drop_links(sentence: str):
    return re.sub(r"http\S+", "", sentence)

In [4]:
def standardize(sentence:str, convert_numbers:bool=True, keep_final:bool=False):
    # Drop urls
    sentence = drop_links(sentence)

    # Treat number converting to text or/and removing
    if convert_numbers:
        sentence = number_to_text(sentence)
    sentence = remove_numbers(sentence)  # if convert mode drop the eventual remain
    
    # Expand contractions
    sentence = expand_contractions(sentence)

    # Remove punctuation
    sentence = remove_punctuation(sentence, keep_final)

    # Convert special characters
    sentence = special_characters_to_ascii(sentence)

    # Normalizing case
    sentence = to_lower(sentence)

    # Remove extra whitespaces
    sentence = remove_whitespaces(sentence)

    return sentence

## Remove Stopwords and Lemmatize

In [5]:
import spacy
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Stop words base list
stop_words = stopwords.words('english')

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# nlp = spacy.load("en", disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

def tokenize(sentence: str):
#     return gensim.utils.simple_preprocess(str(sentence), deacc=True)
    return sentence.split(" ")

def remove_stopwords(tokens, stop_words=stop_words):
    return [word for word in tokens if word not in stop_words and len(word) > 3]

def lemmatization(tokens, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    doc = nlp(" ".join(tokens))
#     return [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return [token.lemma_ for token in doc if token.lemma_ != "-PRON-"]

## Other auxiliars functions

In [None]:
# def save_dataframe(df: pd.DataFrame, filename):   
#     if os.path.exists(filename):
#         df.to_csv(filename, index=False, mode='a', header=False)
#     else:
#         df.to_csv(filename, index=False)

# Transactional data

In [22]:
import pandas as pd
from tqdm import tqdm

CHUNKSIZE = 500

# Import dataframe into chunks
chunks = pd.read_csv(
    raw_file, 
    compression="zip",
    usecols=["id", "year", "title", "abstract", "paper_text"],
    chunksize=CHUNKSIZE)

for df in tqdm(chunks):
    
    # Clean missing abstract
    df.loc[df.abstract == "Abstract Missing", "abstract"] = ""

    # Fill na with empty
    df["title"] = df["title"].fillna("")
    df["abstract"] = df["abstract"].fillna("")  
    df["paper_text"] = df["paper_text"].fillna("")  

    # Generating transactional data
    df["abstract"] = df["abstract"].str.split(".")
    df["paper_text"] = df["paper_text"].str.split(".")

    # Add title as a single transaction
    df['text'] = df.apply(lambda row: [row['title']] + row['abstract'] + row["paper_text"], axis=1)
    df = df.drop(labels=['title', 'abstract', 'paper_text'], axis=1)

    # Expand text columns by lis elements
    df = df.explode("text").reset_index(drop=True)

    # Clean text columns
    df['text'] = df['text'].apply(standardize, convert_numbers=False)

    # Tokenize text
    df["tokens"] = df['text'].apply(tokenize)

    # Lemmatize
    df["tokens"] = df["tokens"].apply(lemmatization)

    # Removing stop words
    df["tokens"] = df["tokens"].apply(remove_stopwords)

    # Drop empty transactions
    df = df[df["tokens"].apply(len) > 0]

    # Put in text again
    df["text"] = df["tokens"].apply(lambda x: " ".join(x))
    df = df.drop(labels=['tokens'], axis=1)

    # Save transactional data
    save_dataframe(df, transaction_file)

0it [00:00, ?it/s]


NameError: name 'save_dataframe' is not defined

In [None]:
!mpg123 /home/heladio/Downloads/piseiro.mp3