In [1]:
import os
import json
import pandas as pd 
import numpy as np

class splittr:
    """
    Construcción de Base de Datos específica y separada
    """
    
    template_list = ['Ancient Aliens', 'Annoying Gamer Kid', 'Ay Si',
                     'Bad Luck Brian', 'Disaster Girl', 'Filosoraptor',
                     'Futurama Fry', 'Matias Prats', 'No me digas',
                     'Prepare Yourself', 'Problems', 'Willy Wonka', 'Yao Wonka']
    
    def __init__(self, folder, json_path, rules_json):
        self.folder = folder
        #self.template_list = template_list
        self.codification = self.load_json(json_path)
        self.rules = self.load_json(rules_json)
        self.text_field = 'texto_no_tilde'
        self.exclude = ['Personalizado']
        self.columns_return = ['meme_id', 'plantilla', 'texto_no_tilde', 'text_1', 'text_2']
        
        
    def load_json(self, json_path):
        """
        Permite la carga de un JSON que contenga los IDs de las plantillas.
        """
        with open(json_path) as f:
            json_file = json.load(f)
        return json_file
    
    def load_and_transform(self):
        file_names = os.listdir(self.folder)
        file_paths = [os.path.join(self.folder, file) for file in file_names if ((file.find('useful') == -1) and (file.find('manual') == -1)) ]

        for file_name, file_path in zip(file_names, file_paths):
            print(f'Initialized {file_name}')
            df = pd.read_csv(file_path)
            df['plantilla'] = df['plantilla'].str.strip() # Erase leading spaces
            df = df[~df.plantilla.isin(self.exclude)] # Filters out that category
            df['meme_id'] = df.plantilla.map(self.codification) # Applies category for model
            df['combined_text'] = df.apply(self.row_applier, axis = 1)
            df[['text_1', 'text_2']] = df.combined_text.str.split(',', expand = True, n = 1) # Splits Text

            return_df = df[self.columns_return]
            # Save useful observations
            save_name_use = 'useful_' + file_name
            save_path_use = os.path.join(self.folder, save_name_use)
            free_of_na = return_df[(~return_df.text_1.str.contains('None')) & (~return_df.text_2.str.contains('None'))].to_csv(save_path_use)

            # Save Manual Observations
            save_name_manual = 'manual_' + file_name
            save_path_manual = os.path.join(self.folder, save_name_manual)
            return_df[(~return_df.meme_id.isnull()) & (return_df.text_1.str.contains('None'))].to_csv(save_path_manual)
            print(f'Successful {file_name}')


    def single_caracter_separator(self, text, sep, append_top = False):
        """
        Implementa una función que separa en base a un sólo caracter
        
        Params
        --------
            text (str): 
                Una cadena de texto a transformar
            
            sep (str):
                Una cadena de texto que separa dos frases
            
            append_top (bool):
                Default = False; 
                Indica si el separador se debe incluir en la primera frase.
                
        Returns
        --------
            text_1 (str):
                Cadena de texto de frase superior
            
            text_2 (str):
                Cadena de texto de frase inferior
        """
        
        text_1, text_2 = text.split(sep)
        if append_top:
            text_1 += sep
        else:
            text_2 += sep
            
        return text_1, text_2
            
        
    
    def down_split(self, text, separator_list):
        """
        Permite realizar la separación de forma "down split". 

        Params
        --------
            text (str):
                Cadena de texto a separar
        
            separator_list (list):
                Una lista que contiene todos los posibles separadores
        """
        for sep in separator_list:
            text_list = text.split(sep)

            if len(text_list) == 2:
                text_1, text_2 = text_list
                text_2 = sep + text_2
                return text_1, text_2

        return None, None

    def top_split(self, text, separator_list):
        """
        Permite realizar la separación de forma "top split". 

        Params
        --------
            text (str):
                Cadena de texto a separar
        
            separator_list (list):
                Una lista que contiene todos los posibles separadores
        """
        for sep in separator_list:
            text_list = text.split(sep)

            if len(text_list) == 2:
                text_1, text_2 = text_list
                text_1 = text_1 + sep
                return text_1, text_2

        return None, None

    def row_applier(self, row):
        """
        Permite aplicar una función de separación basado en reglas.
        """
        text = str(row.texto_no_tilde).replace(',', '')
        template =  ' '.join(row.plantilla.split())
        rule = self.rules.get(template, {None: None})
        fn = list(rule.keys())[0]
        separators = rule.get(fn, None)

        if fn == 'down_split':
            text_1, text_2 = self.down_split(text, separators)
        elif fn == 'top_split':
            text_1, text_2 = self.top_split(text, separators)
        elif fn == 'second_down_split':
            text_1, text_2 = self.second_down_split(text, separators)
        else:
            text_1 = None
            text_2 = None
        return f'{text_1},{text_2}'

    def second_down_split(self, text, separator_list):
        """
        Permite realizar la separación de la segunda "down split". 

        Params
        --------
            text (str):
                Cadena de texto a separar
        
            separator_list (list):
                Una lista que contiene todos los posibles separadores
        """

        for sep in separator_list:
            text_list = text.split(sep)

            if len(text_list) == 3:
                text_1, text_2, text_3 = text_list
                top_row = text_1 + sep + text_2
                bot_row = sep + text_3
                
                return top_row, bot_row

        return None, None

In [2]:
split = splittr(folder = '../data/bases_no_groserias', json_path = './template_id.json', rules_json = './rules.json')

In [3]:
split.load_and_transform()

Initialized scrapymemes33m_no_tilde.csv
Successful scrapymemes33m_no_tilde.csv
Initialized scrapymemes7m_no_tilde.csv
Successful scrapymemes7m_no_tilde.csv
Initialized scrapymemes16m_no_tilde.csv
Successful scrapymemes16m_no_tilde.csv
Initialized scrapymemes32m_no_tilde.csv
Successful scrapymemes32m_no_tilde.csv
Initialized scrapymemes20m_no_tilde.csv
Successful scrapymemes20m_no_tilde.csv
Initialized scrapymemes6m_no_tilde.csv
Successful scrapymemes6m_no_tilde.csv
Initialized scrapymemes1m_no_tilde.csv
Successful scrapymemes1m_no_tilde.csv
Initialized scrapymemes24m_no_tilde.csv
Successful scrapymemes24m_no_tilde.csv
Initialized scrapymemes10m_no_tilde.csv
Successful scrapymemes10m_no_tilde.csv
Initialized scrapymemes9m_no_tilde.csv
Successful scrapymemes9m_no_tilde.csv
Initialized scrapymemes5m_no_tilde.csv
Successful scrapymemes5m_no_tilde.csv
Initialized scrapymemes8m_no_tilde.csv
Successful scrapymemes8m_no_tilde.csv
Initialized scrapymemes28m_no_tilde.csv
Successful scrapymemes28