# Definición de clase de reducción de Abstracts innecesarios con expresiones regulares sobre palabras clave deseadas

In [11]:
import re
import pandas as pd

class Analyzer:
    def __init__(self, df, keywords_regex):
        self.df = df
        self.keywords_regex = keywords_regex
    
    def clean_dataframe(self):
        # Conservar solo la columna 'Abstract'
        self.df = self.df[['Abstract']]
        # Convertir a minúsculas
        self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
    
    def find_keywords(self):
        # Encontrar las coincidencias para cada palabra clave
        keyword_matches = {}
        for keyword, regex in self.keywords_regex.items():
            if regex is not None:  # Si no es None, buscar coincidencias
                keyword_matches[keyword] = self.df['Abstract'].apply(lambda x: bool(re.search(regex, x)))
            else:
                keyword_matches[keyword] = True  # Si es None, lo tratamos como True (todas las filas pasan)

        # Inicialmente, todas las filas son válidas
        valid_rows = self.df
        # Filtrar las filas que contienen todas las palabras clave
        for match in keyword_matches.values():
            if isinstance(match, pd.Series):  # Solo filtrar si hay un patrón definido
                valid_rows = valid_rows[match]
        
        return valid_rows


## Temas y palabras clave con el que se usará la clase 

In [12]:
# Definir los archivos CSV y las palabras clave correspondientes
topics = [
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\BehavioralFinance.csv',
        'keywords': {
            'first_word': r'behavioral',
            'second_word': r'finance'
        },
        'output': 'BehavioralFinance_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\FinancialDerivatives.csv',
        'keywords': {
            'first_word': r'financial',
            'second_word': r'derivative( |s)?'
        },
        'output': 'FinancialDerivatives_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\AssetPricingModels.csv',
        'keywords': {
            'first_word': r'asset',
            'second_word': r'pricing',
            'third_word': r'model( |s)?'
        },
        'output': 'AssetPricingModels_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\CorporateFinance.csv',
        'keywords': {
            'first_word': r'corporate',
            'second_word': r'finance'
        },
        'output': 'CorporateFinance_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\SustainableFinance.csv',
        'keywords': {
            'first_word': r'sustainable',
            'second_word': r'finance'
        },
        'output': 'SustainableFinance_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\PortfolioOptimization.csv',
        'keywords': {
            'first_word': r'portfolio',
            'second_word': r'optimization'
        },
        'output': 'PortfolioOptimization_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\FinancialEngineering.csv',
        'keywords': {
            'first_word': r'financial',
            'second_word': r'engineering'
        },
        'output': 'FinancialEngineering_touse.csv'
    },
    {
        'filename': 'D:\\CodeProjects\\QuantFinanceResearch\\CSVs\\Financial_Risk_combined.csv',
        'keywords': {
            'first_word': r'financial',
            'second_word': r'risk'
        },
        'output': 'FinancialRisk_touse.csv'
    }
]



## Implementación de la clase sobre los temas y palabras clave

In [13]:
# Procesar cada tema
for topic in topics:
    # Leer el CSV
    df = pd.read_csv(topic['filename'])
    
    # Crear una instancia de Analyzer y limpiar el DataFrame
    analyzer = Analyzer(df, topic['keywords'])
    analyzer.clean_dataframe()
    
    # Obtener las filas filtradas
    df_definitivo = analyzer.find_keywords()
    
    # Guardar el DataFrame filtrado en el archivo CSV correspondiente
    df_definitivo.to_csv(topic['output'], index=False)
    
    # Imprimir información de resultados
    print(f'Procesado {topic["filename"]} -> {topic["output"]}')
    print(df_definitivo.head(3))
    print(f'Tiene {df_definitivo.shape[0]} filas y {df_definitivo.shape[1]} columnas')
    print('---------------------------------------')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\BehavioralFinance.csv -> BehavioralFinance_touse.csv
                                            Abstract
0  we address the stock predictability puzzle, a ...
1  this study examines the effects of religiosity...
2  when the environmental performance is below th...
Tiene 2114 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\FinancialDerivatives.csv -> FinancialDerivatives_touse.csv
                                             Abstract
2   blockchain technology is currently revolutioni...
7   we introduce a novel, time-efficient adaptive ...
10  as a type of financial derivative, the price f...
Tiene 4373 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\AssetPricingModels.csv -> AssetPricingModels_touse.csv
                                            Abstract
0  a low frequency factor model regression uses c...
1  we develop a structural credit risk model, whi...
2  drawing upon accounting-based asset pricing mo...
Tiene 7136 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\CorporateFinance.csv -> CorporateFinance_touse.csv
                                            Abstract
0  in 2017, china introduced the pilot zones for ...
4  the integration of artificial intelligence (ai...
5  in the digital economy era, the link between d...
Tiene 6329 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\SustainableFinance.csv -> SustainableFinance_touse.csv
                                            Abstract
0  governments in asian economies are under immen...
5  the aggravation of the global warming crisis, ...
7  developing university governance capacity for ...
Tiene 6794 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\PortfolioOptimization.csv -> PortfolioOptimization_touse.csv
                                            Abstract
0  pension funds are crucial in supporting enviro...
1  grouping stocks within an index based on their...
2  devising an efficient exploration of the searc...
Tiene 8493 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\FinancialEngineering.csv -> FinancialEngineering_touse.csv
                                            Abstract
1  exchange rate forecasting has a significant im...
3  carbon capture utilisation and storage (ccus) ...
4  cement, aggregate, and, in some situations, ad...
Tiene 9217 filas y 1 columnas
---------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Abstract'] = self.df['Abstract'].apply(lambda x: x.lower())
  valid_rows = valid_rows[match]


Procesado D:\CodeProjects\QuantFinanceResearch\CSVs\Financial_Risk_combined.csv -> FinancialRisk_touse.csv
                                            Abstract
2  managing a profitable commercial agricultural ...
3  the growth of the carbon market has been fast;...
5  a frequent refrain during recent debates on we...
Tiene 92762 filas y 1 columnas
---------------------------------------
