# Procesamiento de texto

### Instalar librerías

In [5]:
!pip install googletrans==4.0.0-rc1
!pip install nltk
!pip install --upgrade spacy torch
!python -m spacy download en_core_web_sm

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     ---------------------------------------- 0.0/55.1 kB ? eta -:--:--
     ------- -------------------------------- 10.2/55.1 kB ? eta -:--:--
     ----------------------------------- -- 51.2/55.1 kB 525.1 kB/s eta 0:00:01
     -------------------------------------- 55.1/55.1 kB 477.2 kB/s eta 0:00:00
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     -- ------------------------------------- 0.1/1.5 MB 4.5 MB/s eta 0:00:01
     -------- ------------------------------- 0.3/1.5 MB 4.2 MB/s eta 0:00:01
     --------------- ------------------------ 

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 165.2 kB/s eta 0:01:18
     --------------------------------------- 0.0/12.8 MB 187.9 kB/s eta 0:01:08
     --------------------------------------- 0.0/12.8 MB 219.4 kB/s eta 0:00:59
     --------------------------------------- 0.1/12.8 MB 357.2 kB/s eta 0:00:36
     --------------------------------------- 0.1/12.8 MB 655.8 kB/s eta 0:00:20
      --------------------------------------- 0.3/12.8 MB 1.0 MB/s eta 0:00:13
     - -------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.4/12.8 MB 1.1 MB/s eta 0:00:12
     - ------------------------------------

### Datos COPA

In [6]:
# Load the COPA articles, keep only the title and its label, and drop incomplete rows

import pandas as pd
copa_crudo = pd.read_csv('listArticulos.csv', encoding='ISO-8859-1', delimiter=';')
copa = copa_crudo[["title", "classification"]].dropna()
copa.head()

Unnamed: 0,title,classification
0,Cochlear Implants Versus Hearing Aids in a Mid...,health systems
1,Cochlear Implants versus hearing aids in a mid...,health systems
2,Economic evaluation of Kangaroo Mother Care: c...,health systems
3,Exact bidirectional algorithm for the least ex...,transportation systems
4,Optimal waterflooding management using an embe...,energy systems


In [12]:
# Pasar todas las observaciones al inglés

from googletrans import Translator

def traducir_texto(texto, idioma, translator=None):
    """Translate `texto` into the language `idioma` and return the translated string.

    Parameters
    ----------
    texto : str
        Text to translate.
    idioma : str
        Destination language code, passed to the translator as `dest` (e.g. "en").
    translator : object, optional
        Any object exposing `translate(texto, dest=...)` whose result has a
        `.text` attribute. When omitted, a single googletrans `Translator` is
        created on first use and reused by later calls, instead of building a
        new client for every row.

    Returns
    -------
    str
        The `.text` attribute of the translation result.
    """
    if translator is None:
        # Lazily create the shared client once and cache it on the function object
        translator = getattr(traducir_texto, "_translator", None)
        if translator is None:
            translator = traducir_texto._translator = Translator()
    return translator.translate(texto, dest=idioma).text

# Translate every title to English (one `traducir_texto` call per title)
copa["title"] = copa["title"].map(lambda titulo: traducir_texto(titulo, "en"))

copa.head()

Unnamed: 0,title,classification
0,Cochlear Implants Versus Hearing Aids in a Mid...,health systems
1,Cochlear Implants versus hearing aids in a mid...,health systems
2,Economic evaluation of Kangaroo Mother Care: c...,health systems
3,Exact bidirectional algorithm for the least ex...,transportation systems
4,Optimal waterflooding management using an embe...,energy systems


In [13]:
# Work on a copy so the translated `copa` frame is not modified by the cleaning steps
copa_procesado = copa.copy()

# Remove special characters and convert uppercase letters to lowercase

import re
def caracteres_especiales(texto):
    """Replace special characters with spaces and normalize whitespace.

    Every character that is not an ASCII letter, a digit or whitespace is
    replaced by a space; runs of whitespace are then collapsed to one space
    and leading/trailing whitespace is removed, so a title that starts or
    ends with a special character (quotes, brackets, a final period, ...)
    does not keep a stray blank at its edges.

    Parameters
    ----------
    texto : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text (letter case is left unchanged).
    """
    muestra = r'[^a-zA-Z0-9\s]'  # Matches everything except ASCII letters, digits and whitespace
    obslimpia = re.sub(muestra, ' ', texto)
    obslimpia = re.sub(r'\s+', ' ', obslimpia)
    # Substituted characters at either end would otherwise leave a blank behind
    return obslimpia.strip()

# Strip special characters from every title, then standardize it to lowercase
copa_procesado["title"] = copa_procesado["title"].map(
    lambda titulo: caracteres_especiales(titulo).lower()
)

copa_procesado.head()

Unnamed: 0,title,classification
0,cochlear implants versus hearing aids in a mid...,health systems
1,cochlear implants versus hearing aids in a mid...,health systems
2,economic evaluation of kangaroo mother care co...,health systems
3,exact bidirectional algorithm for the least ex...,transportation systems
4,optimal waterflooding management using an embe...,energy systems


In [14]:
# Remove stop words and lemmatize the remaining words

import spacy
nlp = spacy.load("en_core_web_sm")  # spaCy's English language model

# Rebuild each title from the lemmas of its non-stop-word tokens
copa_procesado["title"] = [
    ' '.join(token.lemma_ for token in nlp(titulo) if not token.is_stop)
    for titulo in copa_procesado["title"]
]

copa_procesado.head()

Unnamed: 0,title,classification
0,cochlear implant versus hear aid middle income...,health systems
1,cochlear implant versus hear aid middle income...,health systems
2,economic evaluation kangaroo mother care cost ...,health systems
3,exact bidirectional algorithm expect travel ti...,transportation systems
4,optimal waterflooding management embed predict...,energy systems


In [18]:
# Remove duplicates

# Flags every repeated title except its first occurrence
es_duplicado = copa_procesado["title"].duplicated()
copa_duplicados = copa_procesado[es_duplicado]
copa1 = copa_procesado[~es_duplicado]

# Report how many rows were dropped and the resulting class distribution
print("El total de duplicados es ", len(copa_duplicados))
print("El nuevo total de observaciones de COPA son ", len(copa1))
print(copa1['classification'].value_counts())

El total de duplicados es  1
El nuevo total de observaciones de COPA son  117
classification
urban systems                    32
transportation systems           23
health systems                   19
production systems               17
energy systems                    9
agricultural systems              7
financial engineering systems     7
sustainable systems               3
Name: count, dtype: int64


### Datos Scopus

In [19]:
# Read the Scopus export and force the name of its first column to "title"
# (presumably the header of that column does not arrive as a clean "title" - confirm)
scopus = pd.read_csv('scopus_totales.csv', encoding='ISO-8859-1', delimiter=',')
columnas = ["title", *scopus.columns[1:]]
scopus.columns = columnas
scopus.head(15)

Unnamed: 0,title,classification
0,Assessing multivariate effect of best manageme...,agricultural systems
1,Predicting daily solar radiation using a novel...,agricultural systems
2,Multi-Hypothesis Tracking in a Graph-Based Wor...,agricultural systems
3,A novel two-stage multi-objective optimization...,agricultural systems
4,A customized multi-neighborhood search algorit...,agricultural systems
5,Evaluation of surface water quality in Heilong...,agricultural systems
6,Sustainable irrigation of pipeline fluid flow ...,agricultural systems
7,A hybrid Lagrangian-dispersion model for spray...,agricultural systems
8,Agent-based model development of a complex soc...,agricultural systems
9,Parametric analysis for exergetic optimisation...,agricultural systems


In [21]:
# Remove duplicated titles from the raw Scopus data (the first occurrence is kept)
valores_duplicados = scopus[scopus["title"].duplicated()]
scopus0 = scopus.drop_duplicates(subset='title', keep='first')

# These counts describe the Scopus sample, so the messages are labelled accordingly
print("El total de duplicados son ", len(valores_duplicados))
print("El total original de observaciones de Scopus son ", len(scopus))
print("El nuevo total de observaciones de Scopus son ", len(scopus0))
print(scopus0['classification'].value_counts())

El total de duplicados son  3841
El total original de observaciones de COPA son  53226
El nuevo total de observaciones de COPA son  49385
classification
agricultural systems             6996
energy systems                   6955
production systems               6770
health systems                   6752
transportation systems           6507
urban systems                    6438
sustainable systems              5773
financial engineering systems    3194
Name: count, dtype: int64


In [23]:
# Work on a copy so the deduplicated `scopus0` frame stays untouched
scopus_procesado = scopus0.copy()

# Remove special characters and lowercase every title
scopus_procesado["title"] = scopus_procesado["title"].map(
    lambda titulo: caracteres_especiales(titulo).lower()
)

scopus_procesado.head()

Unnamed: 0,title,classification
0,assessing multivariate effect of best manageme...,agricultural systems
1,predicting daily solar radiation using a novel...,agricultural systems
2,multi hypothesis tracking in a graph based wor...,agricultural systems
3,a novel two stage multi objective optimization...,agricultural systems
4,a customized multi neighborhood search algorit...,agricultural systems


In [25]:
# Drop stop words and replace every remaining token with its lemma
scopus_procesado["title"] = [
    ' '.join(token.lemma_ for token in nlp(titulo) if not token.is_stop)
    for titulo in scopus_procesado["title"]
]

scopus_procesado.head()

Unnamed: 0,title,classification
0,assess multivariate effect good management pra...,agricultural systems
1,predict daily solar radiation novel hybrid lon...,agricultural systems
2,multi hypothesis tracking graph base world mod...,agricultural systems
3,novel stage multi objective optimization model...,agricultural systems
4,customize multi neighborhood search algorithm ...,agricultural systems


In [28]:
# Check for duplicates again: titles that differed only in case, punctuation,
# stop words or inflection can become identical after the processing steps

valores_duplicados1 = scopus_procesado[scopus_procesado["title"].duplicated()]
scopus1 = scopus_procesado.drop_duplicates(subset='title', keep='first')

# These counts describe the Scopus sample, so the message is labelled accordingly
print("El total de duplicados son", len(valores_duplicados1))
print("El nuevo total de observaciones de Scopus son", len(scopus1))
print(scopus1['classification'].value_counts())

El total de duplicados son 5
El nuevo total de observaciones de COPA son 49380
classification
agricultural systems             6995
energy systems                   6955
production systems               6768
health systems                   6751
transportation systems           6507
urban systems                    6437
sustainable systems              5773
financial engineering systems    3194
Name: count, dtype: int64


### Unir las dos muestras

In [29]:
# Number of Scopus titles to draw per category so the classes end up balanced.
# In the recorded run each value equals 3201 minus the number of COPA titles of
# that category, which gives 3201 rows per class once both samples are joined.
totalxcat = {'agricultural systems': 3194, 'energy systems': 3192, 'financial engineering systems': 3194, 'health systems': 3182, 'transportation systems': 3178, 'production systems': 3184, 'sustainable systems': 3198, 'urban systems': 3169}

# Fixed seed: without it every run draws a different sample, so the exported
# dataset could not be reproduced
SEMILLA = 42

# Sample each category separately. Iterating over the groups (instead of
# `groupby.apply` reading the grouping column inside the lambda, which is
# deprecated since pandas 2.2) keeps the 'classification' column in every group.
muestras = [
    grupo.sample(n=totalxcat[categoria], random_state=SEMILLA)
    for categoria, grupo in scopus1.groupby('classification')
]
scopus2 = pd.concat(muestras, ignore_index=True)  # fresh 0..n-1 index
scopus2.tail()

Unnamed: 0,title,classification
25486,threshold model urban development,urban systems
25487,nonlinear relationship urban form street level...,urban systems
25488,dynamic trajectory base traffic dispersion met...,urban systems
25489,aerodynamic analysis ak 47 bullet move mach 2 ...,urban systems
25490,ucdnet deep learning model urban change detect...,urban systems


In [30]:
# Stack the COPA titles and the balanced Scopus sample under a fresh index
muestras_unidas = (copa1, scopus2)
datos = pd.concat(muestras_unidas, ignore_index=True)

# Total number of rows and rows per category
print(len(datos))
print(datos['classification'].value_counts())

25608
classification
health systems                   3201
transportation systems           3201
energy systems                   3201
urban systems                    3201
production systems               3201
agricultural systems             3201
sustainable systems              3201
financial engineering systems    3201
Name: count, dtype: int64


In [31]:
#Stemming

import spacy
from nltk.stem import PorterStemmer

# Load spaCy's English model (used for tokenizing/lemmatizing) and create the NLTK Porter stemmer
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

# Stemming helper
def stemming(texto):
    """Return `texto` with each token replaced by the Porter stem of its lemma.

    The text is tokenized with the module-level spaCy pipeline `nlp`; the
    lemma of every token is passed through the module-level `stemmer`, and
    the stems are joined with single spaces.
    """
    raices = []
    for token in nlp(texto):
        raices.append(stemmer.stem(token.lemma_))
    return " ".join(raices)

# Stem every title into a separate frame, leaving `datos` untouched
datos_finales = datos.copy()
datos_finales["title"] = datos["title"].map(stemming)

datos_finales.head(30)

Unnamed: 0,title,classification
0,cochlear implant versu hear aid middl incom co...,health systems
1,econom evalu kangaroo mother care cost util an...,health systems
2,exact bidirect algorithm expect travel time pa...,transportation systems
3,optim waterflood manag emb predict analyt model,energy systems
4,pedestrian evacu plan unveil evacu rout column...,urban systems
5,smart pool ai power covid 19 inform group test,health systems
6,column orient optim approach gener correl rand...,production systems
7,build environ origin rout destin associ bicycl...,transportation systems
8,short alpha reliabl path problem,transportation systems
9,role engin covid 19 pandem,urban systems


In [32]:
# Final touch-up: collapse every run of whitespace in a title into a single space
datos_finales["title"] = datos_finales["title"].map(
    lambda titulo: re.sub(r'\s+', ' ', titulo)
)

In [34]:
# Export the processed titles to CSV (the DataFrame index is not written)
ruta_salida = 'titulos_procesados.csv'
datos_finales.to_csv(ruta_salida, index=False)