Consistencia en strings para casos complejos
===

* Última actualización: Marzo 6, 2023 | [YouTube](https://www.youtube.com/watch?v=4DzusT3Nl-Q&list=PLEFpZ3YehTnDX6z1fx4rT0wkt80ythBhc&index=7)

In [1]:
import numpy as np
import pandas as pd

# Report the versions of the libraries this notebook runs with.
for library in (np, pd):
    print(library.__version__)

1.23.5
1.5.2


In [2]:
%%writefile /tmp/data.csv
id,raw_keyword
0,solar power forecast
1,solar power forecasting
2,solar power forecasts
3,convolutional neural network
4,convolutional neural networks
5,convolution neural network
6,Convolution neural networks
7,solar power forecasting
8,Convolutional neural network
9,solar power forecasts
10,prediction of solar power 
11,data analysis
12,analysis of data
13,convolution neural network
14,solar power forecasts
15,Convolution neural networks
16,convolution neural network

Overwriting /tmp/data.csv


In [3]:
# Load the keyword table that the previous cell wrote to disk.
csv_path = '/tmp/data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,id,raw_keyword
0,0,solar power forecast
1,1,solar power forecasting
2,2,solar power forecasts
3,3,convolutional neural network
4,4,convolutional neural networks
5,5,convolution neural network
6,6,Convolution neural networks
7,7,solar power forecasting
8,8,Convolutional neural network
9,9,solar power forecasts


In [4]:
##
## Look for inconsistent values: count how often each raw spelling occurs.
##
df["raw_keyword"].value_counts()

solar power forecasts            3
convolution neural network       3
solar power forecasting          2
Convolution neural networks      2
solar power forecast             1
convolutional neural network     1
convolutional neural networks    1
Convolutional neural network     1
prediction of solar power        1
data analysis                    1
analysis of data                 1
Name: raw_keyword, dtype: int64

In [5]:
##
## Build the working dataframe: `key` starts out as a copy of `raw_keyword`
## and is the column that the following cells normalize.
##
df = df.assign(key=lambda frame: frame["raw_keyword"])
df

Unnamed: 0,id,raw_keyword,key
0,0,solar power forecast,solar power forecast
1,1,solar power forecasting,solar power forecasting
2,2,solar power forecasts,solar power forecasts
3,3,convolutional neural network,convolutional neural network
4,4,convolutional neural networks,convolutional neural networks
5,5,convolution neural network,convolution neural network
6,6,Convolution neural networks,Convolution neural networks
7,7,solar power forecasting,solar power forecasting
8,8,Convolutional neural network,Convolutional neural network
9,9,solar power forecasts,solar power forecasts


In [6]:
##
## Convert every key to lowercase so that case differences do not matter
##
df["key"] = df["key"].str.lower()
df

Unnamed: 0,id,raw_keyword,key
0,0,solar power forecast,solar power forecast
1,1,solar power forecasting,solar power forecasting
2,2,solar power forecasts,solar power forecasts
3,3,convolutional neural network,convolutional neural network
4,4,convolutional neural networks,convolutional neural networks
5,5,convolution neural network,convolution neural network
6,6,Convolution neural networks,convolution neural networks
7,7,solar power forecasting,solar power forecasting
8,8,Convolutional neural network,convolutional neural network
9,9,solar power forecasts,solar power forecasts


In [7]:
##
## Install TextBlob and download the NLTK resources listed below.
## Everything in this cell is commented out; uncomment the lines to run them.
##
# !pip3 install --quiet textblob
#
# import nltk
#
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [8]:
##
## Lemmatization: split each key into words and lemmatize every word
## with the "v" part-of-speech tag. After this cell `key` holds lists.
##
from textblob import TextBlob


def _lemmatize_words(text):
    """Return the words of `text`, each one lemmatized with the "v" tag."""
    return [token.lemmatize("v") for token in TextBlob(text).words]


df["key"] = df["key"].map(_lemmatize_words)
df["key"]

0             [solar, power, forecast]
1             [solar, power, forecast]
2             [solar, power, forecast]
3     [convolutional, neural, network]
4     [convolutional, neural, network]
5       [convolution, neural, network]
6       [convolution, neural, network]
7             [solar, power, forecast]
8     [convolutional, neural, network]
9             [solar, power, forecast]
10      [prediction, of, solar, power]
11                    [data, analysis]
12                [analysis, of, data]
13      [convolution, neural, network]
14            [solar, power, forecast]
15      [convolution, neural, network]
16      [convolution, neural, network]
Name: key, dtype: object

In [9]:
# Sort the words of each key alphabetically and join them back into a single
# space-separated string, so word order no longer distinguishes two keys.
df["key"] = df["key"].map(sorted).str.join(" ")
df.sort_values(by='key')

Unnamed: 0,id,raw_keyword,key
11,11,data analysis,analysis data
12,12,analysis of data,analysis data of
16,16,convolution neural network,convolution network neural
13,13,convolution neural network,convolution network neural
15,15,Convolution neural networks,convolution network neural
5,5,convolution neural network,convolution network neural
6,6,Convolution neural networks,convolution network neural
8,8,Convolutional neural network,convolutional network neural
4,4,convolutional neural networks,convolutional network neural
3,3,convolutional neural network,convolutional network neural


In [10]:
# Manual whole-word fixes (the \b anchors prevent partial matches):
# drop "of", and rewrite "convolution" and "prediction" to the spelling used
# by the other keys. Removing a word leaves stray blanks and a substitution can
# break the alphabetical order, so the keys are tokenized and sorted again in
# the following cell.
word_fixes = [
    (r"\bof\b", ""),
    (r"\bconvolution\b", "convolutional"),
    (r"\bprediction\b", "forecast"),
]
for pattern, replacement in word_fixes:
    df["key"] = df["key"].str.replace(pattern, replacement, regex=True)
df.sort_values(by='key')

Unnamed: 0,id,raw_keyword,key
10,10,prediction of solar power,power forecast solar
11,11,data analysis,analysis data
12,12,analysis of data,analysis data
8,8,Convolutional neural network,convolutional network neural
13,13,convolution neural network,convolutional network neural
15,15,Convolution neural networks,convolutional network neural
16,16,convolution neural network,convolutional network neural
5,5,convolution neural network,convolutional network neural
4,4,convolutional neural networks,convolutional network neural
3,3,convolutional neural network,convolutional network neural


In [11]:
# Split every key into words once more, sort the words and rebuild the string
# so that the keys edited by the replacements share one canonical form.
df["key"] = (
    df["key"]
    .map(lambda text: sorted(TextBlob(text).words))
    .str.join(" ")
)
df.sort_values(by='key')

Unnamed: 0,id,raw_keyword,key
12,12,analysis of data,analysis data
11,11,data analysis,analysis data
8,8,Convolutional neural network,convolutional network neural
13,13,convolution neural network,convolutional network neural
15,15,Convolution neural networks,convolutional network neural
6,6,Convolution neural networks,convolutional network neural
16,16,convolution neural network,convolutional network neural
4,4,convolutional neural networks,convolutional network neural
3,3,convolutional neural network,convolutional network neural
5,5,convolution neural network,convolutional network neural


In [12]:
# For every normalized key, collect the list of raw keywords that produced it.
keywords = (
    df
    .groupby(['key'], as_index=False)
    .agg({'raw_keyword': list})
)
keywords

Unnamed: 0,key,raw_keyword
0,analysis data,"[data analysis, analysis of data]"
1,convolutional network neural,"[convolutional neural network, convolutional n..."
2,forecast power solar,"[solar power forecast, solar power forecasting..."


In [13]:
# Map each raw keyword to the first raw keyword of its group; that first
# element acts as the representative spelling of the whole group.
mapping = {}
for group in keywords["raw_keyword"]:
    representative = group[0]
    for raw in group:
        mapping[raw] = representative
mapping

{'data analysis': 'data analysis',
 'analysis of data': 'data analysis',
 'convolutional neural network': 'convolutional neural network',
 'convolutional neural networks': 'convolutional neural network',
 'convolution neural network': 'convolutional neural network',
 'Convolution neural networks': 'convolutional neural network',
 'Convolutional neural network': 'convolutional neural network',
 'solar power forecast': 'solar power forecast',
 'solar power forecasting': 'solar power forecast',
 'solar power forecasts': 'solar power forecast',
 'prediction of solar power ': 'solar power forecast'}

In [14]:
# Add the representative spelling of each row's raw keyword as a new column.
df = df.assign(cleaned_keyword=lambda frame: frame["raw_keyword"].map(mapping))
df

Unnamed: 0,id,raw_keyword,key,cleaned_keyword
0,0,solar power forecast,forecast power solar,solar power forecast
1,1,solar power forecasting,forecast power solar,solar power forecast
2,2,solar power forecasts,forecast power solar,solar power forecast
3,3,convolutional neural network,convolutional network neural,convolutional neural network
4,4,convolutional neural networks,convolutional network neural,convolutional neural network
5,5,convolution neural network,convolutional network neural,convolutional neural network
6,6,Convolution neural networks,convolutional network neural,convolutional neural network
7,7,solar power forecasting,forecast power solar,solar power forecast
8,8,Convolutional neural network,convolutional network neural,convolutional neural network
9,9,solar power forecasts,forecast power solar,solar power forecast
