In [2]:
from difflib import SequenceMatcher
import pandas as pd 
import numpy as np
from lib.location_class import Location
import regex as re
import unicodedata
from lib.text_processor_class import *

# Text processor

I need to create a text processor in order to:
- Clean strings, Remove all not alphanumeric values
- Check how similar are all words of a list and replace most frequent 
- Check the language of the comments
- Check if a comment is possitive or negative based on the language

I will include all this functions inside a class called text_processor

#### Retrieve the data

In [3]:
ascent_df = pd.read_csv('../../databases/ascent_grade_correction.csv', low_memory=False)
lc = Location(ascent_df)

Location class initialized


In [6]:
crags = lc.crags_in_country('ESP')
myset1 = set(crags)
myset1

{nan,
 'VISTAMAR',
 'risco de villarejo',
 'Pradera del Turbón',
 'RASICA (Carbellino)',
 'Hces de Vegacervera',
 'Costa Jaizkibel',
 'sella.',
 'castellar',
 'sta.coloma farners',
 'Sunbilla',
 'Ordal',
 'Face Est',
 'La Pobla de Segur',
 'canelles',
 'El Palomar',
 'Tarteru',
 'yoyo-drea',
 'Cresta del Gallo, Murcia',
 'Dronningen',
 'es pixarells',
 'Buendia',
 "Loja'",
 'Solarium',
 'Parga (ugo)',
 'benabarre',
 'Valencia del ventoso',
 'Torcal',
 'Peña Aragonesas',
 "Sa font d'es rafal",
 'Can Ximet',
 'frontales',
 'Arran de Batistot',
 'Punta de Gabás',
 'como la vida misma',
 'muro central',
 'bellavista',
 'La puerta falsa',
 'Tavernes',
 'La  Pedriza',
 'Peñalara',
 'Arenas de Cabrales',
 'Bobeda gandia',
 'les escletxes (el papiol)',
 'Cala varquez',
 'Cuatrovalles',
 'Secret Spot',
 'el bellus',
 'Cara sur.',
 'Ermita de arta',
 'la morera',
 'castelldans',
 'Grillons',
 'Asombrao',
 'La panocha',
 'Las perchas',
 'peña castilla',
 'Santivañez de Bejar',
 'Bruce Lee Buttres

## · String cleaning

Lowercase

In [11]:
def lower_array(a):
    return [(str(word)).lower() for word in a]

In [12]:
crags_low = lower_array(crags)
myset2 = set(crags_low)
len(myset2)

7099

Remove accents

In [13]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

def strip_accents_array(a):
    return [strip_accents(str(word)) for word in a]

In [14]:
strip_accents('qué pach¢aà neeng')

'que pach¢aa neeng'

In [15]:
crags_strip = strip_accents_array(crags_low)
myset3 = set(crags_strip)
len(myset3)

6728

Remove punctuations

In [16]:
def remove_punct (s):
    # replace - and / by space
    s0 = s.replace('-', ' ').replace('/', ' ')
    
    # replace 2+ spaces by 1 space
    t = re.compile(r"\s+")
    s1 = t.sub(" ", s0).strip()
    
    # remove punctuations
    s2 = re.sub(r'[^A-Za-z ]+', '',s1)
    
    # remove first space   
    s3 = re.sub('^\s', '',s2)

    # remove last space   
    s4 = s3.rstrip()
    
    return s4

def remove_punct_array(a):
    return [remove_punct(str(word)) for word in a]

In [17]:
remove_punct(" Éramos   pocos y       ,)?7df ri9ío la abuela#")

'ramos pocos y df rio la abuela'

In [18]:
crag_punct = remove_punct_array(crags_strip)
myset4 = set(crag_punct)
len(myset4)

6507

Remove first "el, la, the"

In [19]:
def remove_first_the(s1):
    return [re.sub('(?:^|(?:[.!?]\s))(the |el |la |las |los )', '',word) for word in s1]

In [21]:
remove_first_the(["los dias","las noches"])

['dias', 'noches']

#### We put all together in one function

In [22]:
def cleaning_function(a):
    '''
    function used for clining a list of strings    
    '''    
    a1 = lower_array(a)
    
    a2 = strip_accents_array(a1)
    
    a3 = remove_punct_array(a2)
    
    a4 = remove_first_the(a3)
    
    return a4

In [23]:
cleaning_function([" ·el!$% %$CaStor ·de flor$es "])

['castor de flores']

Using the new class created

In [8]:
txt = Text_processor()
txt.cleaning_function(crags)

Initialized text_processor class


['arboli',
 'montsant',
 'siurana',
 'chorro',
 'port de soller',
 'mallorca',
 'jerica',
 'cuenca',
 'siurana',
 'onate',
 'baltzola',
 'larraona',
 'sella',
 'atauri',
 'oro',
 'apellaniz',
 'montgrony',
 'etxauri',
 'egino',
 'patones',
 'mallorca',
 'rodellar',
 'cala magraner',
 'terradets',
 'salem',
 'salinas',
 'gandia',
 'madrid',
 'basqueland',
 'vilanova de prades',
 'riglos',
 'lliber',
 'olta',
 'sierra de toix',
 'forada',
 'escorial',
 'pedriza',
 'liendo',
 'montserrat',
 'torcal',
 'riba',
 'port de soller',
 'sadernes',
 'gandia',
 'araotz',
 'ramales',
 'convento',
 'teverga',
 'desplomilandia',
 'arico',
 'cala santanyi',
 'sestret',
 'pelugano',
 'montanejos',
 'sant llorenc de montgai',
 'sant llorenc del munt',
 'camarasa',
 'sa gubia',
 'sestret',
 'valdegobia',
 'cahorros',
 'loja',
 'loja',
 'morata de jalon',
 'vellon',
 'agujas rojas',
 'cogollos',
 'vellon',
 'quiros',
 'morata de jalon',
 'valeria',
 'canon de uceda',
 'reguchillo',
 'mussara',
 'pedrosa',

## · Similarity

If they are included into each other

In [25]:
def included(a,b):
    # If the word is included in a phrase
    a = " " + a + " "
    b = " " + b + " "
    if (a in b) or (b in a):
        return True
    return False        

In [26]:
included("hola", "hola mundo")

True

If they look similar

In [27]:
def similar(a, b):
    return SequenceMatcher(None, a, b, autojunk=True).ratio()

In [28]:
similar('hola','olo')

0.5714285714285714

Replace by the most frequent

In [29]:
def replace_frequent(a,b,ls):
    '''
    Compare the ocurrences of each string of a list and 
    then returns a list with the strings replaced by the most frequent value
    '''
    if ls.count(a) > ls.count(b):
        return [a if (s == b) else s for s in ls ]
    else:
        return [b if (s == a) else s for s in ls ]
    

In [30]:
ls = ['a1','a2','a3','b', 'c', 'a', 'd', 'b', 'c', 'd', 'c', 'a', 'b','a','a1','a2','a','b']

In [31]:
ls_rep = replace_frequent('a1','b',ls)
ls_rep

['b',
 'a2',
 'a3',
 'b',
 'c',
 'a',
 'd',
 'b',
 'c',
 'd',
 'c',
 'a',
 'b',
 'a',
 'b',
 'a2',
 'a',
 'b']

Put all together

In [32]:
def Replace_similar_array(arr, similarity = 0.9, similarity_ask = 1, show = False):
    '''
    This functions compares every string of a list with each other and replaces if they are similar.
    It keeps the most frequent value and replaces the occurences of the other one

    input:
    arr -> np.array with the strings we want to compare
    similarity -> the similarity threshold, automatically does the correction
    similarity_ask -> this threshold asks the user if the text is similar or not before correcting
    show -> True for a results viewer

    output:
    The np.array with the similar values replaced
    '''

    str_arr = arr.copy()
    

    for index1 in range(len(str_arr)):
        for index2 in range(index1 + 1,len(str_arr)):
                            
            name1 = str_arr[index1]
            name2 = str_arr[index2] 
                            
                            
            if name1 != name2:
                inc = included(name1,name2)

                if inc:
                    str_arr = replace_frequent(name1,name2,str_arr)

                else:               
                    sim = similar(name1,name2)

                    if (sim > similarity) & (sim != 1):

                        if (show == True):
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)
                        str_arr = replace_frequent(name1,name2,str_arr) 

                    elif (sim > similarity_ask) & (sim != 1):

                        print('Are they the same? (n if not) -> ', name1, " <<>> ",name2, " (", sim, ")")
                        x = input()

                        if x != 'n':
                            str_arr = replace_frequent(name1,name2,str_arr)
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)


    return str_arr  

In [34]:
%%time
crag_sim = Replace_similar_array(arr = crag_punct, 
              similarity = 0.85, # playing a little with this similarity values found that .8 is a good value
              similarity_ask = 1,              
              show = True)

Replacing all the values with larboli by arboli . Similarity:  0.9230769230769231
Replacing all the values with arbolii by arboli . Similarity:  0.9230769230769231
Replacing all the values with aboli by arboli . Similarity:  0.9090909090909091
Replacing all the values with arboi by arboli . Similarity:  0.9090909090909091
Replacing all the values with arbolli by arboli . Similarity:  0.9230769230769231
Replacing all the values with mont sant by montsant . Similarity:  0.9411764705882353
Replacing all the values with mon sant by mont sant . Similarity:  0.9411764705882353
Replacing all the values with monsant by mont sant . Similarity:  0.875
Replacing all the values with motsant by mont sant . Similarity:  0.875
Replacing all the values with suriana by siurana . Similarity:  0.8571428571428571
Replacing all the values with siruana by siurana . Similarity:  0.8571428571428571
Replacing all the values with suiurana by siurana . Similarity:  0.9333333333333333
Replacing all the values wit

Replacing all the values with el trocal by el torcal . Similarity:  0.8888888888888888
Replacing all the values with lariba by la riba . Similarity:  0.9230769230769231
Replacing all the values with la rib by la riba . Similarity:  0.9230769230769231
Replacing all the values with la arriba by la riba . Similarity:  0.875
Replacing all the values with sadernas by sadernes . Similarity:  0.875
Replacing all the values with sadenas by sadernas . Similarity:  0.9333333333333333
Replacing all the values with arotz by araotz . Similarity:  0.9090909090909091
Replacing all the values with arahotz by araotz . Similarity:  0.9230769230769231
Replacing all the values with rafales by ramales . Similarity:  0.8571428571428571
Replacing all the values with el covento by el convento . Similarity:  0.9523809523809523
Replacing all the values with t verga by teverga . Similarity:  0.8571428571428571
Replacing all the values with te ver ga by teverga . Similarity:  0.875
Replacing all the values with d

KeyboardInterrupt: 

Thi process is quite slow, we want to make it <b> faster </b> 

Replacements table: We are going to create a <b>table</b> of replacements and then replace from it the original list

In [35]:
def replace_frequent_uniq(a,b,uniq_ls,orig_ls):
    
    freq = a if (orig_ls.count(a) > orig_ls.count(b)) else b
   
    if freq == a:
        return [a if (s == b) else s for s in uniq_ls ]
    else:
        return [b if (s == a) else s for s in uniq_ls ]

In [36]:
uniq_ls = pd.array(ls).unique()
ls_rep = replace_frequent_uniq('a','b',list(uniq_ls),ls)

In [37]:
rep_table = pd.DataFrame({'original':uniq_ls,'most_freq':ls_rep})
rep_table

Unnamed: 0,original,most_freq
0,a1,a1
1,a2,a2
2,a3,a3
3,b,b
4,c,c
5,a,b
6,d,d


In [38]:
[rep_table[rep_table.original == value].most_freq.values[0] for value in ls]

['a1',
 'a2',
 'a3',
 'b',
 'c',
 'b',
 'd',
 'b',
 'c',
 'd',
 'c',
 'b',
 'b',
 'b',
 'a1',
 'a2',
 'b',
 'b']

In [39]:
def replace_frequent_with_table(orig_ls,similarity = 0.85, similarity_ask = 1, show = False):
    '''
    This method is replacing the similar values of a list by the most frequent one.
    In this approach first we calculate the unique values and then we make the replacement table.
    Finally we search for the replacements of the list on this table saving us time (exponentially)
    
    input:
    orig_ls -> the list
    show -> if true it shows all the logs
    
    output:
    it returns the list
    
    '''
    arr_original = orig_ls.copy()
    
    arr_uniq = pd.array(arr_original).unique()
    
    arr_uniq_copy = arr_uniq.copy()

    for index1 in range(len(arr_uniq)):
        for index2 in range(index1 + 1,len(arr_uniq)):
                            
            name1 = arr_uniq_copy[index1]
            name2 = arr_uniq_copy[index2] 
                            
                            
            if name1 != name2:
                inc = included(name1,name2)

                if inc:
                    arr_uniq_copy = replace_frequent_uniq(name1, name2, arr_uniq_copy, arr_original)

                else:               
                    sim = similar(name1,name2)

                    if (sim > similarity) & (sim != 1):

                        if (show == True):
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)
                        arr_uniq_copy = replace_frequent_uniq(name1,name2,arr_uniq_copy,arr_original) 

                    elif (sim > similarity_ask) & (sim != 1):

                        print('Are they the same? (n if not) -> ', name1, " <<>> ",name2, " (", sim, ")")
                        x = input()

                        if x != 'n':
                            arr_uniq_copy = replace_frequent_uniq(name1, name2, arr_uniq_copy, arr_original)
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)
                            
                            
    
    # We create the table
    rep_table = pd.DataFrame({'original':arr_uniq,'most_freq':arr_uniq_copy})
    
    if show:
        display(rep_table)
    
    ls_orig_rep = [rep_table[rep_table.original == value].most_freq.values[0] for value in orig_ls]
    
    return ls_orig_rep

In [40]:
ls = ['a1','a2','a3','b', 'c1', 'a', 'd', 'b1', 'c', 'd', 'c', 'a1', 'b','a','a1','a2','a','b1',
     'a1','a2','a3','b', 'c2', 'a', 'd', 'b2', 'c', 'd', 'c', 'a2', 'b','a','a1','a2','a','b',
     'a1','a2','a3','b', 'c3', 'a', 'd', 'b1', 'c', 'd', 'c', 'a1', 'b','a','a1','a2','a','b2']

In [41]:
ans = replace_frequent_with_table(ls,similarity = 0.5, show = True)

Replacing all the values with a by a1 . Similarity:  0.6666666666666666
Replacing all the values with a by a2 . Similarity:  0.6666666666666666
Replacing all the values with a by a3 . Similarity:  0.6666666666666666
Replacing all the values with b1 by b . Similarity:  0.6666666666666666
Replacing all the values with b2 by b . Similarity:  0.6666666666666666
Replacing all the values with c by c1 . Similarity:  0.6666666666666666
Replacing all the values with c2 by c . Similarity:  0.6666666666666666
Replacing all the values with c3 by c . Similarity:  0.6666666666666666


Unnamed: 0,original,most_freq
0,a1,a
1,a2,a
2,a3,a
3,b,b
4,c1,c
5,a,a
6,d,d
7,b1,b
8,c,c
9,c2,c


Lets see how fast it is:

In [48]:
%%time
crag_sim = replace_frequent_with_table(orig_ls = crag_punct,
                                       show = False)

CPU times: user 6min 53s, sys: 160 ms, total: 6min 53s
Wall time: 6min 53s


#### ...we see that we saved a lot of time with this table method

In [49]:
myset5 = set(crag_sim)
len(myset5)

3542

In [50]:
pd.Series(crag_sim).value_counts().head(10)

sant llorenc del munt    33
a                        33
siurana                  31
st llorenc de montgai    31
                         29
sestret                  29
la pedriza               28
albarracin               25
margalef                 24
chulilla                 23
dtype: int64

A big improvement respect the original:

In [51]:
pd.Series(crags).value_counts().head(10)

Arboli              1
merola              1
Petonn do Xalo      1
Port d'escanonge    1
crevillent          1
Cinfuens            1
Oncins              1
Priego              1
El pedriza          1
Cabeço gordo        1
dtype: int64

We now put everything inside the text_processor class

In [None]:
crag_sim = txt.replace_frequent_with_table(arr = sorted(myset5))
myset6 = set(crag_sim)
len(myset6)

## · Language check

In [52]:
from polyglot.detect import Detector
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentiment_analysis_spanish import sentiment_analysis

In [53]:
mixed_text = "La via es muy buena"
det = Detector(mixed_text, quiet=True)
print(det.language)

name: Spanish     code: es       confidence:  95.0 read bytes:  1280


In [54]:
def Language_check(mixed_text, lang_check = 'en'):
    '''
    This function checks if the language of the text is the expected

    inputs:
    - mixef_text => text to analyze
    - lang_check => the language we want to validate

    output:
    boolean, true or false
    '''



    det = Detector(mixed_text, quiet=True)
    lang = det.languages[0].code
    conf = det.languages[0].confidence

    if (lang == lang_check) and (conf > 0.5):
        return True

    return False

In [55]:
Language_check("That was a good route",'en')

True

Now we use the class text_processor

In [58]:
txt.Language_check("La via es buenisima, y lo demas tambien",'es')

True

## · Sentiment

This function will messure the sentiment of a comment

In [59]:
analyzer_en = SentimentIntensityAnalyzer()
analyzer_es = sentiment_analysis.SentimentAnalysisSpanish()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [60]:
analyzer_en.polarity_scores("Best climb of my life")

{'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'compound': 0.6369}

In [61]:
analyzer_es.sentiment("Una escalada muy mala")

0.0004283214610584631

In [62]:
# English
def English_sentiment(sentence, analyzer_en, show = False):
    '''
    This function analyzes a sentence in english to see if it is possitive or negative

    inputs:
    - sentence => text to analyze
    - show => if its True then it will show the results 

    output:
    returns a float between +-1 whith the result 
    '''   


    vs = analyzer_en.polarity_scores(sentence)
    if (vs['compound'] != 0) and show:
        print( '----------------------------------------------' )
        print(sentence)
        print(vs['compound'])

    return vs['compound']

#Spanish
def Spanish_sentiment(sentence, analyzer_es, show = False):
    '''
    This function analyzes a sentence in spanish to see if it is possitive or negative

    inputs:
    - sentence => text to analyze
    - show => if its True then it will show the results 

    output:
    returns a float between +-1 whith the result 
    '''   

    vs = analyzer_es.sentiment(sentence)*2-1
    if (vs != 0) and show:
        print( '----------------------------------------------' )
        print(sentence)
        print(vs)

    return vs

In [63]:
English_sentiment("Good route", analyzer_en, show = True)

----------------------------------------------
Good route
0.4404


0.4404

In [64]:
Spanish_sentiment("La via una maravilla", analyzer_es, show = True)

----------------------------------------------
La via una maravilla
0.9938294856508776


0.9938294856508776

### Language check + Sentiment

In [65]:
def comment_sentiment(comment, analyzer_en, analyzer_es, show = False):
    '''
    This functions takes a comment, checks the language and if its english it checks the sentiment
    input:
    - comment -> text to analyze
    - show -> if you want to show results
    
    output:
    sentiment is a value between +-1
    ''' 
    
    try:
        # Check if its in english
        is_english = Language_check(comment)

        if is_english:
            # Check the sentiment
            return Sentiment_analyzer(comment, analyzer_en, show)

        # Works for both
        is_spanish = Language_check(comment,'es')
        is_italian = Language_check(comment,'it')

        if is_spanish or is_italian:
            # Check the sentiment
            return Spanish_sentiment(comment, analyzer_es, show)            

        return 0
        
    except:
        return 0

In [66]:
comment_sentiment("Good route, guys", analyzer_en, analyzer_es, show = True)

0

In [67]:
comment_sentiment("La via una maravilla y lo demas", analyzer_en, analyzer_es, show = True)

----------------------------------------------
La via una maravilla y lo demas
0.989722003917165


0.989722003917165

Now we place it into the <b>class</b> in order to be able to use it in the future

In [68]:
txt.Language_check("Good route, guys, it was amazing")

True

In [69]:
txt.English_sentiment("Good route, guys, it was amazing")

0.7717

In [70]:
txt.comment_sentiment("Good route, guys, it was amazing", show = True)

----------------------------------------------
Good route, guys, it was amazing
0.7717


0.7717

In [71]:
txt.comment_sentiment("La via una maravilla y lo demas", show = True)

----------------------------------------------
La via una maravilla y lo demas
0.989722003917165


0.989722003917165