In [1]:
from difflib import SequenceMatcher
import pandas as pd 
import numpy as np
from location_class import location
import regex as re
import unicodedata
from text_processor_class import *

# Text processor

I need to create a text processor in order to:
- Clean strings, Remove all not alphanumeric values
- Check how similar are all words of a list and replace most frequent 
- Check the language of the comments
- Check if a comment is possitive or negative based on the language

I will include all this functions inside a class called text_processor

#### Retrieve the data

In [2]:
ascent_df = pd.read_csv('../../databases/ascent_grade_correction.csv', low_memory=False)
lc = location(ascent_df)

Location class initialized


In [3]:
crags = lc.crags_in_country('SWE')
myset1 = set(crags)
myset1

{nan,
 'Chip väggen',
 'Block 1',
 'Henriksdal',
 'Sveafallen',
 'segersäng',
 'Lostenen /Falun',
 'vikdal',
 'wryry',
 'klockarnäs',
 'Djupdalen',
 'Ekensberg',
 'Forsby, Västerås',
 'Brommablocken',
 'Norra Gryta, Motörheadblocket',
 'Stavsjöåsen',
 'garpedans',
 'Cashväggen',
 'Djura',
 'Borgväggen, Sollentuna',
 'Ulorna norra',
 'Lögarbergen',
 'Grottan',
 'skansberget',
 'Magreteberg',
 'Slattefors',
 'Brevikskorset',
 'Nordens Ark',
 'furukullen',
 '"de små"',
 'Skatås',
 'Sätra',
 'Mosjön',
 'Norra ravinen',
 'Brudis',
 'Evenröd',
 'Hult',
 'Gläntan',
 'N Svalehult (lillråttan)',
 'Ragnhilds näsa',
 'Campusblocket',
 'Hållsta',
 'Jättarna i Kvibille',
 'Snibe stua',
 'mauritzholm',
 'Lilla berget',
 'oldberget',
 'Grå väggen',
 'lötsjöbouldern',
 'telefonplan',
 'Udden',
 'Sörfjärden',
 'Stenebygrottan',
 'Karlshamn',
 'C-3',
 'Esporlas',
 'årjängsberget',
 'Vulturino',
 'Kvarnekas. Dals-Ed',
 'Tortuna',
 'Gårdskär',
 'Källbergahygget',
 'Sundsvall/Södra berget',
 'Björnöslingan

## · String cleaning

Lowercase

In [4]:
def lower_array(a):
    return [(str(word)).lower() for word in a]

In [5]:
crags_low = lower_array(crags)
myset2 = set(crags_low)
len(myset2)

2345

Remove accents

In [6]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

def strip_accents_array(a):
    return [strip_accents(str(word)) for word in a]

In [7]:
strip_accents('qué pach¢aà neeng')

'que pach¢aa neeng'

In [8]:
crags_strip = strip_accents_array(crags_low)
myset3 = set(crags_strip)
len(myset3)

2307

Remove punctuations

In [9]:
def remove_punct (s):
    # replace - and / by space
    s0 = s.replace('-', ' ').replace('/', ' ')
    
    # replace 2+ spaces by 1 space
    t = re.compile(r"\s+")
    s1 = t.sub(" ", s0).strip()
    
    # remove punctuations
    s2 = re.sub(r'[^A-Za-z ]+', '',s1)
    
    # remove first space   
    s3 = re.sub('^\s', '',s2)

    # remove last space   
    s4 = s3.rstrip()
    
    return s4

def remove_punct_array(a):
    return [remove_punct(str(word)) for word in a]

In [10]:
remove_punct(" Éramos   pocos y       ,)?7df ri9ío la abuela#")

'ramos pocos y df rio la abuela'

In [11]:
crag_punct = remove_punct_array(crags_strip)
myset4 = set(crag_punct)
len(myset4)

2233

#### We put all together in one function

In [12]:
def cleaning_function(a):
    '''
    function used for clining a list of strings    
    '''    
    a1 = lower_array(a)
    
    a2 = strip_accents_array(a1)
    
    a3 = remove_punct_array(a2)
    
    return a3

Using the new class created

In [13]:
txt = text_processor()
txt.cleaning_function(crags_strip)

Initialized text_processor class


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


['sjoanda',
 'gubben i berget',
 'tjornbroklippan',
 'seglora',
 'aspen',
 'utby',
 'svanvik',
 'vastervik',
 'hylteberget',
 'viks kile',
 'korpaberget',
 'vindon',
 'tollsjo',
 'rasklippan',
 'nacka kvarn',
 'kjugekull',
 'flaten',
 'farsta',
 'brudberget',
 'uringe',
 'agelsjon',
 'dynestad',
 'gunnilse',
 'lexby',
 'munchenbryggeriet',
 'hogdalen',
 'ekstubbeberget',
 'gaseborg',
 'dodskalleberget',
 'kulberget',
 'kullaberg',
 'kallsberget',
 'hundberget',
 'niemisel',
 'blaberget',
 'aberget',
 'ornberget',
 'palstorp',
 'solvik',
 'stugun',
 'bohuslan',
 'vo stanarna',
 'sjotorpsberget',
 'brommaplan',
 'nybro',
 'dyviksudd',
 'stennaset',
 'traskberget',
 'uddeuddberget',
 'margreteberg',
 'stegeborg',
 'ingaro',
 'vrakviken',
 'hono',
 'starberget',
 'krokviken',
 'norra s vall',
 'henriksdal',
 'skevik',
 'midskogs',
 'brattberget',
 'skravelsjo',
 'rosendal',
 'luossa',
 'ramberget',
 'vulturino',
 'orminge',
 'alvnas',
 'anneberg',
 'i',
 'kanalklippan',
 'svarttjarn',
 'be

## · Similarity

If they are included into each other

In [14]:
def included(a,b):
    # If the word is included in a phrase
    a = " " + a + " "
    b = " " + b + " "
    if (a in b) or (b in a):
        return True
    return False        

In [15]:
included("hola", "hola mundo")

True

If they look similar

In [16]:
def similar(a, b):
    return SequenceMatcher(None, a, b, autojunk=True).ratio()

In [17]:
similar('hola','olo')

0.5714285714285714

Replace by the most frequent

In [18]:
def replace_frequent(a,b,ls):
    '''
    Compare the ocurrences of each string of a list and 
    then returns a list with the strings replaced by the most frequent value
    '''
    if ls.count(a) > ls.count(b):
        return [a if (s == b) else s for s in ls ]
    else:
        return [b if (s == a) else s for s in ls ]
    

In [95]:
ls = ['a1','a2','a3','b', 'c', 'a', 'd', 'b', 'c', 'd', 'c', 'a', 'b','a','a1','a2','a','b']

In [96]:
ls_rep = replace_frequent('a1','b',ls)
ls_rep

['b',
 'a2',
 'a3',
 'b',
 'c',
 'a',
 'd',
 'b',
 'c',
 'd',
 'c',
 'a',
 'b',
 'a',
 'b',
 'a2',
 'a',
 'b']

Put all together

In [97]:
def Replace_similar_array(arr, similarity = 0.9, similarity_ask = 1, show = False):
    '''
    This functions compares every string of a list with each other and replaces if they are similar.
    It keeps the most frequent value and replaces the occurences of the other one

    input:
    arr -> np.array with the strings we want to compare
    similarity -> the similarity threshold, automatically does the correction
    similarity_ask -> this threshold asks the user if the text is similar or not before correcting
    show -> True for a results viewer

    output:
    The np.array with the similar values replaced
    '''

    str_arr = arr.copy()
    

    for index1 in range(len(str_arr)):
        for index2 in range(index1 + 1,len(str_arr)):
                            
            name1 = str_arr[index1]
            name2 = str_arr[index2] 
                            
                            
            if name1 != name2:
                inc = included(name1,name2)

                if inc:
                    str_arr = replace_frequent(name1,name2,str_arr)

                else:               
                    sim = similar(name1,name2)

                    if (sim > similarity) & (sim != 1):

                        if (show == True):
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)
                        str_arr = replace_frequent(name1,name2,str_arr) 

                    elif (sim > similarity_ask) & (sim != 1):

                        print('Are they the same? (n if not) -> ', name1, " <<>> ",name2, " (", sim, ")")
                        x = input()

                        if x != 'n':
                            str_arr = replace_frequent(name1,name2,str_arr)
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)


    return str_arr  

In [94]:
%%time
crag_sim = Replace_similar_array(arr = crag_punct, 
              similarity = 0.85, # playing a little with this similarity values found that .8 is a good value
              similarity_ask = 1,              
              show = False)

CPU times: user 1min 2s, sys: 15.7 ms, total: 1min 2s
Wall time: 1min 2s


Thi process is quite slo, we want to make it <b> faster </b> 

Replacements table: We are going to create a <b>table</b> of replacements and then replace from it the original list

In [166]:
def replace_frequent_uniq(a,b,uniq_ls,orig_ls):
    
    freq = a if (orig_ls.count(a) > orig_ls.count(b)) else b
   
    if freq == a:
        return [a if (s == b) else s for s in uniq_ls ]
    else:
        return [b if (s == a) else s for s in uniq_ls ]

In [167]:
uniq_ls = pd.array(ls).unique()
ls_rep = replace_frequent_uniq('a','b',list(uniq_ls),ls)

In [168]:
rep_table = pd.DataFrame({'original':uniq_ls,'most_freq':ls_rep})
rep_table

Unnamed: 0,original,most_freq
0,a1,a1
1,a2,a2
2,a3,a3
3,b,a
4,c1,c1
5,a,a
6,d,d
7,b1,b1
8,c,c
9,c2,c2


In [169]:
[rep_table[rep_table.original == value].most_freq.values[0] for value in ls]

['a1',
 'a2',
 'a3',
 'a',
 'c1',
 'a',
 'd',
 'b1',
 'c',
 'd',
 'c',
 'a1',
 'a',
 'a',
 'a1',
 'a2',
 'a',
 'b1',
 'a1',
 'a2',
 'a3',
 'a',
 'c2',
 'a',
 'd',
 'b2',
 'c',
 'd',
 'c',
 'a2',
 'a',
 'a',
 'a1',
 'a2',
 'a',
 'a',
 'a1',
 'a2',
 'a3',
 'a',
 'c3',
 'a',
 'd',
 'b1',
 'c',
 'd',
 'c',
 'a1',
 'a',
 'a',
 'a1',
 'a2',
 'a',
 'b2']

In [170]:
def replace_frequent_with_table(orig_ls,similarity = 0.85, similarity_ask = 1, show = False):
    '''
    This method is replacing the similar values of a list by the most frequent one.
    In this approach first we calculate the unique values and then we make the replacement table.
    Finally we search for the replacements of the list on this table saving us time (exponentially)
    
    input:
    orig_ls -> the list
    show -> if true it shows all the logs
    
    output:
    it returns the list
    
    '''
    arr_original = orig_ls.copy()
    
    arr_uniq = pd.array(arr_original).unique()
    
    arr_uniq_copy = arr_uniq.copy()

    for index1 in range(len(arr_uniq)):
        for index2 in range(index1 + 1,len(arr_uniq)):
                            
            name1 = arr_uniq_copy[index1]
            name2 = arr_uniq_copy[index2] 
                            
                            
            if name1 != name2:
                inc = included(name1,name2)

                if inc:
                    arr_uniq_copy = replace_frequent_uniq(name1, name2, arr_uniq_copy, arr_original)

                else:               
                    sim = similar(name1,name2)

                    if (sim > similarity) & (sim != 1):

                        if (show == True):
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)
                        arr_uniq_copy = replace_frequent_uniq(name1,name2,arr_uniq_copy,arr_original) 

                    elif (sim > similarity_ask) & (sim != 1):

                        print('Are they the same? (n if not) -> ', name1, " <<>> ",name2, " (", sim, ")")
                        x = input()

                        if x != 'n':
                            arr_uniq_copy = replace_frequent_uniq(name1, name2, arr_uniq_copy, arr_original)
                            print("Replacing all the values with",name2,"by", name1,". Similarity: ",sim)
                            
                            
    
    # We create the table
    rep_table = pd.DataFrame({'original':arr_uniq,'most_freq':arr_uniq_copy})
    
    if show:
        display(rep_table)
    
    ls_orig_rep = [rep_table[rep_table.original == value].most_freq.values[0] for value in orig_ls]
    
    return ls_orig_rep

In [171]:
ls = ['a1','a2','a3','b', 'c1', 'a', 'd', 'b1', 'c', 'd', 'c', 'a1', 'b','a','a1','a2','a','b1',
     'a1','a2','a3','b', 'c2', 'a', 'd', 'b2', 'c', 'd', 'c', 'a2', 'b','a','a1','a2','a','b',
     'a1','a2','a3','b', 'c3', 'a', 'd', 'b1', 'c', 'd', 'c', 'a1', 'b','a','a1','a2','a','b2']

In [172]:
ans = replace_frequent_with_table(ls,similarity = 0.5, show = True)

Replacing all the values with a by a1 . Similarity:  0.6666666666666666
Replacing all the values with a by a2 . Similarity:  0.6666666666666666
Replacing all the values with a by a3 . Similarity:  0.6666666666666666
Replacing all the values with b1 by b . Similarity:  0.6666666666666666
Replacing all the values with b2 by b . Similarity:  0.6666666666666666
Replacing all the values with c by c1 . Similarity:  0.6666666666666666
Replacing all the values with c2 by c . Similarity:  0.6666666666666666
Replacing all the values with c3 by c . Similarity:  0.6666666666666666


Unnamed: 0,original,most_freq
0,a1,a
1,a2,a
2,a3,a
3,b,b
4,c1,c
5,a,a
6,d,d
7,b1,b
8,c,c
9,c2,c


Lets see how fast it is:

In [162]:
%%time
crag_sim = replace_frequent_with_table(orig_ls = crag_punct,
                                       show = False)

CPU times: user 45.2 s, sys: 0 ns, total: 45.2 s
Wall time: 45.3 s


#### ...we see that we saved a lot of time with this table method

In [105]:
myset5 = set(crag_sim)
len(myset5)

1595

In [106]:
pd.Series(crag_sim).value_counts().head(10)

plan  skjutvallen    23
sektor b             16
i                    16
vallberget           13
vo                   13
nacka                11
nya                  11
stenen               11
block                10
fruberget            10
dtype: int64

A big improvement respect the original:

In [107]:
pd.Series(crags).value_counts().head(10)

Sjöända                1
Hammardammen           1
Born                   1
Getberget              1
IP-blocket/Borlänge    1
Kungsblocket           1
Lillgalten             1
Traversblocken         1
Motörheadblocket       1
Bladmyran/Gävle        1
dtype: int64

We now put everything inside the text_processor class

In [108]:
crag_sim = txt.Replace_similar_array(arr = sorted(myset5))
myset6 = set(crag_sim)
len(myset6)

1595

## · Language check

In [109]:
from polyglot.detect import Detector
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentiment_analysis_spanish import sentiment_analysis

In [110]:
mixed_text = "La via es muy buena"
det = Detector(mixed_text, quiet=True)
print(det.language)

name: Spanish     code: es       confidence:  95.0 read bytes:  1280


In [111]:
def Language_check(mixed_text, lang_check = 'en'):
    '''
    This function checks if the language of the text is the expected

    inputs:
    - mixef_text => text to analyze
    - lang_check => the language we want to validate

    output:
    boolean, true or false
    '''



    det = Detector(mixed_text, quiet=True)
    lang = det.languages[0].code
    conf = det.languages[0].confidence

    if (lang == lang_check) and (conf > 0.5):
        return True

    return False

In [112]:
Language_check("That was a good route",'en')

True

Now we use the class text_processor

In [113]:
txt.Language_check("La via es buenisima, y lo demas tambien",'es')

True

## · Sentiment

This function will messure the sentiment of a comment

In [114]:
analyzer_en = SentimentIntensityAnalyzer()
analyzer_es = sentiment_analysis.SentimentAnalysisSpanish()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [115]:
analyzer_en.polarity_scores("Best climb of my life")

{'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'compound': 0.6369}

In [116]:
analyzer_es.sentiment("Una escalada muy mala")

0.0004283214610584631

In [117]:
# English
def English_sentiment(sentence, analyzer_en, show = False):
    '''
    This function analyzes a sentence in english to see if it is possitive or negative

    inputs:
    - sentence => text to analyze
    - show => if its True then it will show the results 

    output:
    returns a float between +-1 whith the result 
    '''   


    vs = analyzer_en.polarity_scores(sentence)
    if (vs['compound'] != 0) and show:
        print( '----------------------------------------------' )
        print(sentence)
        print(vs['compound'])

    return vs['compound']

#Spanish
def Spanish_sentiment(sentence, analyzer_es, show = False):
    '''
    This function analyzes a sentence in spanish to see if it is possitive or negative

    inputs:
    - sentence => text to analyze
    - show => if its True then it will show the results 

    output:
    returns a float between +-1 whith the result 
    '''   

    vs = analyzer_es.sentiment(sentence)*2-1
    if (vs != 0) and show:
        print( '----------------------------------------------' )
        print(sentence)
        print(vs)

    return vs

In [118]:
English_sentiment("Good route", analyzer_en, show = True)

----------------------------------------------
Good route
0.4404


0.4404

In [119]:
Spanish_sentiment("La via una maravilla", analyzer_es, show = True)

----------------------------------------------
La via una maravilla
0.9938294856508776


0.9938294856508776

### Language check + Sentiment

In [120]:
def comment_sentiment(comment, analyzer_en, analyzer_es, show = False):
    '''
    This functions takes a comment, checks the language and if its english it checks the sentiment
    input:
    - comment -> text to analyze
    - show -> if you want to show results
    
    output:
    sentiment is a value between +-1
    ''' 
    
    try:
        # Check if its in english
        is_english = Language_check(comment)

        if is_english:
            # Check the sentiment
            return Sentiment_analyzer(comment, analyzer_en, show)

        # Works for both
        is_spanish = Language_check(comment,'es')
        is_italian = Language_check(comment,'it')

        if is_spanish or is_italian:
            # Check the sentiment
            return Spanish_sentiment(comment, analyzer_es, show)            

        return 0
        
    except:
        return 0

In [121]:
comment_sentiment("Good route, guys", analyzer_en, analyzer_es, show = True)

0

In [122]:
comment_sentiment("La via una maravilla y lo demas", analyzer_en, analyzer_es, show = True)

----------------------------------------------
La via una maravilla y lo demas
0.989722003917165


0.989722003917165

Now we place it into the <b>class</b> in order to be able to use it in the future

In [123]:
txt.Language_check("Good route, guys, it was amazing")

True

In [124]:
txt.English_sentiment("Good route, guys, it was amazing")

0.7717

In [125]:
txt.comment_sentiment("Good route, guys, it was amazing", show = True)

----------------------------------------------
Good route, guys, it was amazing
0.7717


0.7717

In [126]:
txt.comment_sentiment("La via una maravilla y lo demas", show = True)

----------------------------------------------
La via una maravilla y lo demas
0.989722003917165


0.989722003917165