# Extracción de texto en cadenas de caracteres

In [1]:
import pyparsing as pp

In [2]:
#
# Sample input text
#
# 09/04/2004  Virginia        44   Temple             14
# 09/04/2004  LSU             22   Oregon State       21
# 09/09/2004  Troy State      24   Missouri           14
# 01/02/2003  Florida State  103   University of Miami 2
#


#
# Basic grammar to capture the data
#
num = pp.Word(pp.nums)
score = pp.Word(pp.nums)
schoolName = pp.OneOrMore(pp.Word(pp.alphas))

# A date is three runs of digits separated by slashes; every piece,
# including each "/" literal, comes back as its own token.
date = num + "/" + num + "/" + num

# Each game line is: date, first school + score, second school + score.
schoolAndScore = schoolName + score
gameResult = date + schoolAndScore + schoolAndScore

tests = [
    "09/04/2004  Virginia        44   Temple             14",
    "09/04/2004  LSU             22   Oregon State       21",
    "09/09/2004  Troy State      24   Missouri           14",
    "01/02/2003  Florida State  103   University of Miami 2",
]

# Parse every sample line and show the flat list of tokens.
for test in tests:
    stats = gameResult.parseString(test)
    print(stats.asList())

['09', '/', '04', '/', '2004', 'Virginia', '44', 'Temple', '14']
['09', '/', '04', '/', '2004', 'LSU', '22', 'Oregon', 'State', '21']
['09', '/', '09', '/', '2004', 'Troy', 'State', '24', 'Missouri', '14']
['01', '/', '02', '/', '2003', 'Florida', 'State', '103', 'University', 'of', 'Miami', '2']


In [3]:
#
# Merge the pieces of each date into a single string
# by wrapping the date expression in Combine
#
num = pp.Word(pp.nums)
score = pp.Word(pp.nums)
schoolName = pp.OneOrMore(pp.Word(pp.alphas))

# Combine joins the matched digits and slashes into one token.
date = pp.Combine(num + "/" + num + "/" + num)

schoolAndScore = schoolName + score
gameResult = date + schoolAndScore + schoolAndScore

tests = [
    "09/04/2004  Virginia        44   Temple             14",
    "09/04/2004  LSU             22   Oregon State       21",
    "09/09/2004  Troy State      24   Missouri           14",
    "01/02/2003  Florida State  103   University of Miami 2",
]

# Parse every sample line and show the resulting tokens.
for test in tests:
    stats = gameResult.parseString(test)
    print(stats.asList())


['09/04/2004', 'Virginia', '44', 'Temple', '14']
['09/04/2004', 'LSU', '22', 'Oregon', 'State', '21']
['09/09/2004', 'Troy', 'State', '24', 'Missouri', '14']
['01/02/2003', 'Florida', 'State', '103', 'University', 'of', 'Miami', '2']


In [4]:
#
# Merge the words of each school name into a single string
#
num = pp.Word(pp.nums)
score = pp.Word(pp.nums)
date = pp.Combine(num + "/" + num + "/" + num)

# The parse action replaces the individual words with one
# space-separated string.  <- change introduced in this cell
schoolName = pp.OneOrMore(pp.Word(pp.alphas)).setParseAction(
    lambda tokens: " ".join(tokens)
)

schoolAndScore = schoolName + score
gameResult = date + schoolAndScore + schoolAndScore

tests = [
    "09/04/2004  Virginia        44   Temple             14",
    "09/04/2004  LSU             22   Oregon State       21",
    "09/09/2004  Troy State      24   Missouri           14",
    "01/02/2003  Florida State  103   University of Miami 2",
]

# Parse every sample line and show the resulting tokens.
for test in tests:
    stats = gameResult.parseString(test)
    print(stats.asList())



['09/04/2004', 'Virginia', '44', 'Temple', '14']
['09/04/2004', 'LSU', '22', 'Oregon State', '21']
['09/09/2004', 'Troy State', '24', 'Missouri', '14']
['01/02/2003', 'Florida State', '103', 'University of Miami', '2']


In [5]:
#
# Date validation
#
import time

num = pp.Word(pp.nums)
date = pp.Combine(num + "/" + num + "/" + num)
schoolName = pp.OneOrMore(pp.Word(pp.alphas))
schoolName.setParseAction(lambda tokens: " ".join(tokens))
score = pp.Word(pp.nums)
schoolAndScore = schoolName + score
gameResult = date + schoolAndScore + schoolAndScore


def validateDateString(tokens):
    """Parse action that rejects strings that are not valid month/day/year dates.

    Args:
        tokens: tokens matched by the expression this action is attached to;
            ``tokens[0]`` is checked against the ``%m/%d/%Y`` format.

    Raises:
        pp.ParseException: if ``time.strptime`` cannot read ``tokens[0]``.
    """
    try:
        time.strptime(tokens[0], "%m/%d/%Y")
    except ValueError:
        raise pp.ParseException("Invalid date string (%s)" % tokens[0])


date.setParseAction(validateDateString)

# The first line is altered (month 19) to trigger the error
tests = """\
19/04/2004  Virginia        44   Temple             14
09/04/2004  LSU             22   Oregon State       21
09/09/2004  Troy State      24   Missouri           14
01/02/2003  Florida State  103   University of Miami 2""".splitlines()

for test in tests:
    try:
        stats = gameResult.parseString(test)
    except pp.ParseException:
        # Only parse failures are reported here; any other exception
        # (a bug, KeyboardInterrupt, ...) propagates instead of being
        # mislabelled as a date error.
        print("Date error in: " + test)
    else:
        print(stats.asList())

Date error in: 19/04/2004  Virginia        44   Temple             14
['09/04/2004', 'LSU', '22', 'Oregon State', '21']
['09/09/2004', 'Troy State', '24', 'Missouri', '14']
['01/02/2003', 'Florida State', '103', 'University of Miami', '2']


In [6]:
#
# Convert each score to an integer and group every school with its score
#
import time

num = pp.Word(pp.nums)
date = pp.Combine(num + "/" + num + "/" + num)
schoolName = pp.OneOrMore(pp.Word(pp.alphas)).setParseAction(
    lambda tokens: " ".join(tokens)
)

# The parse action turns the matched digits into an int.  <- change
score = pp.Word(pp.nums)
score.setParseAction(lambda tokens: int(tokens[0]))

# Group nests each school/score pair in its own sub-list.
schoolAndScore = pp.Group(schoolName + score)
gameResult = date + schoolAndScore + schoolAndScore


def validateDateString(tokens):
    """Parse action: raise ParseException unless tokens[0] is a %m/%d/%Y date."""
    date_text = tokens[0]
    try:
        time.strptime(date_text, "%m/%d/%Y")
    except ValueError:
        raise pp.ParseException("Invalid date string (%s)" % date_text)


date.setParseAction(validateDateString)

# Sample lines for this cell.
tests = [
    "09/04/2004  Virginia        44   Temple             14",
    "09/04/2004  LSU             22   Oregon State       21",
    "09/09/2004  Troy State      24   Missouri           14",
    "01/02/2003  Florida State  103   University of Miami 2",
]

# Parse every sample line and show the nested token lists.
for test in tests:
    stats = gameResult.parseString(test)
    print(stats.asList())


['09/04/2004', ['Virginia', 44], ['Temple', 14]]
['09/04/2004', ['LSU', 22], ['Oregon State', 21]]
['09/09/2004', ['Troy State', 24], ['Missouri', 14]]
['01/02/2003', ['Florida State', 103], ['University of Miami', 2]]


In [7]:
#
# Add an explanatory text to each result
#
for test in tests:
    stats = gameResult.parseString(test)
    # stats[1] and stats[2] are the [school, score] groups of the two teams.
    if stats[1][1] == stats[2][1]:
        result = "tied"
    elif stats[1][1] > stats[2][1]:
        result = "won by " + stats[1][0]
    else:
        result = "won by " + stats[2][0]
    # Each group unpacks to (school, score), matching the {:s}({:d}) pairs.
    print(
        "{:s} {:s}({:d}) {:s}({:d}), {:s}".format(
            stats[0], *stats[1], *stats[2], result
        )
    )

09/04/2004 Virginia(44) Temple(14), won by Virginia
09/04/2004 LSU(22) Oregon State(21), won by LSU
09/09/2004 Troy State(24) Missouri(14), won by Troy State
01/02/2003 Florida State(103) University of Miami(2), won by Florida State


In [8]:
#
# Use results names to improve readability
#
num = pp.Word(pp.nums)
date = pp.Combine(num + "/" + num + "/" + num)
# Attach the validation BEFORE naming the expression: setResultsName()
# returns a copy, so a parse action set on `date` after gameResult is built
# would never reach the copy that gameResult actually uses.
# validateDateString is the parse action defined in an earlier cell.
date.setParseAction(validateDateString)

schoolName = pp.OneOrMore(pp.Word(pp.alphas))
schoolName.setParseAction(lambda tokens: " ".join(tokens))
score = pp.Word(pp.nums).setParseAction(
    lambda tokens: int(tokens[0])
)
schoolAndScore = pp.Group(
    schoolName.setResultsName("school") + score.setResultsName("score")
)
gameResult = (
    date.setResultsName("date")
    + schoolAndScore.setResultsName("team1")
    + schoolAndScore.setResultsName("team2")
)


# `tests` is the list of sample lines defined in an earlier cell.
for test in tests:
    stats = gameResult.parseString(test)
    if stats.team1.score != stats.team2.score:
        if stats.team1.score > stats.team2.score:
            result = "won by " + stats.team1.school
        else:
            result = "won by " + stats.team2.school
    else:
        result = "tied"
    print(
        "{:s} {:s}({:d}) {:s}({:d}), {:s}".format(
            stats.date,
            stats.team1.school,
            stats.team1.score,
            stats.team2.school,
            stats.team2.score,
            result,
        )
    )

09/04/2004 Virginia(44) Temple(14), won by Virginia
09/04/2004 LSU(22) Oregon State(21), won by LSU
09/09/2004 Troy State(24) Missouri(14), won by Troy State
01/02/2003 Florida State(103) University of Miami(2), won by Florida State


In [9]:
#
# dump() can be used to print the parsed information
# for review. `stats` is the value left over from the last
# iteration of the previous loop.
#
print(stats.dump())

['01/02/2003', ['Florida State', 103], ['University of Miami', 2]]
- date: '01/02/2003'
- team1: ['Florida State', 103]
  - school: 'Florida State'
  - score: 103
- team2: ['University of Miami', 2]
  - school: 'University of Miami'
  - score: 2
[0]:
  01/02/2003
[1]:
  ['Florida State', 103]
  - school: 'Florida State'
  - score: 103
[2]:
  ['University of Miami', 2]
  - school: 'University of Miami'
  - score: 2
