# Libraries

In [16]:
import math
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from unidecode import unidecode

# Data Loading

In [17]:
# Name of the CSV file that holds the activity records.
csv_filename = "ARES2_EJECUCION_ACTIVIDADES.csv"
# Load the whole file into a DataFrame with pandas' default parsing options.
data = pd.read_csv(csv_filename)
# Last expression of the cell: shows the loaded frame as the cell output.
data

Unnamed: 0,DESCRIPCION,CODIGO_ETAPA,DURACION_HORAS
0,Vacaciones,VAC,8.0
1,Vacaciones,VAC,8.0
2,Documentación proceso multipago bancario ajust...,COCOD,3.0
3,Documentación proceso acumulación pisos y paso...,COCOD,3.0
4,Documentación proceso acumulación pisos y cont...,COCOD,2.0
...,...,...,...
52851,"Reunión daily, Documentando el código desarrol...",COCOD,3.0
52852,Generando los test unitarios a los servicios y...,COCOD,2.0
52853,"Homologando las colecciones de postman, optimi...",COCOD,3.0
52854,Validación de los datos de parametrización e i...,TRCON,2.0


# Data Cleaning

In [18]:
def get_all_characters() -> list:
    """
    Build the alphabet of characters that the model works with.

    @returns a freshly created list with every character of
        ``string.printable``, in the same order as that constant.
    """
    return [character for character in string.printable]


# Print the model alphabet once as a quick visual check.
print(get_all_characters())


def get_column_labels():
    """
    @return dictionary that maps each logical column label to the column
        name used in the DataFrame (currently both are the same string).
    """
    column_names = ("CODIGO_ETAPA", "DESCRIPCION", "DURACION_HORAS")
    return {name: name for name in column_names}
def get_all_stages(df: pd.DataFrame):
    """
    @input df: DataFrame that contains the stage-code column.
    @return the distinct values of the stage-code column, as produced by
        ``Series.unique()``.
    """
    stage_column = get_column_labels()["CODIGO_ETAPA"]
    return df[stage_column].unique()
# NOTE(review): this call is not the last expression of its cell and its
# return value is not stored, so the result is computed and then discarded.
get_all_stages(data)


def get_relevant_columns_names() -> list:
    """
    @return list with the names of the columns kept for the model, in the
        order: stage code, description, duration.
    """
    labels = get_column_labels()
    wanted_keys = ("CODIGO_ETAPA", "DESCRIPCION", "DURACION_HORAS")
    return [labels[key] for key in wanted_keys]


def get_clean_text(text: str) -> str:
    """
    Transliterate a text to ASCII and drop every character the model ignores.

    @input text: raw text to clean.
    @return string with only relevant characters to the model: the
        ``unidecode`` transliteration of ``text`` restricted to the characters
        returned by ``get_all_characters()``. The relative order of the kept
        characters is unchanged.
    """
    # A set gives constant-time membership tests; the alphabet is built once
    # per call instead of being scanned as a list for every character.
    allowed_characters = frozenset(get_all_characters())
    transliterated = unidecode(text)
    # Single pass over the text instead of one full str.replace per character.
    return "".join(char for char in transliterated if char in allowed_characters)


def get_relevant_columns(df: pd.DataFrame):
    """
    Select the columns used by the model.

    @input df: DataFrame that contains at least the relevant columns.
    @return new DataFrame with only the columns listed by
        ``get_relevant_columns_names()``, in that order. The result is an
        independent copy of the selected data.
    """
    relevant_names = get_relevant_columns_names()
    # Return an explicit copy so callers can assign to columns of the result
    # without pandas flagging the write as a chained assignment on a slice.
    return df[relevant_names].copy()


['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' ', '\t', '\n', '\r', '\x0b', '\x0c']


In [19]:
# Keep only the model columns. `data` is rebound here, so the frame as loaded
# from the CSV is no longer reachable under this name.
data = get_relevant_columns(data)
# Replace every description by its cleaned version (see get_clean_text).
data[get_column_labels()['DESCRIPCION']] = data[get_column_labels()['DESCRIPCION']].apply(get_clean_text) 
# Last expression of the cell: shows the cleaned frame.
data

Unnamed: 0,CODIGO_ETAPA,DESCRIPCION,DURACION_HORAS
0,VAC,Vacaciones,8.0
1,VAC,Vacaciones,8.0
2,COCOD,Documentacion proceso multipago bancario ajust...,3.0
3,COCOD,Documentacion proceso acumulacion pisos y paso...,3.0
4,COCOD,Documentacion proceso acumulacion pisos y cont...,2.0
...,...,...,...
52851,COCOD,"Reunion daily, Documentando el codigo desarrol...",3.0
52852,COCOD,Generando los test unitarios a los servicios y...,2.0
52853,COCOD,"Homologando las colecciones de postman, optimi...",3.0
52854,TRCON,Validacion de los datos de parametrizacion e i...,2.0


# Data Mining

## Most used stage

In [20]:
# Count how many rows each stage code has.
counts = data[get_column_labels()["CODIGO_ETAPA"]].value_counts()
# Show the first ten entries of the counts.
counts.head(10)

COCOD      20033
ERENT       4972
APEJE       2884
APSEG       1909
PRSIS       1687
ASEJE       1395
COAJU       1370
EMPALME     1201
COREV       1177
ASSEG       1141
Name: CODIGO_ETAPA, dtype: int64

In [21]:
def get_chosen_stages():
    """
    @return list with the codes of the stages the model works with. The order
        is significant: other functions use the position of a code in this
        list as the index of that stage.
    """
    stage_codes = "COCOD ERENT APEJE APSEG PRSIS ASEJE COAJU EMPALME COREV ASSEG"
    return stage_codes.split()

# Keep only the rows whose stage code is one of the chosen stages.
row_condition = data["CODIGO_ETAPA"].isin(get_chosen_stages())
data = data[row_condition]
# Size of the held-out set: 20% of the remaining rows, as an absolute count.
num_val_samples = int(data.shape[0] * 0.2)
# Shuffled split; the fixed random_state keeps it reproducible across runs.
train_data, test_data = train_test_split(data, test_size=num_val_samples, shuffle=True, random_state=6)
# NOTE(review): presumably train_test_split already returns DataFrames for a
# DataFrame input, which would make these wrappers redundant — confirm.
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# Last expression of the cell: shows the held-out rows.
test_data

Unnamed: 0,CODIGO_ETAPA,DESCRIPCION,DURACION_HORAS
47751,COCOD,Se siguio trabajando en los ajustes del metodo...,2.00
49896,COCOD,"Agregar campo a tabla y reporte, ajuste respon...",2.25
43311,APSEG,Daily ASW,0.25
32612,COCOD,REvision caso de uso Emit retry,2.83
8749,ASEJE,Realizar estimacion de CO_1024_ANT_Estimacion ...,1.50
...,...,...,...
3841,APEJE,Ajuste rechazos de los meses de junio y julio ...,3.00
37052,APEJE,"Sesion de avance diario, gestion del proyecto.",3.00
10178,COCOD,Ajuste modal vehiculos #207267,1.00
18129,ERENT,Daily Tecnico Clausula Penal con el grupo de S...,0.50


# Model Creation for Stage Prediction

## String operations

In [22]:

def get_number_of_characters(s: str)->int:
    """
    @input s: text to measure.
    @return number of characters of ``s`` that are not the space character
        " " (other whitespace such as tabs or newlines is still counted).
    """
    return len(s) - s.count(" ")

def get_character_index(s: str)->int:
    """
    @input s: character to look up.
    @return position of ``s`` in the list returned by ``get_all_characters()``.
    Raises ValueError when ``s`` is not an element of that list.
    """
    return get_all_characters().index(s)

## Bigram

In [23]:
# create matrix with biagram
def create_biagram() -> np.array:
    """
    @return a matrix filled with zeroes of size n_char x n_char, where n_char
        is the number of characters returned by ``get_all_characters()``.
    """
    # One row and one column per character of the model alphabet.
    side = len(get_all_characters())
    return np.zeros(shape=(side, side))


def get_stage_descriptions(df: pd.DataFrame, stage: str) -> pd.Series:
    """
    @input
        df: Dataframe with all the information.
        stage: stage to extract
    @return Series with the descriptions of the rows whose stage code equals
        ``stage``; the index labels of the selected rows are kept.
    """
    labels = get_column_labels()
    is_stage = df[labels["CODIGO_ETAPA"]] == stage
    # Select rows and column in one .loc call instead of chained indexing.
    return df.loc[is_stage, labels["DESCRIPCION"]]


def get_character_count(descriptions: pd.DataFrame) -> dict:
    """
    Count how often each character of the model alphabet occurs in a stage.

    @input descriptions: iterable of description strings of a stage (for
        example the Series returned by ``get_stage_descriptions``).
    @return dictionary that maps every character of ``get_all_characters()``
        to its total number of occurrences over all the descriptions. Every
        count is 0 when ``descriptions`` is empty.
    """
    all_characters = get_all_characters()
    # Start every character at zero so the totals can be accumulated.
    character_count: dict = {character: 0 for character in all_characters}
    for description in descriptions:
        for character in all_characters:
            # Accumulate across descriptions; a plain assignment here would
            # keep only the counts of the last description.
            character_count[character] += description.count(character)
    return character_count


def get_filled_biagram(descriptions: pd.DataFrame) -> np.array:
    """
    Count consecutive character pairs over all the descriptions of a stage.

    @input list of all the descriptions for a stage
    @return matrix with character count, and the count of the next character:
        entry [i][j] is the number of times the character with index i is
        immediately followed by the character with index j, where indices are
        positions in ``get_all_characters()``.
    Raises ValueError when a description of two or more characters contains a
    character that is not in ``get_all_characters()``.
    """
    # create the matrix of size n_characters
    biagram: np.array = create_biagram()
    # Build the character -> index table once. Looking each character up with
    # get_character_index rebuilds the alphabet list and scans it linearly for
    # every single character of every description.
    index_of = {
        character: index for index, character in enumerate(get_all_characters())
    }
    for description in descriptions:
        # Fewer than two characters means there is no pair to count.
        if len(description) < 2:
            continue
        try:
            indices = [index_of[character] for character in description]
        except KeyError as error:
            # Keep the exception type that list.index raises for unknown items.
            raise ValueError(f"{error.args[0]!r} is not in list") from None
        # Walk over (current, next) index pairs.
        for current_index, next_index in zip(indices, indices[1:]):
            biagram[current_index, next_index] += 1
    return biagram


## Probability Matrix

In [24]:
def normalize_probabilities(character_count: int, total_characters: int)->float:
    """
    @input
        character_count: number of occurrences to normalize
        total_characters: total used as the denominator
    @return the quotient ``character_count / total_characters``
    """
    fraction = character_count / total_characters
    return fraction

def get_marginal_probabilities(character_count: dict, text:str)->np.array:
    """
    @input 
        character_count: Dictionary with occurences of each character
        text: the text to analyse
    @return array with one entry per key of ``character_count``, in the
        iteration order of the dictionary: the count divided by the number of
        non-space characters of ``text``, plus a constant 0.00001.
    NOTE: with integer counts, a ``text`` without non-space characters makes
    the division raise ZeroDivisionError.
    """
    # One slot per counted character. Sizing the array by
    # len(string.ascii_lowercase) (26) raised IndexError as soon as the
    # dictionary had more than 26 keys.
    character_probabilities = np.zeros(len(character_count))
    n_characters = get_number_of_characters(text)
    # enumerate gives the position of each key directly, without searching
    # the key list again for every character.
    for index, count in enumerate(character_count.values()):
        character_probabilities[index] = normalize_probabilities(count, n_characters)+0.00001
    return character_probabilities

def get_cond_probabilities(joint_probability: np.array,marginal_probability: np.array)->np.array:
    """
    Divide each row of the joint matrix by its marginal and rescale the result.

    @input
        joint_probability: matrix of joint probabilities
        marginal_probability: vector with one value per row of the matrix
    @return matrix in which row i of ``joint_probability`` was divided by
        ``marginal_probability[i]`` and the whole result was then divided by
        its total sum. The total of the returned matrix is printed.
    """
    # Transpose so the division broadcasts along rows, then transpose back.
    row_scaled = (joint_probability.transpose() / marginal_probability).transpose()
    # normalized matrix
    rescaled = row_scaled / row_scaled.sum()
    print(f"prob_sum = {rescaled.sum()}")
    return rescaled

def get_joint_probabilities(characters_count: np.array)->np.array:
    """
    Turn a matrix of character-pair counts into joint probabilities.

    @input characters_count: matrix where entry [i][j] is the number of times
        the pair (i, j) was observed.
    @return matrix with the same shape as ``characters_count`` in which every
        count was divided by the total number of pairs. When the total is
        zero the result is all zeros instead of NaN. The sum of the returned
        matrix is printed.
    """
    counts = np.asarray(characters_count, dtype=float)
    n_pairs = counts.sum()
    if n_pairs == 0:
        # No pair was observed: avoid 0/0 and report an all-zero matrix.
        joint_probability_matrix = np.zeros_like(counts)
    else:
        # Vectorized division. The previous nested loop enumerated the rows in
        # both loops, which is only correct for square matrices.
        joint_probability_matrix = counts / n_pairs
    prob_sum = joint_probability_matrix.sum()
    print(f"prob_sum = {prob_sum}")
    return joint_probability_matrix

In [25]:
def graph_matrix_heatmap(matrix: np.array, title="Heatmap"):
    """
    Draw a matrix as a seaborn heatmap and show the figure.

    @input
        matrix: matrix of values to plot
        title: text placed above the plot
    """
    # Tick labels are switched off on both axes and cells are not annotated.
    sns.heatmap(matrix, cmap="Blues", annot=False, xticklabels=False, yticklabels=False)
    plt.title(title)
    plt.xticks(rotation=0)
    plt.show()

## Model Creation

In [30]:
def language_identifier(text:str, matrix_probabilities: np.array)-> str:
    """
    Predict the stage of a description from its consecutive character pairs.

    @input
        text: description to classify; it is cleaned with ``get_clean_text``
            before being scored.
        matrix_probabilities: 3-D array indexed as [stage][current][next];
            slice k holds the pair probabilities of the k-th stage of
            ``get_chosen_stages()``.
    @return code of the stage whose matrix gives the highest sum of
        log-probabilities over the consecutive character pairs of the cleaned
        text. A text with fewer than two characters has a score of 0 for
        every stage, so the first chosen stage is returned.
    """
    chosen_stages = get_chosen_stages()
    # Score exactly one matrix per chosen stage instead of a hard-coded 10.
    stage_matrices = np.asarray(matrix_probabilities)[: len(chosen_stages)]

    clean_text = get_clean_text(text)
    # Build the character -> index table once per call rather than scanning
    # the alphabet list for every character.
    index_of = {
        character: index for index, character in enumerate(get_all_characters())
    }
    indices = np.array([index_of[character] for character in clean_text], dtype=int)
    current_indices = indices[:-1]
    next_indices = indices[1:]

    # Shape (n_stages, n_pairs): probability of every pair under every stage.
    pair_probabilities = stage_matrices[:, current_indices, next_indices]
    # The small constant keeps log() finite for pairs never seen in training.
    probs = np.log(pair_probabilities + 0.0000001).sum(axis=1)
    highest_prob_index = np.argmax(probs)
    res = chosen_stages[highest_prob_index]
    return res


# Model implementation for Stage prediction

## COCOD

### Descriptions

In [27]:

# Descriptions of the training rows labelled with the COCOD stage.
cocod_descriptions = get_stage_descriptions(train_data, "COCOD")
# Preview the first ten of them as plain text.
print(cocod_descriptions.head(10))

11430    Se trabaja en el proyecto 998, en el entregabl...
17775              Configurar reporte Multiples Documentos
14137                                                Daily
33249             Construccion en el modulo de comunicados
35043           PBI1129478: Se modifica la etl de tarjetas
24891                     Construccion diagramas de clases
49755                                    Ajuste paginacion
36302    Reunion con el equipo para revisar el servicio...
34850    Task 208762: Construir el archivo de salida po...
22400    211671 TK_SOPORTE_ASA_Synergia\nsoporte servic...
Name: DESCRIPCION, dtype: object


### Random Testing

In [28]:

def get_all_probability_matrices():
    """
    Build the pair-probability matrix of every chosen stage.

    Reads the notebook-level ``train_data`` frame.

    @return array of shape (n_stages, n_char, n_char); slice k is the joint
        probability matrix of the k-th stage of ``get_chosen_stages()``,
        computed from the training descriptions of that stage.
    """
    chosen_stages = get_chosen_stages()
    n_characters = len(get_all_characters())
    # Size the cube from the stage list instead of a hard-coded 10.
    probality_cube: np.array = np.zeros(
        (len(chosen_stages), n_characters, n_characters)
    )
    for index, stage in enumerate(chosen_stages):
        stage_descriptions = get_stage_descriptions(train_data, stage)
        # Pair counts first, then normalized into joint probabilities.
        pair_counts = get_filled_biagram(stage_descriptions)
        probality_cube[index] = get_joint_probabilities(pair_counts)
    return probality_cube
# Build the probability cube from the training data; get_joint_probabilities
# prints one "prob_sum" line for each stage while it runs.
p_cube = get_all_probability_matrices()

prob_sum = 1.000000000000012
prob_sum = 1.0000000000000087
prob_sum = 0.9999999999999907
prob_sum = 0.999999999999993
prob_sum = 0.9999999999999928
prob_sum = 1.0000000000000058
prob_sum = 0.9999999999999918
prob_sum = 1.0000000000000002
prob_sum = 0.9999999999999878
prob_sum = 0.9999999999999863


# Testing

In [32]:
def get_stage_index(stage: str)->int:
    """
    @input stage: stage code to look up.
    @return position of ``stage`` in the list returned by
        ``get_chosen_stages()``.
    Raises ValueError when ``stage`` is not one of the chosen stages.
    """
    return get_chosen_stages().index(stage)

#It's test day!
# Per-stage counters, indexed like get_chosen_stages(); sized from the stage
# list instead of a hard-coded 10.
fails: np.array = np.zeros(len(get_chosen_stages()))
successes: np.array = np.zeros(len(get_chosen_stages()))
# NOTE(review): this rebuilds the same cube that the earlier cell stored in
# p_cube; kept so the cell does not depend on that variable.
prob_cube = get_all_probability_matrices()

# Iterate over the two needed columns directly; iterrows would build a full
# Series for every row only to read these two values.
for desc, real_stage in zip(test_data["DESCRIPCION"], test_data["CODIGO_ETAPA"]):
    pred = language_identifier(desc, prob_cube)
    idx = get_stage_index(real_stage)
    # Tally the outcome under the index of the true stage.
    if pred == real_stage:
        successes[idx] += 1
    else:
        fails[idx] += 1
fails_sum = fails.sum()
successes_sum = successes.sum()
print(f"Fails = {fails_sum}")
print(f"Success = {successes_sum}")
print(f"Accuraccy= {successes_sum/(successes_sum+fails_sum)}")
    

prob_sum = 1.000000000000012
prob_sum = 1.0000000000000087
prob_sum = 0.9999999999999907
prob_sum = 0.999999999999993
prob_sum = 0.9999999999999928
prob_sum = 1.0000000000000058
prob_sum = 0.9999999999999918
prob_sum = 1.0000000000000002
prob_sum = 0.9999999999999878
prob_sum = 0.9999999999999863
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0: 10, 1: 100
0:

KeyboardInterrupt: 

# Model Creation for Duration Prediction

## Model Implementation for duration prediction