In [1]:
#IMPORT STUFF RELATED TO DATA MANAGEMENT
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
import seaborn as sns
import re

# SQL LIMIT clause fragment (row cap) kept as a ready-to-append string.
limit = f"limit {100000}"

In [2]:
database_path = '../NEST.sqlite'

# Open the NEST database (the variant with the pandemic filter applied).
conn = sql.connect(database_path)
# conn = sql.connect('NEST_Pandemia.sqlite')  # variant without the pandemic filter

# Pull one row per (plate, year), where the year is extracted from the policy
# validity start date (FECINIVALID). Only rows with tipoveh = 'P' and a start
# date after 2010-01-01 are considered.
# The analysis is done per plate; to study customer persistence instead,
# replace "numplaca" with "codcli" in the query.
query = "select NUMPLACA, strftime('%Y', FECINIVALID ) ano from nest where tipoveh= 'P' and FECINIVALID > '2010-01-01' group by numplaca, ano "
basico = pd.read_sql_query(sql=query, con=conn)

In [3]:
# Plate clean-up with a regular expression.
#
# Pattern breakdown:
#   ^            anchor at the start of the string
#   (?: ... )    non-capturing group wrapping the whole plate
#   [A-I0-9]{1}  exactly one character: uppercase A..I or a digit
#   [A-Z0-9]{1}  exactly one character: uppercase A..Z or a digit
#   [0-9]{4}     exactly four digits
#   $            anchor at the end of the string
#
# NOTE(review): the previous comment described the first character class as
# A-H, but the pattern actually accepts A-I. Confirm which range is intended.
regex = r"^(?:[A-I0-9]{1}[A-Z0-9]{1}[0-9]{4})$"

# Drop rows with any missing value first so that .str.match never sees NaN,
# then keep only the rows whose plate matches the pattern.
basico = basico.dropna()
placa_valida = basico['NUMPLACA'].str.match(regex)
basico = basico[placa_valida]

In [4]:
# Show the plate/year pairs that remain after the regex filter (rendered as the cell output).
basico

Unnamed: 0,NUMPLACA,ano
5519,000000,2010
5520,000000,2011
5521,000000,2013
5522,000000,2016
5523,000000,2017
...,...,...
792940,GG0606,2022
792941,GG6977,2019
792942,GG6977,2022
792953,HE3634,2023


In [5]:
# Persist the cleaned plate/year pairs to CSV, leaving the DataFrame index out of the file.
basico.to_csv(path_or_buf='basico.csv', index=False)

In [6]:
# Build the binary helper columns used to detect consecutive-year streaks.

# Order by plate and then by year so each row can be compared with the one
# directly above it. 'ano' is cast to int afterwards so year differences can
# be computed.
basico = basico.sort_values(['NUMPLACA', 'ano'])
basico['ano'] = basico['ano'].astype(int)

# 1 when the row has the same plate as the previous row, else 0.
misma_placa = basico['NUMPLACA'].eq(basico['NUMPLACA'].shift())
basico['DETECTOR CORTE PLACA'] = misma_placa.astype(int)

# 1 when the year is exactly one more than the previous row's year, else 0.
ano_consecutivo = basico['ano'].diff().eq(1)
basico['DETECTOR CORTE AÑO'] = ano_consecutivo.astype(int)

# 1 only when both detectors fire: same plate AND consecutive year.
basico['COMBINADOR DE CORTES'] = (misma_placa & ano_consecutivo).astype(int)

# Positional 0..n-1 index, then seed the accumulator column on the first row.
basico = basico.reset_index(drop=True)
basico.at[0, 'ACUMULADOR'] = 0

In [7]:
# Compute ACUMULADOR: the number of consecutive "continuation" rows that end
# at the current row.
#
# A row is a continuation when 'COMBINADOR DE CORTES' == 1. ACUMULADOR is 0 on
# every non-continuation row and grows by 1 on each continuation row that
# follows, i.e. it is a run-length counter that resets at every 0.
#
# This is a vectorised equivalent of walking the frame row by row and doing
# `previous + 1 if flag == 1 else 0`; the Python-level loop with per-element
# Series lookups is very slow on large frames.
es_continuacion = basico['COMBINADOR DE CORTES'].eq(1)
if len(es_continuacion) > 0:
    # The first row has no predecessor, so it always starts at 0.
    es_continuacion.iloc[0] = False

# Every non-continuation row opens a new streak; the running count of such
# rows therefore labels the streak each row belongs to.
id_racha = (~es_continuacion).cumsum()

# Within a streak the opening row contributes 0 and each continuation row
# contributes 1, so the cumulative sum is the position inside the streak.
basico['ACUMULADOR'] = (
    es_continuacion.astype(int).groupby(id_racha).cumsum().astype(int)
)

In [8]:
# Derive the streak position and the streak's starting year.

# PRESENTACION is the 1-based position of the row inside its streak
# (ACUMULADOR is the 0-based one).
basico['PRESENTACION'] = basico['ACUMULADOR'] + 1

# AÑO INICIAL is the year in which the current streak started. Because
# ACUMULADOR is 0 whenever PRESENTACION is 1, `ano - ACUMULADOR` already
# yields `ano` for the first row of a streak, so no per-row branching (and no
# slow row-wise apply) is needed.
basico['AÑO INICIAL'] = basico['ano'] - basico['ACUMULADOR']

In [9]:
# Build the triangle: one row per streak starting year ('AÑO INICIAL'), one
# column per ACUMULADOR value, each cell holding the count of NUMPLACA rows.
# Going through to_records() flattens the pivot into a plain DataFrame in
# which 'AÑO INICIAL' is a regular column; the code that follows addresses the
# ACUMULADOR columns by their string labels ('0', '1', ...).
triangulo = pd.DataFrame(
    basico.pivot_table(
        index="AÑO INICIAL",
        columns="ACUMULADOR",
        values="NUMPLACA",
        aggfunc="count",
    ).to_records()
)

In [10]:
# Compute the step-to-step ratios of the triangle.

# Every column except the first one ('AÑO INICIAL') is an ACUMULADOR column.
columnas = triangulo.columns.tolist()[1:]
P_triangulo = triangulo['AÑO INICIAL'].to_frame()

# One ratio per pair of adjacent ACUMULADOR columns: count at step k+1 divided
# by count at step k, stored under the label of step k. The number of
# iterations follows the number of columns present (one fewer than the
# ACUMULADOR columns).
for paso in range(len(columnas) - 1):
    origen = str(paso)
    destino = str(paso + 1)
    P_triangulo[origen] = triangulo[destino].div(triangulo[origen])
P_triangulo

Unnamed: 0,AÑO INICIAL,0,1,2,3,4,5,6,7,8,9
0,2010,0.610293,0.627921,0.772141,0.789928,0.79292,0.809256,0.822157,0.835516,0.846556,0.133051
1,2011,0.328787,0.752195,0.781441,0.777446,0.816042,0.806357,0.809489,0.838593,0.162366,
2,2012,0.791896,0.74484,0.739637,0.785894,0.820112,0.83488,0.833821,0.112281,,
3,2013,0.761715,0.673502,0.762985,0.79734,0.819921,0.81927,0.125639,,,
4,2014,0.781818,0.740766,0.787319,0.829554,0.83459,0.117448,,,,
5,2015,0.85764,0.776234,0.833736,0.847267,0.074897,,,,,
6,2016,0.835082,0.78071,0.829968,0.128251,,,,,,
7,2017,0.851316,0.772508,0.067629,,,,,,,
8,2018,0.786176,0.094108,,,,,,,,
9,2019,0.143676,,,,,,,,,
