# Análisis de Datos de la I15

## Data of the I15 NB

In [None]:
import pandas as pd
import numpy as np
# Show floats with 3 decimals. The bare key "precision" is ambiguous on current
# pandas (it also matches "styler.format.precision") and raises OptionError,
# so the fully-qualified display option is used.
pd.set_option("display.precision", 3)

In [None]:
# Let pandas print up to 100 rows before truncating a frame.
pd.set_option("display.max_rows", 100)

### Carga de la data.
La base de datos SQL posee datos del 01-01-2018 al 31-12-2019 de todos los sensores (a excepción de algunos que parecen no estar funcionando) que aparecen en la plataforma Bugatti FAST. De esta base de datos se exportó un dataset con los datos de los sensores pertenecientes a la I15_NB (I15 dirección norte). El dataset "lean" es el mismo dataset exportado eliminando las columnas de datos que no utilizo (véanse los drops que se encuentran comentados). Existen dos valores denominados "Invalid" -> [0, 1, 2, 3] y "Failure" -> [0 - 448] que aún no sé cómo utilizar porque no sé qué significan sus valores.

### Data loading
The SQL database has data from 01-01-2018 to 31-12-2019 of all the sensors that appear in the Bugatti FAST dashboard (except for some that seem not to be working). From this database a dataset was exported with data from sensors belonging to the I15_NB (I15 northbound). The "lean" dataset is the same dataset exported, but eliminating the columns of data that I do not use (see commented "drops"). There are two values named "Invalid" -> [0, 1, 2, 3] and "Failure" -> [0 - 448] that I still don't know how to use because I don't know what their values mean.

In [None]:
# Path of the "lean" northbound export (the full export minus the unused columns).
data_file_name = "datasets/la_vegas/i15_bugatti/bugatti_nb_data_lean.csv"
# Alternative input: the full, untrimmed export.
# data_file_name = "datasets/la_vegas/i15_bugatti/bugatti_nb_data.csv"

data = pd.read_csv(data_file_name)
# One-off recipe that derives the lean CSV from the full export (kept for reference):
# data = data.drop(columns=['Path', 'RoadIndex', 'RoadwayID', 'SegmentID', 'DeviceID',
#                           'Volume1', 'Volume2', 'Volume3', 'Volume4', 'Volume5', 'Volume6',
#                           'RoadType', 'Location', 'Polling_Period', 'DayOfWeek',
#                           'DateValue', 'HourIdx', 'Holiday'])
# data.to_csv('datasets/la_vegas/i15_bugatti/bugatti_nb_data_lean.csv', index=False)

# Preview the first 10 rows of the loaded frame.
data.head(10)

In [None]:
# Summary statistics of the freshly loaded data.
data.describe()

In [None]:
#print(data['Path'].unique())

### Invalid Evaluation

Invalid = 1

In [None]:
# Isolate the rows flagged Invalid == 1, print how many there are and preview them.
data_bad = data.loc[data['Invalid'].eq(1)]
print(data_bad['Invalid'].count())
data_bad.head(10)

Invalid = 2

In [None]:
# Isolate the rows flagged Invalid == 2, print how many there are and preview them.
data_bad = data.loc[data['Invalid'].eq(2)]
print(data_bad['Invalid'].count())
data_bad.head(10)

Invalid = 3

In [None]:
# Isolate the rows flagged Invalid == 3, print how many there are and preview them.
data_bad = data.loc[data['Invalid'].eq(3)]
print(data_bad['Invalid'].count())
data_bad.head(10)

In [None]:
# Count and share of rows for each value of the 'Invalid' flag.
invalid_flags = data['Invalid']
n_rows = len(data)
invalid_breakdown = [
    ("Number of Valid Values", invalid_flags == 0),
    ("Number of Invalid Values", invalid_flags != 0),
    ("Number of Invalid = 1", invalid_flags == 1),
    ("Number of Invalid = 2", invalid_flags == 2),
    ("Number of Invalid = 3", invalid_flags == 3),
]
for label, mask in invalid_breakdown:
    n_hit = len(invalid_flags[mask])
    print(f"{label}: {n_hit} ({(n_hit/n_rows)*100}%)")

# One histogram bin per distinct flag value.
n_distinct_flags = len(invalid_flags.unique())
print(f"Number Unique of Invalid Values: {n_distinct_flags}")
hist = invalid_flags.hist(bins=n_distinct_flags)

Zoom on the Invalid values

In [None]:
# Histogram of the non-zero flags only, so the dominant 0 bucket does not hide them.
hist = data.loc[data['Invalid'] != 0, 'Invalid'].hist(bins=len(data['Invalid'].unique()) - 1)

In [None]:
#data['Invalid'][(data['Invalid'] == 2) & (data['Volume'] == 0) & (data['Occupancy'] == 0)] = 4
# Special case: rows flagged Invalid == 2 that report zero volume and zero occupancy.
special_mask = data['Invalid'].eq(2) & data['Volume'].eq(0) & data['Occupancy'].eq(0)
# Snapshot the matching flags before they are overwritten so they can still be counted.
data_invalid2_special = data.loc[special_mask, 'Invalid']
# Re-label those rows as valid (Invalid = 0).
data.loc[special_mask, 'Invalid'] = 0
special_share = (len(data_invalid2_special)/len(data))*100
print(f"Number of Invalid = 2 (Special case): {len(data_invalid2_special)} ({special_share}%)")

In [None]:
# Non-zero flag histogram again, now that the special Invalid == 2 rows were re-labelled.
hist = data.loc[data['Invalid'] != 0, 'Invalid'].hist(bins=len(data['Invalid'].unique()) - 1)

In [None]:
# Count and share of rows with Failure == 0 versus any other Failure code.
failure_codes = data['Failure']
n_samples = len(data)
n_ok = len(failure_codes[failure_codes == 0])
n_failed = len(failure_codes[failure_codes != 0])
print(f"Number of Correct Values: {n_ok} ({(n_ok/n_samples)*100}%)")
print(f"Number of Failure Values: {n_failed} ({(n_failed/n_samples)*100}%)")

print(f"Number Unique of Failure Values: {len(failure_codes.unique())}")

Se realiza un mapeo de los ID de los detectores a una serie de números del "0" a "N - 1" (N= cantidad de detectores = 57). Dicha serie está ordenada según la ubicación de los detectores en la autopista, siendo "0" el detector que se encuentra más al Sur y "N-1" el que se encuentra más al Norte.

The detector IDs are mapped to a series of numbers from "0" to "N - 1" (N= number of detectors = 57). This series is ordered according to the location of the detectors on the highway, with "0" being the southernmost detector and "N-1" being the northernmost.

In [None]:
# Northbound detector IDs. Per the accompanying notes the list is ordered by
# position along the highway, southernmost detector first; the position in
# this list is what the mapping code below turns into the numeric detector index.
detectors_nb_list = ['440.1.335',  '439.1.334',
  '439.2.333',
  '439.3.332',
  '438.1.331',
  '438.2.330',
  '438.3.329',
  '359.1.325',
  '358.1.325',
  '358.2.320',
  '358.3.319',
  '357.1.312',
  '357.2.311',
  '357.3.310',
  '356.1.309',
  '356.2.308',
  '355.1.156',
  '355.2.153',
  '355.3.155',
  '354.1.79',
  '354.2.144',
  '354.3.145',
  '32.1.142',
  '34.1.94',
  '39.2.88',
  '48.2.83',
  '49.1.82',
  '49.2.12',
  '49.3.15',
  '58.2.17',
  '59.1.18',
  '59.2.18',
  '70.2.21',
  '71.2.23',
  '72.1.22',
  '72.2.28',
  '89.1.28',
  '89.2.30',
  '97.1.33',
  '97.2.33',
  '97.3.38',
  '99.1.35',
  '110.1.41',
  '112.2.44',
  '113.2.45',
  '122.2.48',
  '124.2.49',
  '137.1.80',
  '138.1.53',
  '138.2.55',
  '146.2.238',
  '148.2.58',
  '149.2.240',
  '160.2.242',
  '396.1.243',
  '396.2.246',
  '396.3.246',
  '397.1.247',
  '397.2.248',
  '398.1.249',
  '398.2.251']
print(len(detectors_nb_list))
data_unique_detect_ID = data['DetectorID'].unique()

# Give consecutive indices (0 .. N-1, in list order) to the detectors that
# actually appear in the data. Listed detectors without any row are skipped
# and counted in count_loss so the remaining indices stay gap-free.
present_ids = set(data_unique_detect_ID)
detector_id_map = {}
count_loss = 0
for position, detector in enumerate(detectors_nb_list):
    if detector in present_ids:
        detector_id_map[detector] = position - count_loss
    else:
        count_loss += 1
print(f'Detectors lost = {count_loss}')

data_detect_ID = data['DetectorID']

# Series.map translates the whole column at once and keeps each row's own
# index label, so the assignment below cannot misalign even if `data` no
# longer has a default RangeIndex.
data_detect_new_ID = data_detect_ID.map(detector_id_map)
# Keep failing loudly (as the dict lookup did) when the data contains a
# detector that is not in detectors_nb_list, instead of leaving NaN indices.
unmapped_ids = data_detect_ID[data_detect_new_ID.isna()].unique()
if len(unmapped_ids) > 0:
    raise KeyError(f'DetectorID values missing from detectors_nb_list: {list(unmapped_ids)}')
data_detect_new_ID = data_detect_new_ID.astype('int64')
print(data_detect_new_ID)
data['DetectID'] = data_detect_new_ID

Se trasforma la columna 'DateTimeStamp' de string a un formato pd.datetime64 para poder ordenar los datos de forma cronológica y para poder analizar los períodos de muestreo.

The column 'DateTimeStamp' is transformed from string to a pd.datetime64 format to be able to sort the data chronologically and to be able to analyze the sampling periods.

In [None]:
# Parse the timestamp strings, then order the rows chronologically and,
# within one timestamp, by detector index.
data['DateTimeStamp'] = pd.to_datetime(data['DateTimeStamp'])
data = data.sort_values(['DateTimeStamp', 'DetectID'], ascending=True)

# Sanity check on the earliest timestamp.
date_time_obj = data['DateTimeStamp'].iloc[0]
print('Date:', date_time_obj.date())
print('Time:', date_time_obj.time())
print('Minute:', date_time_obj.minute)
print('Date-time:', date_time_obj)

# Distinct timestamps and how many there are.
distinct_stamps = data['DateTimeStamp'].unique()
print(distinct_stamps)
print(len(distinct_stamps))

Se realiza un análisis de los tiempos de muestreo.

An analysis of sampling times is performed.

In [None]:
# Distinct timestamps, in the order they appear in the (already sorted) data.
date_unq = pd.Series(data['DateTimeStamp'].unique())
date_rest = []    # clock-minute gap for every pair of consecutive timestamps
date_rare = []    # [later, earlier] pairs whose gap could not be expressed in minutes
date_no_15 = []   # [gap, later, earlier] for every gap other than 15 minutes
# NOTE(review): the gap is derived from the hour/minute clock fields only; the
# date and the seconds are ignored, so same-day gaps longer than one hour are
# under-reported. Left unchanged so the figures quoted in the notes stay
# comparable; confirm whether a true timedelta should be used instead.
for earlier, later in zip(date_unq.iloc[:-1], date_unq.iloc[1:]):
    if later.hour == earlier.hour:
        rest = later.minute - earlier.minute
    elif (later.hour > earlier.hour) or ((later.hour == 0) and (earlier.hour == 23)):
        rest = later.minute + 60 - earlier.minute
    else:
        # Clock went backwards without a 23h -> 0h rollover: treat as a long outage.
        rest = -1
        date_rare.append([later, earlier])
    date_rest.append(rest)
    if rest != 15:
        date_no_15.append([rest, later, earlier])

expected_datetimes = (60/15)*24*365*2
print(f'\nTotal DateTimes = {len(date_unq)} vs Total DateTimes in 2 years with a period of 15 min = {expected_datetimes} --> DateLoss = {100 - (100*(len(date_unq)/expected_datetimes))}')
print(f'Periods other than 15 min = {len(date_no_15)}\n')

# Unusual data
print(f'Very high periods (2h a 2 días) = {len(date_rare)}')
print(date_rare)

# Tally every distinct gap directly from the measured gaps. The previous
# hand-rolled tally set the 15-minute bucket to len(date_unq) - len(date_no_15),
# one too many because there are only len(date_unq) - 1 gaps, and raised
# ValueError when no 15-minute gap existed.
gap_values, gap_counts = np.unique(date_rest, return_counts=True)
num = list(gap_values)
count = gap_counts.astype(float)

fusion = [[gap, occurrences] for gap, occurrences in zip(num, count)]
print('\nPeriods vs number of samples')
print(fusion)

En primera instancia lo que se observa es que existen gaps o irregularidades en la frecuencia de muestreo de datos, pues se tienen 63092 muestras en un plazo de 2 años, en vez de las 70080 que se deberían tener si se mantuviera un período de muestreo constante de 15 min. De hecho, más de la mitad de las muestras (38470) poseen un período de muestreo distinto a 15 minutos. De estas 38470 muestras, 24622 tienen un período de muestreo de 16 min, y 6627 lo tienen de 17 min, por lo que podemos decir que el 90% de las muestras se encuentra alrededor de los 15 min, con una desviación de +-2 min. El otro 10% varía muchísimo, yendo desde 1 minuto hasta casi 2 días en los casos más extremos.

Hay 21 casos con períodos de muestreo muy altos. La mayoría de estos se dan una vez al mes, siendo el patrón que más se repite: muestra ~22:30h y luego la siguiente muestra a las ~05:30h del día siguiente. Existe un solo caso donde no hubo muestras por todo un día y parte del otro ([Timestamp('2018-05-10 07:46:46'), Timestamp('2018-05-08 22:17:57')] --> No hubo muestras el día 2018-05-09)

At first glance we can observe that there are gaps or irregularities in the sampling periods, since there are 63,092 samples over a period of 2 years, instead of the 70,080 that would be expected if a constant sampling period of 15 minutes were maintained. In fact, more than half of the samples (38,470) have a sampling period other than 15 minutes. Of these 38,470 samples, 24,622 have a sampling period of 16 min, and 6,627 have a sampling period of 17 min, so we can say that 90% of the samples are around 15 min, with a deviation of +-2 min. The other 10% varies a lot, ranging from 1 minute to almost 2 days in the most extreme cases.

There are 21 cases with very high sampling periods. The majority of these occur once a month, with the most repeated pattern being: a sample at ~22:30h and then the next sample at ~05:30h the following day. There is only one case where there were no samples for a whole day and part of the next ([Timestamp('2018-05-10 07:46:46'), Timestamp('2018-05-08 22:17:57')] --> No samples on 2018-05-09).

### Agrupamiento de la data
Se agrupa la data por DateTime y por DetectID. Esto se hace para que los valores de las variables para un detector sean el mean() de los valores de todas sus "Lanes".

### Data grouping
The data is grouped by DateTime and DetectID. This is done so that the values of the variables for a detector are the mean() of the values of all its "Lanes".

In [None]:
# Average all lanes of a detector into one row per (timestamp, detector).
# numeric_only=True restricts the mean to numeric columns: the string column
# 'DetectorID' is still present here, and on pandas >= 2.0 averaging it raises
# a TypeError instead of being silently dropped.
data = data.groupby(['DateTimeStamp','DetectID'], as_index=False).mean(numeric_only=True)

In [None]:
# Preview the first 10 rows of the grouped frame.
data.head(10)

In [None]:
# Distinct detector indices present after grouping.
print(data['DetectID'].unique())

In [None]:
# Summary statistics of the grouped frame.
data.describe()

In [None]:
# Summary restricted to the datetime64[ns] and object columns.
data.describe(include=['datetime64[ns]', 'object'])

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()
print(f'Shape of the data = {data.shape}')
#print(data.columns)

In [None]:
# Number of missing values in each column.
print('NA Count:')
na_per_column = data.isna().sum()
print(na_per_column)

In [None]:
# Count and share of grouped rows for each value of the 'Invalid' column.
invalid_flags = data['Invalid']
n_rows = len(data)
invalid_breakdown = [
    ("Number of Valid Values", invalid_flags == 0),
    ("Number of Invalid Values", invalid_flags != 0),
    ("Number of Invalid = 1", invalid_flags == 1),
    ("Number of Invalid = 2", invalid_flags == 2),
    ("Number of Invalid = 3", invalid_flags == 3),
]
for label, mask in invalid_breakdown:
    n_hit = len(invalid_flags[mask])
    print(f"{label}: {n_hit} ({(n_hit/n_rows)*100}%)")

# One histogram bin per distinct value.
n_distinct_flags = len(invalid_flags.unique())
print(f"Number Unique of Invalid Values: {n_distinct_flags}")
hist = invalid_flags.hist(bins=n_distinct_flags)

In [None]:
# Histogram of the non-zero 'Invalid' values of the grouped data.
hist = data.loc[data['Invalid'] != 0, 'Invalid'].hist(bins=len(data['Invalid'].unique()) - 1)

In [None]:
# Keep only the rows whose 'Invalid' value is exactly 0.
data = data.loc[data['Invalid'].eq(0)]
print(data.shape)

In [None]:
# Count and share of the remaining rows with Failure == 0 versus any other value.
failure_codes = data['Failure']
n_samples = len(data)
n_ok = len(failure_codes[failure_codes == 0])
n_failed = len(failure_codes[failure_codes != 0])
print(f"Number of Correct Values: {n_ok} ({(n_ok/n_samples)*100}%)")
print(f"Number of Failure Values: {n_failed} ({(n_failed/n_samples)*100}%)")

print(f"Number Unique of Failure Values: {len(failure_codes.unique())}")

In [None]:
# Keep only the rows whose 'Failure' value is exactly 0.
data = data.loc[data['Failure'].eq(0)]
print(data.shape)

El siguiente es un histograma que representa el número de muestras que se tienen de cada detector. (Recordar que el número total de muestras de DateTime = 63092)

The following is a histogram representing the number of samples available from each detector. (Recall that the total number of DateTime samples = 63092)

In [None]:
# Samples per detector, one histogram bin per distinct detector index.
hist = data['DetectID'].hist(bins=len(pd.unique(data['DetectID'])))

En las siguientes celdas se ven los números representados en el histograma en forma "total" y en forma de porcentaje con respecto al número total de muestras de DateTime.

In the next cells you see the numbers represented in the histogram in "total" form and as a "percentage" of the total number of DateTime samples.

In [None]:
# Number of remaining rows (timestamps) for each detector index.
print('Samples by detector "Totals"')
data.groupby('DetectID')['DateTimeStamp'].count()

In [None]:
print('Samples by detector "Percentage"')
# Share of the distinct timestamps for which each detector still has a row.
# The denominator comes from date_unq (the distinct timestamps collected in
# the sampling-period cell) instead of the hard-coded 63092, so it stays
# correct if the input data changes.
total_datetimes = len(date_unq)
detect_count_per = data.groupby('DetectID')['DateTimeStamp'].count()*100/total_datetimes
print(detect_count_per)

# Minimum coverage, in percent, a detector needs in order to be kept.
theshod = 64
print(f'Theshod = {theshod}')
valid_detect = detect_count_per[detect_count_per > theshod].index
print(f'Valid detectors = {len(valid_detect)}')
print(valid_detect)
# Histogram of the detectors that pass the threshold (bin count still based on all detectors).
hist = data['DetectID'][data['DetectID'].isin(valid_detect)].hist(bins=len(data['DetectID'].unique()))

In [None]:
# Drop every detector that did not reach the coverage threshold, then preview.
keep_rows = data['DetectID'].isin(valid_detect)
data = data.loc[keep_rows]
print(data.shape)
data.head(10)

## Data of the I15 SB

### Carga de la data.
La base de datos SQL posee datos del 01-01-2018 al 31-12-2019 de todos los sensores (a excepción de algunos que parecen no estar funcionando) que aparecen en la plataforma Bugatti FAST. De esta base de datos se exportó un dataset con los datos de los sensores pertenecientes a la I15_SB (I15 dirección sur). El dataset "lean" es el mismo dataset exportado eliminando las columnas de datos que no utilizo (véanse los drops que se encuentran comentados). Existen dos valores denominados "Invalid" -> [0, 1, 2, 3] y "Failure" -> [0 - 448] que aún no sé cómo utilizar porque no sé qué significan sus valores.

### Data loading
The SQL database has data from 01-01-2018 to 31-12-2019 of all the sensors that appear in the Bugatti FAST dashboard (except for some that seem not to be working). From this database a dataset was exported with data from sensors belonging to the I15_SB (I15 southbound). The "lean" dataset is the same dataset exported, but eliminating the columns of data that I do not use (see commented "drops"). There are two values named "Invalid" -> [0, 1, 2, 3] and "Failure" -> [0 - 448] that I still don't know how to use because I don't know what their values mean.

In [None]:
# Path of the "lean" southbound export (the full export minus the unused columns).
data_file_name = "datasets/la_vegas/i15_bugatti/bugatti_sb_data_lean.csv"
# Alternative input: the full, untrimmed export.
# data_file_name = "datasets/la_vegas/i15_bugatti/bugatti_sb_data.csv"

data = pd.read_csv(data_file_name)
# One-off recipe that derives the lean CSV from the full export (kept for reference;
# the output path matches the lean file read above):
# data = data.drop(columns=['Path', 'RoadIndex', 'RoadwayID', 'SegmentID', 'DeviceID',
#                           'Volume1', 'Volume2', 'Volume3', 'Volume4', 'Volume5', 'Volume6',
#                           'RoadType', 'Location', 'Polling_Period', 'DayOfWeek',
#                           'DateValue', 'HourIdx', 'Holiday'])
# data.to_csv('datasets/la_vegas/i15_bugatti/bugatti_sb_data_lean.csv', index=False)

# Preview the first 10 rows of the loaded frame.
data.head(10)

In [None]:
# Summary statistics of the freshly loaded southbound data.
data.describe()

In [None]:
#print(data['Path'].unique())

### Invalid Evaluation

Invalid = 1

In [None]:
# Isolate the rows flagged Invalid == 1, print how many there are and preview them.
data_bad = data.loc[data['Invalid'].eq(1)]
print(data_bad['Invalid'].count())
data_bad.head(10)

Invalid = 2

In [None]:
# Isolate the rows flagged Invalid == 2, print how many there are and preview them.
data_bad = data.loc[data['Invalid'].eq(2)]
print(data_bad['Invalid'].count())
data_bad.head(10)

Invalid = 3

In [None]:
# Isolate the rows flagged Invalid == 3, print how many there are and preview them.
data_bad = data.loc[data['Invalid'].eq(3)]
print(data_bad['Invalid'].count())
data_bad.head(10)

In [None]:
# Count and share of rows for each value of the 'Invalid' flag.
invalid_flags = data['Invalid']
n_rows = len(data)
invalid_breakdown = [
    ("Number of Valid Values", invalid_flags == 0),
    ("Number of Invalid Values", invalid_flags != 0),
    ("Number of Invalid = 1", invalid_flags == 1),
    ("Number of Invalid = 2", invalid_flags == 2),
    ("Number of Invalid = 3", invalid_flags == 3),
]
for label, mask in invalid_breakdown:
    n_hit = len(invalid_flags[mask])
    print(f"{label}: {n_hit} ({(n_hit/n_rows)*100}%)")

# One histogram bin per distinct flag value.
n_distinct_flags = len(invalid_flags.unique())
print(f"Number Unique of Invalid Values: {n_distinct_flags}")
hist = invalid_flags.hist(bins=n_distinct_flags)

Zoom on the Invalid values

In [None]:
# Histogram of the non-zero flags only, so the dominant 0 bucket does not hide them.
hist = data.loc[data['Invalid'] != 0, 'Invalid'].hist(bins=len(data['Invalid'].unique()) - 1)

In [None]:
#data['Invalid'][(data['Invalid'] == 2) & (data['Volume'] == 0) & (data['Occupancy'] == 0)] = 4
# Special case: rows flagged Invalid == 2 that report zero volume and zero occupancy.
special_mask = data['Invalid'].eq(2) & data['Volume'].eq(0) & data['Occupancy'].eq(0)
# Snapshot the matching flags before they are overwritten so they can still be counted.
data_invalid2_special = data.loc[special_mask, 'Invalid']
# Re-label those rows as valid (Invalid = 0).
data.loc[special_mask, 'Invalid'] = 0
special_share = (len(data_invalid2_special)/len(data))*100
print(f"Number of Invalid = 2 (Special case): {len(data_invalid2_special)} ({special_share}%)")

In [None]:
# Non-zero flag histogram again, now that the special Invalid == 2 rows were re-labelled.
hist = data.loc[data['Invalid'] != 0, 'Invalid'].hist(bins=len(data['Invalid'].unique()) - 1)

In [None]:
# Count and share of rows with Failure == 0 versus any other Failure code.
failure_codes = data['Failure']
n_samples = len(data)
n_ok = len(failure_codes[failure_codes == 0])
n_failed = len(failure_codes[failure_codes != 0])
print(f"Number of Correct Values: {n_ok} ({(n_ok/n_samples)*100}%)")
print(f"Number of Failure Values: {n_failed} ({(n_failed/n_samples)*100}%)")

print(f"Number Unique of Failure Values: {len(failure_codes.unique())}")

In [None]:
# Absolute number of non-zero Failure rows and number of distinct Failure codes.
n_failure_rows = len(data.loc[data['Failure'] != 0, 'Failure'])
n_failure_codes = len(data['Failure'].unique())
print(f"Number of Failure Values: {n_failure_rows}")
print(f"Number Unique of Failure Values: {n_failure_codes}")

In [None]:
# Southbound detector IDs. The position in this list is what the mapping code
# below turns into the numeric detector index.
# NOTE(review): presumably ordered along the highway like the northbound list; confirm.
detectors_sb_list = ['395.2.106',
  '394.3.104',
  '390.1.252',
  '390.2.250',
  '390.3.249',
  '389.1.248',
  '389.2.247',
  '388.1.245',
  '388.2.244',
  '388.3.243',
  '161.1.242',
  '161.2.241',
  '155.1.240',
  '155.2.239',
  '156.2.57',
  '147.2.56',
  '142.1.55',
  '142.2.54',
  '132.1.80',
  '136.1.47',
  '136.2.47',
  '123.2.50',
  '117.1.46',
  '117.2.46',
  '117.3.43',
  '111.1.42',
  '102.1.40',
  '102.2.606',
  '102.3.37',
  '98.2.34',
  '98.3.32',
  '91.1.31',
  '91.2.29',
  '92.2.27',
  '76.2.26',
  '77.2.24',
  '78.1.20',
  '78.2.20',
  '64.1.19',
  '69.1.52',
  '69.2.16',
  '56.1.14',
  '57.1.13',
  '57.2.11',
  '57.3.10',
  '40.1.9',
  '41.1.95',
  '41.2.140',
  '17.2.141',
  '348.1.149',
  '348.2.151',
  '348.3.150',
  '349.1.154',
  '349.3.152',
  '350.1.158',
  '350.2.159',
  '350.3.314',
  '351.1.315',
  '351.2.316',
  '351.3.317',
  '352.1.318',
  '352.2.321',
  '352.3.322',
  '353.1.323',
  '353.2.324',
  '422.1.326',
  '422.2.328',
  '423.1.329',
  '423.2.330',
  '423.3.331',
  '424.1.332',
  '424.2.333',
  '424.3.334',
  '425.1.335']
print(len(detectors_sb_list))
data_unique_detect_ID = data['DetectorID'].unique()

# Give consecutive indices (0 .. N-1, in list order) to the detectors that
# actually appear in the data. Listed detectors without any row are skipped
# and counted in count_loss so the remaining indices stay gap-free.
present_ids = set(data_unique_detect_ID)
detector_id_map = {}
count_loss = 0
for position, detector in enumerate(detectors_sb_list):
    if detector in present_ids:
        detector_id_map[detector] = position - count_loss
    else:
        count_loss += 1
print(f'Detectors lost = {count_loss}')

data_detect_ID = data['DetectorID']

# Series.map translates the whole column at once and keeps each row's own
# index label, so the assignment below cannot misalign even if `data` no
# longer has a default RangeIndex.
data_detect_new_ID = data_detect_ID.map(detector_id_map)
# Keep failing loudly (as the dict lookup did) when the data contains a
# detector that is not in detectors_sb_list, instead of leaving NaN indices.
unmapped_ids = data_detect_ID[data_detect_new_ID.isna()].unique()
if len(unmapped_ids) > 0:
    raise KeyError(f'DetectorID values missing from detectors_sb_list: {list(unmapped_ids)}')
data_detect_new_ID = data_detect_new_ID.astype('int64')
print(data_detect_new_ID)
data['DetectID'] = data_detect_new_ID

In [None]:
# Parse the timestamp strings, then order the rows chronologically and,
# within one timestamp, by detector index.
data['DateTimeStamp'] = pd.to_datetime(data['DateTimeStamp'])
data = data.sort_values(['DateTimeStamp', 'DetectID'], ascending=True)

# Sanity check on the earliest timestamp.
date_time_obj = data['DateTimeStamp'].iloc[0]
print('Date:', date_time_obj.date())
print('Time:', date_time_obj.time())
print('Minute:', date_time_obj.minute)
print('Date-time:', date_time_obj)

# Distinct timestamps and how many there are.
distinct_stamps = data['DateTimeStamp'].unique()
print(distinct_stamps)
print(len(distinct_stamps))

In [None]:
# Distinct timestamps, in the order they appear in the (already sorted) data.
date_unq = pd.Series(data['DateTimeStamp'].unique())
date_rest = []    # clock-minute gap for every pair of consecutive timestamps
date_rare = []    # [later, earlier] pairs whose gap could not be expressed in minutes
date_no_15 = []   # [gap, later, earlier] for every gap other than 15 minutes
# NOTE(review): the gap is derived from the hour/minute clock fields only; the
# date and the seconds are ignored, so same-day gaps longer than one hour are
# under-reported. Left unchanged so results stay comparable with the
# northbound analysis; confirm whether a true timedelta should be used instead.
for earlier, later in zip(date_unq.iloc[:-1], date_unq.iloc[1:]):
    if later.hour == earlier.hour:
        rest = later.minute - earlier.minute
    elif (later.hour > earlier.hour) or ((later.hour == 0) and (earlier.hour == 23)):
        rest = later.minute + 60 - earlier.minute
    else:
        # Clock went backwards without a 23h -> 0h rollover: treat as a long outage.
        rest = -1
        date_rare.append([later, earlier])
    date_rest.append(rest)
    if rest != 15:
        date_no_15.append([rest, later, earlier])

expected_datetimes = (60/15)*24*365*2
print(f'\nTotal DateTimes = {len(date_unq)} vs Total DateTimes in 2 years with a period of 15 min = {expected_datetimes} --> DateLoss = {100 - (100*(len(date_unq)/expected_datetimes))}')
print(f'Periods other than 15 min = {len(date_no_15)}\n')

# Unusual data
print(f'Very high periods (2h a 2 días) = {len(date_rare)}')
print(date_rare)

# Tally every distinct gap directly from the measured gaps. The previous
# hand-rolled tally set the 15-minute bucket to len(date_unq) - len(date_no_15),
# one too many because there are only len(date_unq) - 1 gaps, and raised
# ValueError when no 15-minute gap existed.
gap_values, gap_counts = np.unique(date_rest, return_counts=True)
num = list(gap_values)
count = gap_counts.astype(float)

fusion = [[gap, occurrences] for gap, occurrences in zip(num, count)]
print('\nPeriods vs number of samples')
print(fusion)

In [None]:
# Average all lanes of a detector into one row per (timestamp, detector).
# numeric_only=True restricts the mean to numeric columns: the string column
# 'DetectorID' is still present here, and on pandas >= 2.0 averaging it raises
# a TypeError instead of being silently dropped.
data = data.groupby(['DateTimeStamp','DetectID'], as_index=False).mean(numeric_only=True)

In [None]:
# Preview the first 10 rows of the grouped frame.
data.head(10)

In [None]:
# Summary statistics of the grouped frame.
data.describe()

In [None]:
# Summary restricted to the datetime64[ns] and object columns.
data.describe(include=['datetime64[ns]', 'object'])

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()
print(f'Shape of the data = {data.shape}')
#print(data.columns)

In [None]:
# Number of missing values in each column.
print('NA Count:')
na_per_column = data.isna().sum()
print(na_per_column)

In [None]:
# Count and share of grouped rows for each value of the 'Invalid' column.
invalid_flags = data['Invalid']
n_rows = len(data)
invalid_breakdown = [
    ("Number of Valid Values", invalid_flags == 0),
    ("Number of Invalid Values", invalid_flags != 0),
    ("Number of Invalid = 1", invalid_flags == 1),
    ("Number of Invalid = 2", invalid_flags == 2),
    ("Number of Invalid = 3", invalid_flags == 3),
]
for label, mask in invalid_breakdown:
    n_hit = len(invalid_flags[mask])
    print(f"{label}: {n_hit} ({(n_hit/n_rows)*100}%)")

# One histogram bin per distinct value.
n_distinct_flags = len(invalid_flags.unique())
print(f"Number Unique of Invalid Values: {n_distinct_flags}")
hist = invalid_flags.hist(bins=n_distinct_flags)

In [None]:
# Histogram of the non-zero 'Invalid' values of the grouped data.
hist = data.loc[data['Invalid'] != 0, 'Invalid'].hist(bins=len(data['Invalid'].unique()) - 1)

In [None]:
# Count and share of grouped rows with Failure == 0 versus any other value.
failure_codes = data['Failure']
n_samples = len(data)
n_ok = len(failure_codes[failure_codes == 0])
n_failed = len(failure_codes[failure_codes != 0])
print(f"Number of Correct Values: {n_ok} ({(n_ok/n_samples)*100}%)")
print(f"Number of Failure Values: {n_failed} ({(n_failed/n_samples)*100}%)")

print(f"Number Unique of Failure Values: {len(failure_codes.unique())}")

In [None]:
# Samples per detector, one histogram bin per distinct detector index.
hist = data['DetectID'].hist(bins=len(pd.unique(data['DetectID'])))

In [None]:
# Number of rows (timestamps) for each detector index.
print('Samples by detector "Totals"')
data.groupby('DetectID')['DateTimeStamp'].count()

In [None]:
print('Samples by detector "Percentage"')
# Share of the southbound distinct timestamps for which each detector has a row.
# The previous denominator, 63092, was the northbound timestamp count; date_unq
# holds the distinct timestamps of the data loaded in this section.
data.groupby('DetectID')['DateTimeStamp'].count()*100/len(date_unq)
