In [42]:
import pandas as pd
from google.cloud import bigquery
from datetime import datetime, timedelta
import joblib
import numpy as np

In [43]:
project_id = "banded-setting-428309-q4"
dataset_id = "datos"

In [24]:
client = bigquery.Client(project='banded-setting-428309-q4')
one_week_ago = datetime.now() - timedelta(days=7)
    
query = f"""
SELECT
    FORMAT_TIMESTAMP('%Y-%m-%d', Timestamp) AS Day,
    FORMAT_TIMESTAMP('%H', Timestamp) AS Hour,
    CASE
        WHEN FORMAT_TIMESTAMP('%M', Timestamp) BETWEEN '00' AND '07' THEN '00'
        WHEN FORMAT_TIMESTAMP('%M', Timestamp) BETWEEN '08' AND '22' THEN '15'
        WHEN FORMAT_TIMESTAMP('%M', Timestamp) BETWEEN '23' AND '37' THEN '30'
        WHEN FORMAT_TIMESTAMP('%M', Timestamp) BETWEEN '38' AND '52' THEN '45'
        ELSE '00'
    END AS Minute,
    ct.descripcion,
    bd.Value
FROM `banded-setting-428309-q4.datos.bronze-data` bd
LEFT JOIN `banded-setting-428309-q4.datos.col-tag` ct on bd.Tag = ct.tag
WHERE DATE(Timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 140 DAY) AND CURRENT_DATE()
"""
    
query_job = client.query(query)
results = query_job.result()

data = []   
for row in results:     
    data.append(dict(row))
    
   
df = pd.DataFrame(data)

In [26]:
df.to_parquet('140_days_df.parquet')

In [44]:
df = pd.read_parquet('140_days_df.parquet')

In [45]:
df_max_values = df.groupby(["descripcion", "Day", "Hour", "Minute"]).agg({"Value": "max"}).reset_index()

df_max_values['dayhourminute'] = df_max_values['Day'] + ' ' + df_max_values['Hour'] + ':' + df_max_values['Minute']
df_unpivot = df_max_values.pivot_table(index="dayhourminute", columns="descripcion", values="Value", aggfunc="max").reset_index()

In [46]:
df_unpivot

descripcion,dayhourminute,% BOMBA ALIM P45 M2 GLUCOSA,% BOMBA ALIMENT COLUMNAS 353509,% BOMBA ALIMENTACION CUBA M7,% BOMBA ALIMENTACION P45 M2 H0,% REG. VF BOMBA ALIMENT. SWENSON,% VAR. BOMBA SDA. CUBETA CH,% VARIADOR BOMBA ALIM. C.V.,% VF BOMBA ALIMENTACION W7,% VF BOMBA SALIDA SWENSON A CCHH,...,VOL RECUPERACION ANION GRUPO H0,VOL. RECUP. ANIÓN GRUPO 4,VOL. RECUP. CATIÓN GRUPO 4,VOLUMEN CIP A EDAR,VOLUMEN CIP A PEQUEÑAS AGUAS,VOLUMEN CIP LAVADO AGUA,VOLUMEN LAVADO CIP,VÁLVULA REGULADORA NIVEL M10,W10 STATUS,W7 STATUS
0,2024-02-21 00:00,39.064949,,245.286682,5786.234863,67.380882,1.321786,60.302448,100.00000,,...,,,,,,,,,,
1,2024-02-21 00:15,38.904778,,,6412.371094,78.564133,10.185977,49.215141,,,...,,,,,,,,,,
2,2024-02-21 00:30,32.004215,,,5754.599609,70.098381,7.656950,33.675518,,,...,,,,,,,,,,
3,2024-02-21 00:45,32.850964,,,5684.577148,66.539200,2.305533,27.622612,,40.391308,...,,,,,,,,,,
4,2024-02-21 01:00,32.034214,,,5831.075195,1.978051,3.940078,58.604836,53.21212,,...,,,,,,,,,,200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11671,2024-06-21 14:45,,,,9499.379883,77.118904,23.564919,,,85.176178,...,,,,,,,,,,
11672,2024-06-21 15:00,,,,9277.718750,,8.422264,,,,...,,,,,,,,,,
11673,2024-06-21 15:15,0.000000,,,7372.398438,81.652275,20.846022,,,85.796364,...,,,,,,,,,,
11674,2024-06-21 15:30,87.305496,,,7776.293945,71.021614,22.952904,,,71.709259,...,,,,,,,,,,


In [47]:
columns_to_check = [
    "COR TITÁNIC AZÚCARES", 
    "COT TITÁNIC AZÚCARES NUEVO", 
    "COT AGUAS ÁCIDAS", 
    "COT AGUAS ÁCIDAS NUEVO"
]

conditions = pd.Series([False] * len(df_unpivot))
for col in columns_to_check:
    if col in df_unpivot.columns:
        conditions |= (df_unpivot[col] > 2500)

df_unpivot["flag"] = conditions.astype(int)

df_unpivot["dayhourminute"] = pd.to_datetime(df_unpivot["dayhourminute"])

df_unpivot.head()


descripcion,dayhourminute,% BOMBA ALIM P45 M2 GLUCOSA,% BOMBA ALIMENT COLUMNAS 353509,% BOMBA ALIMENTACION CUBA M7,% BOMBA ALIMENTACION P45 M2 H0,% REG. VF BOMBA ALIMENT. SWENSON,% VAR. BOMBA SDA. CUBETA CH,% VARIADOR BOMBA ALIM. C.V.,% VF BOMBA ALIMENTACION W7,% VF BOMBA SALIDA SWENSON A CCHH,...,VOL. RECUP. ANIÓN GRUPO 4,VOL. RECUP. CATIÓN GRUPO 4,VOLUMEN CIP A EDAR,VOLUMEN CIP A PEQUEÑAS AGUAS,VOLUMEN CIP LAVADO AGUA,VOLUMEN LAVADO CIP,VÁLVULA REGULADORA NIVEL M10,W10 STATUS,W7 STATUS,flag
0,2024-02-21 00:00:00,39.064949,,245.286682,5786.234863,67.380882,1.321786,60.302448,100.0,,...,,,,,,,,,,0
1,2024-02-21 00:15:00,38.904778,,,6412.371094,78.564133,10.185977,49.215141,,,...,,,,,,,,,,0
2,2024-02-21 00:30:00,32.004215,,,5754.599609,70.098381,7.65695,33.675518,,,...,,,,,,,,,,0
3,2024-02-21 00:45:00,32.850964,,,5684.577148,66.5392,2.305533,27.622612,,40.391308,...,,,,,,,,,,0
4,2024-02-21 01:00:00,32.034214,,,5831.075195,1.978051,3.940078,58.604836,53.21212,,...,,,,,,,,,200.0,0


In [48]:
one_hour_earlier = df_unpivot["dayhourminute"] - pd.Timedelta(hours=1)
df_unpivot["flag"] |= df_unpivot["dayhourminute"].isin(one_hour_earlier[conditions])

df_unpivot["flag"] = df_unpivot["flag"].astype(int)

df_unpivot.head()

descripcion,dayhourminute,% BOMBA ALIM P45 M2 GLUCOSA,% BOMBA ALIMENT COLUMNAS 353509,% BOMBA ALIMENTACION CUBA M7,% BOMBA ALIMENTACION P45 M2 H0,% REG. VF BOMBA ALIMENT. SWENSON,% VAR. BOMBA SDA. CUBETA CH,% VARIADOR BOMBA ALIM. C.V.,% VF BOMBA ALIMENTACION W7,% VF BOMBA SALIDA SWENSON A CCHH,...,VOL. RECUP. ANIÓN GRUPO 4,VOL. RECUP. CATIÓN GRUPO 4,VOLUMEN CIP A EDAR,VOLUMEN CIP A PEQUEÑAS AGUAS,VOLUMEN CIP LAVADO AGUA,VOLUMEN LAVADO CIP,VÁLVULA REGULADORA NIVEL M10,W10 STATUS,W7 STATUS,flag
0,2024-02-21 00:00:00,39.064949,,245.286682,5786.234863,67.380882,1.321786,60.302448,100.0,,...,,,,,,,,,,0
1,2024-02-21 00:15:00,38.904778,,,6412.371094,78.564133,10.185977,49.215141,,,...,,,,,,,,,,0
2,2024-02-21 00:30:00,32.004215,,,5754.599609,70.098381,7.65695,33.675518,,,...,,,,,,,,,,0
3,2024-02-21 00:45:00,32.850964,,,5684.577148,66.5392,2.305533,27.622612,,40.391308,...,,,,,,,,,,0
4,2024-02-21 01:00:00,32.034214,,,5831.075195,1.978051,3.940078,58.604836,53.21212,,...,,,,,,,,,200.0,0


In [49]:
col_drop = ['COT AGUAS ÁCIDAS NUEVO', 'COT AGUAS ÁCIDAS', 'COR TITÁNIC AZÚCARES', 'COT TITÁNIC AZÚCARES NUEVO','dayhourminute']  
df = df_unpivot.drop(columns=[col for col in col_drop if col in df_unpivot.columns])
df = df.fillna(0)

In [50]:
df

descripcion,% BOMBA ALIM P45 M2 GLUCOSA,% BOMBA ALIMENT COLUMNAS 353509,% BOMBA ALIMENTACION CUBA M7,% BOMBA ALIMENTACION P45 M2 H0,% REG. VF BOMBA ALIMENT. SWENSON,% VAR. BOMBA SDA. CUBETA CH,% VARIADOR BOMBA ALIM. C.V.,% VF BOMBA ALIMENTACION W7,% VF BOMBA SALIDA SWENSON A CCHH,BOMBA RECUPERACIÓN GRUPOS H0,...,VOL. RECUP. ANIÓN GRUPO 4,VOL. RECUP. CATIÓN GRUPO 4,VOLUMEN CIP A EDAR,VOLUMEN CIP A PEQUEÑAS AGUAS,VOLUMEN CIP LAVADO AGUA,VOLUMEN LAVADO CIP,VÁLVULA REGULADORA NIVEL M10,W10 STATUS,W7 STATUS,flag
0,39.064949,0.0,245.286682,5786.234863,67.380882,1.321786,60.302448,100.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,38.904778,0.0,0.000000,6412.371094,78.564133,10.185977,49.215141,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,32.004215,0.0,0.000000,5754.599609,70.098381,7.656950,33.675518,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,32.850964,0.0,0.000000,5684.577148,66.539200,2.305533,27.622612,0.00000,40.391308,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,32.034214,0.0,0.000000,5831.075195,1.978051,3.940078,58.604836,53.21212,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11671,0.000000,0.0,0.000000,9499.379883,77.118904,23.564919,0.000000,0.00000,85.176178,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
11672,0.000000,0.0,0.000000,9277.718750,0.000000,8.422264,0.000000,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11673,0.000000,0.0,0.000000,7372.398438,81.652275,20.846022,0.000000,0.00000,85.796364,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11674,87.305496,0.0,0.000000,7776.293945,71.021614,22.952904,0.000000,0.00000,71.709259,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [51]:
X = df.drop('flag', axis=1)
X = X.fillna(0)
X.head()

descripcion,% BOMBA ALIM P45 M2 GLUCOSA,% BOMBA ALIMENT COLUMNAS 353509,% BOMBA ALIMENTACION CUBA M7,% BOMBA ALIMENTACION P45 M2 H0,% REG. VF BOMBA ALIMENT. SWENSON,% VAR. BOMBA SDA. CUBETA CH,% VARIADOR BOMBA ALIM. C.V.,% VF BOMBA ALIMENTACION W7,% VF BOMBA SALIDA SWENSON A CCHH,BOMBA RECUPERACIÓN GRUPOS H0,...,VOL RECUPERACION ANION GRUPO H0,VOL. RECUP. ANIÓN GRUPO 4,VOL. RECUP. CATIÓN GRUPO 4,VOLUMEN CIP A EDAR,VOLUMEN CIP A PEQUEÑAS AGUAS,VOLUMEN CIP LAVADO AGUA,VOLUMEN LAVADO CIP,VÁLVULA REGULADORA NIVEL M10,W10 STATUS,W7 STATUS
0,39.064949,0.0,245.286682,5786.234863,67.380882,1.321786,60.302448,100.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.904778,0.0,0.0,6412.371094,78.564133,10.185977,49.215141,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32.004215,0.0,0.0,5754.599609,70.098381,7.65695,33.675518,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,32.850964,0.0,0.0,5684.577148,66.5392,2.305533,27.622612,0.0,40.391308,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,32.034214,0.0,0.0,5831.075195,1.978051,3.940078,58.604836,53.21212,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0


In [52]:
y = df['flag']
y.value_counts()

0    6440
1    5236
Name: flag, dtype: int64

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
model = LogisticRegression()

# Entrenar el modelo
model.fit(X_train_scaled, y_train)

# Hacer predicciones
y_pred = model.predict(X_test_scaled)

# Evaluar el modelo
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(cm)
print('Classification Report:')
print(report)

Accuracy: 0.6712328767123288
Confusion Matrix:
[[1004  306]
 [ 462  564]]
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.77      0.72      1310
           1       0.65      0.55      0.59      1026

    accuracy                           0.67      2336
   macro avg       0.67      0.66      0.66      2336
weighted avg       0.67      0.67      0.67      2336



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
import joblib
model_filename = 'logistic_model.pkl'
joblib.dump(model, model_filename)
print(f'Model saved as {model_filename}')

Model saved as logistic_model.pkl


In [56]:
sclaer_filename = 'scaler_model.pkl'
joblib.dump(scaler, sclaer_filename)
print(f'Scaler saved as {sclaer_filename}')

Scaler saved as scaler_model.pkl
