In [9]:
import pandas as pd

In [10]:
df = pd.read_csv('lista_zawiadomień.csv', encoding='utf-8-sig', sep=";")

# Handle missing values in Nr zlec., Opis i Priorytet
df['Nr zlec.'] = df['Nr zlec.'].fillna('Brak danych')
df['Opis'] = df['Opis'].fillna('Brak opisu')
df['Priorytet'] = df['Priorytet'].fillna('Nieokreślony')

# Delete rows with NaN values in Oznaczenie (name of the machine)
df = df.dropna(subset=['Oznaczenie'])

# Delete not needed columns: Utworzone przez, Opis, Status systemu and Nr zlec.
df = df.drop(columns=['Utworzone przez', 'Opis', 'Status systemu', "Nr zlec."])

# Change type of the column Data zawiadom. to date
df['Data zawiadom.'] = pd.to_datetime(df['Data zawiadom.'], format='%d.%m.%Y', errors='coerce')

df.to_csv("cleaned_lista_zawiadomień.csv", index=False, encoding="utf-8")
df

Unnamed: 0,Rodzaj zawiad.,Zawiadomienie,Data zawiadom.,Oznaczenie,Priorytet
0,1P,12708097,2025-06-30,1-Zespół pomp próżniowych BUSCH,4-niski
1,1P,12707957,2025-06-30,1-Trans. Sandwick za walcami 64M,Nieokreślony
2,1P,12707755,2025-06-30,1-Zbiornik olejowy 1m^3,Nieokreślony
3,1P,12707745,2025-06-30,1-Piec wafla górnego KB2,Nieokreślony
4,PQ,12707726,2025-06-30,1-Installations Other,3-średni
...,...,...,...,...,...
160648,PM,10143679,2014-04-01,2-Kinder Joy lini-Nalewarka goccia,Nieokreślony
160649,PM,10143678,2014-04-01,1-Kinder Joy lini-Modelator do K Joya,Nieokreślony
160650,PM,10143666,2014-04-01,1-Linia pakowania IFCR,Nieokreślony
160652,PM,10143508,2014-04-01,1-Tanks,Nieokreślony


In [11]:
import json, re
from pathlib import Path

def hierarchy_to_json(text: str,
                       out_path: str = "hierarchy.json",
                       key_re=r"(\d{6,8})|PLPA-[\w\-]+",
                       value_re=r"[A-ZŻŹĆĄŚĘŁÓŃ0-9].*?(\s{2,}|$)",
                       strip_columns=True):

    lines = text.splitlines()

    def indent_width(line):
        trimmed = re.sub(r"^[\s\|\-\+]*", "", line)
        return len(line) - len(trimmed)

    root = {}                 # ---- fixed: create first
    stack = [(-1, root)]      # ---- and then put on the stack

    for raw in lines:
        if not raw.strip():
            continue
        indent = indent_width(raw)

        key_match = re.search(key_re, raw)
        if not key_match:
            continue
        key = key_match.group(0)

        value_part = raw.split(key, 1)[-1]
        value_match = re.search(value_re, value_part)
        value = value_match.group(0).strip() if value_match else ""

        if strip_columns:
            value = re.sub(r"\s{2,}", " ", value)

        while stack and indent <= stack[-1][0]:
            stack.pop()

        parent = stack[-1][1]
        parent[key] = {"description": value, "children": {}}
        stack.append((indent, parent[key]["children"]))

    def squash(node):
        """
        Recursively turn the internal representation into the final JSON structure,
        retaining both the description and any children.
        """
        out = {"_desc": node["description"]}
        if node["children"]:
            out["children"] = {k: squash(v) for k, v in node["children"].items()}
        return out


    json_tree = {k: squash(v) for k, v in root.items()}

    Path(out_path).write_text(
        json.dumps(json_tree, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"JSON saved → {out_path}")
    return json_tree

with open("urządzenie_struktura.txt", encoding="utf-8") as f:
    txt = f.read()

# nested = hierarchy_to_json(txt, "urządzenia_struktura.json")

In [12]:
import pandas as pd, json, re

df = pd.read_csv("cleaned_lista_zawiadomień.csv", encoding="utf-8")
with open("urządzenia_struktura.json", encoding="utf-8") as f:
    tree = json.load(f)


def flatten(node, *, parent_1_desc=None,
            linia_ogolna=None,  linia_ogolna_desc=None,
            linia_szczeg=None, linia_szczeg_desc=None):
    result = {}

    for key, value in node.items():
        desc = value.get("_desc", "").strip()

        # ── production line codes & names ─────────────────────────
        if '-' in key:
            dash_cnt = key.count('-')
            if dash_cnt == 3:                       # e.g. PLPA-PR-U11-010703
                linia_ogolna      = key
                linia_ogolna_desc = desc
            elif dash_cnt == 4:                     # e.g. PLPA-PR-U11-010703-001
                linia_szczeg      = key
                linia_szczeg_desc = desc

        # ── top-level “1- …” asset ───────────────────────────────
        if desc.startswith("1-"):
            parent_1_desc = desc

        # store current node
        if desc:
            result[desc] = {
                "ozn": parent_1_desc,
                "linia_ogolna":      linia_ogolna,
                "linia_ogolna_desc": linia_ogolna_desc,
                "linia_szczeg":      linia_szczeg,
                "linia_szczeg_desc": linia_szczeg_desc,
                "id": key
            }

        # recurse
        children = value.get("children", {})
        result.update(
            flatten(children,
                    parent_1_desc=parent_1_desc,
                    linia_ogolna=linia_ogolna,      linia_ogolna_desc=linia_ogolna_desc,
                    linia_szczeg=linia_szczeg,      linia_szczeg_desc=linia_szczeg_desc)
        )
    return result

lookup = flatten(tree)

cols = ["First Level Ozn",
        "Linia ogólna", "Linia ogólna nazwa",
        "Linia szczegółowa", "Linia szczegółowa nazwa"]

def match(row):
    m = lookup.get(str(row["Oznaczenie"]).strip())
    if m:
        return pd.Series([
            m["ozn"],
            m["linia_ogolna"],      m["linia_ogolna_desc"],
            m["linia_szczeg"],      m["linia_szczeg_desc"]
        ])
    return pd.Series([pd.NA] * len(cols))

df[cols] = df.apply(match, axis=1)


In [13]:
df = df.dropna(subset=cols, how="all").reset_index(drop=True)
df.to_csv("zgloszenia_with_hierarchy.csv", index=False)

In [14]:
na_counts = df[cols].isna().sum()        
print(na_counts)

First Level Ozn             8195
Linia ogólna                   0
Linia ogólna nazwa             0
Linia szczegółowa          18174
Linia szczegółowa nazwa    18174
dtype: int64


In [15]:
unique_counts = df[cols].nunique(dropna=True) 
print(unique_counts)

First Level Ozn            1807
Linia ogólna                 81
Linia ogólna nazwa           81
Linia szczegółowa           106
Linia szczegółowa nazwa     106
dtype: int64


In [16]:
df = df[(df["Rodzaj zawiad."] == "1P") | (df["Rodzaj zawiad."] == "PM")]
df.to_csv("zgloszenia_with_hierarchy.csv", index=False)
df

Unnamed: 0,Rodzaj zawiad.,Zawiadomienie,Data zawiadom.,Oznaczenie,Priorytet,First Level Ozn,Linia ogólna,Linia ogólna nazwa,Linia szczegółowa,Linia szczegółowa nazwa
0,1P,12708097,2025-06-30,1-Zespół pomp próżniowych BUSCH,4-niski,1-Zespół pomp próżniowych BUSCH,PLPA-PR-U12-010801,KINDER CARDS,PLPA-PR-U12-010801-002,KINDER CARDS MODELATOR
1,1P,12707957,2025-06-30,1-Trans. Sandwick za walcami 64M,Nieokreślony,1-Trans. Sandwick za walcami 64M,PLPA-PR-U32-012110,Prep2 Krem KB,,
2,1P,12707755,2025-06-30,1-Zbiornik olejowy 1m^3,Nieokreślony,1-Zbiornik olejowy 1m^3,PLPA-PR-U32-012211,Ciemny krem (K CARDS/KJ),,
3,1P,12707745,2025-06-30,1-Piec wafla górnego KB2,Nieokreślony,1-Piec wafla górnego KB2,PLPA-PR-U43-011445,KB 2.,PLPA-PR-U43-011445-001,KB 2. Piece z nawilżaczami
5,1P,12707707,2025-06-30,1-Silos Cukier S-3 Nowy,Nieokreślony,1-Silos Cukier S-3 Nowy,PLPA-PR-U32-012011,PREP. Kremu KB,PLPA-PR-U32-012011-001,Preparacja kremu KJ/KB/KBW
...,...,...,...,...,...,...,...,...,...,...
123450,PM,10143837,2014-04-02,1-Łukowy przechładzacz wafla 1 RAF ARKA,Nieokreślony,1-Łukowy przechładzacz wafla 1 RAF ARKA,PLPA-PR-U11-010703,RAFFAELLO LINIA 2,PLPA-PR-U11-010703-001,PIECE Z NAWILŻACZAMI RAFFAELLO
123451,PM,10143754,2014-04-01,1- Maszyna termoformująca TFT2,Nieokreślony,1- Maszyna termoformująca TFT2,PLPA-PR-U22-019329,KJ TERMOFORMATURA,PLPA-PR-U22-019329-001,KJ TERMOFORMOWANIE
123452,PM,10143739,2014-04-01,1-Piec do wypieku wafla RAF 1,Nieokreślony,1-Piec do wypieku wafla RAF 1,PLPA-PR-U11-010703,RAFFAELLO LINIA 2,PLPA-PR-U11-010703-001,PIECE Z NAWILŻACZAMI RAFFAELLO
123453,PM,10143720,2014-04-01,1-Drukarki,Nieokreślony,1-Drukarki,PLPA-PR-U43-011444,KB / KBW Isola Multipack,,


In [17]:
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path

DATA_PATH = Path("zgloszenia_with_hierarchy.csv")   
df = (
    pd.read_csv(DATA_PATH)
      .assign(Data_zawiadom=pd.to_datetime(
              pd.to_datetime(df["Data zawiadom."], errors="coerce").dt.date))
)

df = df.rename(columns={
    "Linia szczegółowa nazwa": "Linia_nazwa_szczeg",
    "Linia ogólna nazwa":      "Linia_nazwa_ogólna"
})

df["YearMonth"] = df["Data_zawiadom"].dt.to_period("M").astype(str)
metric = (
    df.groupby(["Linia_nazwa_szczeg", "YearMonth"])
      .size().rename("Count")
      .reset_index()
)

def make_trace(line_name):
    tmp = metric.loc[metric["Linia_nazwa_szczeg"].eq(line_name)]
    return go.Bar(
        x=tmp["YearMonth"],
        y=tmp["Count"],
        name=line_name,
        visible=False,     
    )

all_lines = sorted(metric["Linia_nazwa_szczeg"].dropna().unique())

traces = [make_trace(l) for l in all_lines]
traces[0].visible = True

buttons = [
    dict(
        label=line,
        method="update",
        args=[{"visible": [i == k for i in range(len(traces))]},
              {"title": f"Liczba zgłoszeń – {line}"}]
    )
    for k, line in enumerate(all_lines)
]

fig = go.Figure(
    data=traces,
    layout=dict(
        title=f"Liczba zgłoszeń – {all_lines[0]}",
        xaxis_title="Rok-miesiąc",
        yaxis_title="Liczba zgłoszeń",
        updatemenus=[dict(
            active=0,
            buttons=buttons,
            bgcolor="white"
        )]
    )
)
fig.update_layout(barmode="group", height=500, width=950)
fig.show()

In [18]:
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [19]:
# -------------------- 0  Imports & paths --------------------------
import pandas as pd, numpy as np, lightgbm as lgb
from pathlib import Path
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import HashingVectorizer
from scipy import sparse

DATA   = Path("zgloszenia_with_hierarchy.csv")
HORIZON = 1     
FREQ    = "D"          
SEED    = 42

# -------------------- 1  Load & quick clean -----------------------
raw = (pd.read_csv(DATA, parse_dates=["Data zawiadom."])
         .rename(columns={"Data zawiadom."       : "date",
                          "Linia szczegółowa"    : "line_id",
                          "Linia szczegółowa nazwa": "line_desc",
                          "Linia ogólna"         : "line_id_og",
                          "Linia ogólna nazwa"   : "line_desc_og"}))

# keep only tickets that belong to a specific line (drop NaNs)
raw = raw.dropna(subset=["line_id"])

# -------------------- 2  Build the rolling calendar ---------------
awarie = raw.loc[raw["Rodzaj zawiad."].str.upper() == "1P",
                 ["line_id", "date"]].sort_values(["line_id","date"])
awarie["next_awaria"] = awarie.groupby("line_id")["date"].shift(-1)

# calendar per line
cal_parts = []
for line, grp in awarie.groupby("line_id"):
    rng = pd.date_range(grp["date"].min(), grp["date"].max(), freq=FREQ)
    cal_parts.append(pd.DataFrame({"line_id": line, "snap_date": rng}))
calendar = pd.concat(cal_parts, ignore_index=True)

calendar = calendar.merge(awarie,
                          left_on=["line_id","snap_date"],
                          right_on=["line_id","date"],
                          how="left").drop(columns="date")
calendar["next_awaria"] = (
    calendar.groupby("line_id")["next_awaria"].fillna(method="bfill")
)
calendar = calendar[calendar["next_awaria"].notna()]

calendar["days_until_next"] = (
    (calendar["next_awaria"] - calendar["snap_date"]).dt.days
)
calendar["y"] = (calendar["days_until_next"] <= HORIZON).astype(int)

# -------------------- 3  Feature engineering ----------------------
feat = calendar.copy()

# recency (= time since previous awaria)
feat["days_since_prev"] = (
    feat.groupby("line_id")["snap_date"].diff().dt.days.fillna(9999)
)

# seasonality
feat["month"]   = feat["snap_date"].dt.month.astype("int8")
feat["weekday"] = feat["snap_date"].dt.weekday.astype("int8")

# merge static descriptors
static = raw[["line_id","line_desc","line_id_og","line_desc_og"]].drop_duplicates("line_id")
feat = feat.merge(static, on="line_id", how="left")

# -------------------- 4  Vectorise text → sparse matrix -----------

text_cols  = ["line_desc", "line_desc_og"]
num_cols   = ["days_since_prev", "month", "weekday"]

vec = HashingVectorizer(n_features=2**18,
                        norm=None, alternate_sign=False,
                        token_pattern=r"(?u)\b\w+\b")

text_sparse = vec.transform(
    feat[text_cols].fillna("").agg(" ".join, axis=1)
)

X_num = feat[num_cols].values.astype("float32")
X = sparse.hstack([text_sparse, X_num], format="csr")
y = feat["y"].values.astype("int8")

# -------------------- 5  Train / valid split & LightGBM ----------
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

params = dict(objective="binary",
              metric="auc",
              learning_rate=0.05,
              num_leaves=128,
              feature_fraction=0.8,
              bagging_fraction=0.8,
              bagging_freq=5,
              seed=SEED,
              n_jobs=-1)

dtrain = lgb.Dataset(X_tr, y_tr)
dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)

gbm = lgb.train(params,
                dtrain,
                num_boost_round=2000,
                valid_sets=[dvalid],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=200),
                    lgb.log_evaluation(period=250)
                ])

print("\nValidation AUC:",
      roc_auc_score(y_val, gbm.predict(X_val, num_iteration=gbm.best_iteration)))
print("Validation AP :", 
      average_precision_score(y_val, gbm.predict(X_val, num_iteration=gbm.best_iteration)))

# -------------------- 6  Helper: predict probability --------------
def prob_awaria(line_id, today, horizon=HORIZON):
    """Return P(awaria in [today, today+horizon]) for one line."""
    today = pd.to_datetime(today).normalize()

    last = feat.loc[
        (feat["line_id"] == line_id) &
        (feat["snap_date"] <= today)
    ].sort_values("snap_date").tail(1)
    if last.empty:
        return np.nan

    Xq = sparse.hstack([
        vec.transform(last[text_cols].fillna("").agg(" ".join, axis=1)),
        last[num_cols].values.astype("float32")
    ], format="csr")

    return float(gbm.predict(Xq, num_iteration=gbm.best_iteration))

# example ----------------------------------------------------------
example_line = "PLPA-PR-U12-010801-002"
print(f"\nP(awaria ≤ {HORIZON} d) for {example_line} on 2025-07-10 : "
      f"{prob_awaria(example_line,'2025-07-10'):.2%}")



SeriesGroupBy.fillna is deprecated and will be removed in a future version. Use obj.ffill() or obj.bfill() for forward or backward filling instead. If you want to fill with a single value, use Series.fillna instead


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



[LightGBM] [Info] Number of positive: 42093, number of negative: 80104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 279
[LightGBM] [Info] Number of data points in the train set: 122197, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344468 -> initscore=-0.643444
[LightGBM] [Info] Start training from score -0.643444
Training until validation scores don't improve for 200 rounds
[250]	valid_0's auc: 0.898
Early stopping, best iteration is:
[120]	valid_0's auc: 0.898647

Validation AUC: 0.8986474479162339
Validation AP : 0.8418597208337253

P(awaria ≤ 1 d) for PLPA-PR-U12-010801-002 on 2025-07-10 : 75.26%



Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

