#### Small demonstration of crosstab, correlations and phik-matrix with the steel data, 2025

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# pip install phik
import phik

In [3]:
df = pd.read_csv('../data/sulatto_fulldata2025_version2.csv', delimiter=";", index_col=None)

  df = pd.read_csv('../data/sulatto_fulldata2025_version2.csv', delimiter=";", index_col=None)


In [4]:
# these are the columns that have commas instead of dots in the numeric values
# we're gonna fix these with a for -loop below
fixables = ["Length", 
            "Thickness",
            'CCM.TD. Steel weight in tundish Value',
            'CCM.TD. Tundish inflow Value', 'CCM.TD. Stopper Last Position Value',
            'CCM.TD. Stopper position max Value',
            'CCM.TD. Stopper position min Value',
            'CCM.TD. Stopper pos chg max segment Value',
            'CCM.TD. Stopper position chg 10 s Value',
            'CCM.TD. Stopper position slab chg Value', 
            'CCM.MD. Level average Value',
            'CCM.MD. Level change (max-min) Value', 'CCM.MD. Level std_dev Value',
            'CCM.MD. Level std_dev west Value', 'CCM.MD. Level std_dev east Value',
            'CCM.MIX. Concentration factor Value',
            'CCM.STR. Speed - Mold avg Value', 'CCM.STR. Throughput Value']

In [5]:
# go through every column that has comma in the data
# replace comma with a dot, and convert to float
# apparently we have to use "regex" and "value" because
# we are trying to modify multiply columns at the same time
for column_name in fixables:
    df[column_name] = df[column_name].astype(str).replace(regex=",", value=".")
    df[column_name] = df[column_name].astype(float)

<b>FILTER BY ONE CAST (including n-amount of heats, which include m-amount of slabs)</b>

In [6]:
# filter by one cast. possible casts are 100 - 248
# example of a good cast => 100
# example of a bad cast => 125
df_single = df[df['CastNo'] == 125]

unique_heats = list(df_single['HeatNo_in_Cast'].unique())

# we're going to imitate this dictionary structure
# and replace the original heat numbers with easier numbering
# like 1, 2, 3, 4 etc.
replacement_dict = {}

for heat in unique_heats:
    index = unique_heats.index(heat)
    replacement_dict[heat] = index + 1

replacement_dict

df_single["HeatNo_in_Cast"] = df_single["HeatNo_in_Cast"].replace(replacement_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single["HeatNo_in_Cast"] = df_single["HeatNo_in_Cast"].replace(replacement_dict)


In [7]:
# let's build a new process number with this logic:
# cast number -> heat number -> slab number -> segment number
df_single['ProcessNo'] = df_single['HeatNo_in_Cast'].astype(str) + \
                         df_single['SlabNo_in_Heat'].astype(str).str.zfill(2) + \
                         df_single['Seg No'].astype(str).str.zfill(2)

# df_single['ProcessNo'] = df_single['ProcessNo'].astype(int)

df_single = df_single.reset_index()
df_single["indexno"] = df_single.index + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single['ProcessNo'] = df_single['HeatNo_in_Cast'].astype(str) + \


In [8]:
df.head(3)

Unnamed: 0,Material Type,End time,Route,Weight,Length,Thickness,Width,Grade,Quality Status,Seg No,...,CCM.MD. Level std_dev east Value,CCM.MD. Mold width Value,CCM.MIX. Concentration factor Value,CCM.STR. Speed - Mold avg Value,CCM.STR. Throughput Value,CCM.SLAB. Slab counter heat Value,CCM.SLAB. Slab counter sequence Value,HeatNo_in_Cast,SlabNo_in_Heat,CastNo
0,slab,2023-02-06 01:05:11,Route 1,25783,10.63,202.0,1539,720-1,ERROR,1,...,12.5,1539.0,0.0,0.5,1.18,1.0,1.0,2821,1,100.0
1,slab,2023-02-06 01:05:11,Route 1,25783,10.63,202.0,1539,720-1,ERROR,2,...,12.5,1539.0,0.0,0.54,1.28,1.0,1.0,2821,1,100.0
2,slab,2023-02-06 01:05:11,Route 1,25783,10.63,202.0,1539,720-1,ERROR,3,...,12.5,1539.0,0.0,0.56,1.33,1.0,1.0,2821,1,100.0


In [None]:
# cleaning up a few of the unneeded columns
droppables = ['Material Type', 'Route', 'Seg from', 'Seg to', 'CCM.TD. TD internals type Value', 'Width', 'Thickness', 'Length']
df = df.drop(droppables, axis=1)

KeyError: "['Route', 'Seg from', 'Seg to', 'CCM.TD. TD internals type Value'] not found in axis"

#### Cross tabulation, how are Stopper types and quality status distributed

In [10]:
# replace stopper types with integers in order to calculate correlations/phik-matrix
df['CCM.TD. Stopper Type Value'] = df['CCM.TD. Stopper Type Value'].map({'T1': 0, 'T2': 1})

In [11]:
# Crosstab of sum of values
crosstab_result = pd.crosstab(df['Quality Status'], df['CCM.TD. Stopper Type Value'])

# calculate the percentage (in full %-units) related to amount of ERROR/OK/WARNING related to total proportion of the particular stopper type
# this code is far from optimal or dynamic, but it "gets the job done"
crosstab_percentages = {
    "ERROR-T1": crosstab_result.at['ERROR', 0.0] / df[df['CCM.TD. Stopper Type Value'] == 0.0].count()['CCM.TD. Stopper Type Value'],
    "ERROR-T2": crosstab_result.at['ERROR', 1.0] / df[df['CCM.TD. Stopper Type Value'] == 01.0].count()['CCM.TD. Stopper Type Value'],
    "OK-T1": crosstab_result.at['OK', 0.0] / df[df['CCM.TD. Stopper Type Value'] == 0.0].count()['CCM.TD. Stopper Type Value'],
    "OK-T2": crosstab_result.at['OK', 1.0] / df[df['CCM.TD. Stopper Type Value'] == 01.0].count()['CCM.TD. Stopper Type Value'],
    "WARNING-T1": crosstab_result.at['WARNING', 0.0] / df[df['CCM.TD. Stopper Type Value'] == 0.0].count()['CCM.TD. Stopper Type Value'],
    "WARNING-T2": crosstab_result.at['WARNING', 1.0] / df[df['CCM.TD. Stopper Type Value'] == 01.0].count()['CCM.TD. Stopper Type Value']   
}

# let's print out the calculations
crosstab_percentages_df = pd.Series(crosstab_percentages).to_frame()
crosstab_percentages_df = crosstab_percentages_df * 100
crosstab_percentages_df

Unnamed: 0,0
ERROR-T1,6.612112
ERROR-T2,5.084022
OK-T1,78.085902
OK-T2,79.610269
WARNING-T1,15.301986
WARNING-T2,15.305709


#### Correlations and phik-matrix -values

In [12]:
df[['CCM.TD. Stopper Last Position Value', 'CCM.TD. Stopper Type Value']].corr(numeric_only=True)

Unnamed: 0,CCM.TD. Stopper Last Position Value,CCM.TD. Stopper Type Value
CCM.TD. Stopper Last Position Value,1.0,-0.022818
CCM.TD. Stopper Type Value,-0.022818,1.0


In [13]:
df[['CCM.TD. Stopper Last Position Value', 'CCM.TD. Stopper Type Value']].phik_matrix()

interval columns not set, guessing: ['CCM.TD. Stopper Last Position Value', 'CCM.TD. Stopper Type Value']


Unnamed: 0,CCM.TD. Stopper Last Position Value,CCM.TD. Stopper Type Value
CCM.TD. Stopper Last Position Value,1.0,0.21062
CCM.TD. Stopper Type Value,0.21062,1.0


In [14]:
df[['CCM.TD. Tundish inflow Value', 'CCM.TD. Stopper Type Value']].corr(numeric_only=True)

Unnamed: 0,CCM.TD. Tundish inflow Value,CCM.TD. Stopper Type Value
CCM.TD. Tundish inflow Value,1.0,-0.003713
CCM.TD. Stopper Type Value,-0.003713,1.0


In [15]:
df[['CCM.TD. Tundish inflow Value', 'CCM.TD. Stopper Type Value']].phik_matrix()

interval columns not set, guessing: ['CCM.TD. Tundish inflow Value', 'CCM.TD. Stopper Type Value']


Unnamed: 0,CCM.TD. Tundish inflow Value,CCM.TD. Stopper Type Value
CCM.TD. Tundish inflow Value,1.0,0.0
CCM.TD. Stopper Type Value,0.0,1.0


In [16]:
df[['CCM.TD. Steel weight in tundish Value', 'CCM.TD. Stopper Type Value']].phik_matrix()

interval columns not set, guessing: ['CCM.TD. Steel weight in tundish Value', 'CCM.TD. Stopper Type Value']


Unnamed: 0,CCM.TD. Steel weight in tundish Value,CCM.TD. Stopper Type Value
CCM.TD. Steel weight in tundish Value,1.0,0.082166
CCM.TD. Stopper Type Value,0.082166,1.0


In [17]:
df[['CCM.TD. Steel weight in tundish Value', 'CCM.TD. Stopper Type Value']].corr(numeric_only=True)

Unnamed: 0,CCM.TD. Steel weight in tundish Value,CCM.TD. Stopper Type Value
CCM.TD. Steel weight in tundish Value,1.0,0.01584
CCM.TD. Stopper Type Value,0.01584,1.0
