# Prozessoptimierung

## Beispiel: Jitter von CDs und Discs

In [1]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from patsy.contrasts import Treatment, Sum
from statsmodels.stats.anova import anova_lm
# 2^3 full factorial design (factors Aa, Bb, Cc at levels -1/+1), replicated
# twice for 16 runs: Aa flips every run, Bb every 2 runs, Cc every 4 runs.
df = pd.DataFrame(
    {
        "Aa": np.tile([-1, 1], 8),
        "Bb": np.tile(np.repeat([-1, 1], 2), 4),
        "Cc": np.tile(np.repeat([-1, 1], 4), 2),
    },
    dtype="category",  # factor levels are stored as pandas categoricals
)
# Response values, one per run, in the row order of the design above.
df["Yy"] = np.array(
    [34, 26, 33, 21, 24, 23, 19, 18,
     40, 29, 35, 22, 23, 22, 18, 18]
)
print(df)

   Aa Bb Cc  Yy
0  -1 -1 -1  34
1   1 -1 -1  26
2  -1  1 -1  33
3   1  1 -1  21
4  -1 -1  1  24
5   1 -1  1  23
6  -1  1  1  19
7   1  1  1  18
8  -1 -1 -1  40
9   1 -1 -1  29
10 -1  1 -1  35
11  1  1 -1  22
12 -1 -1  1  23
13  1 -1  1  22
14 -1  1  1  18
15  1  1  1  18


In [2]:
# Full 2^3 factorial model: main effects and all interactions of Aa, Bb, Cc,
# each factor coded with the Sum contrast.
fit = ols(
    formula="Yy ~ C(Aa, Sum)*C(Bb, Sum)*C(Cc, Sum)",
    data=df,
).fit()
# Estimated coefficients (shown as the cell result).
fit.params

Intercept                                             25.3125
C(Aa, Sum)[S.-1]                                       2.9375
C(Bb, Sum)[S.-1]                                       2.3125
C(Cc, Sum)[S.-1]                                       4.6875
C(Aa, Sum)[S.-1]:C(Bb, Sum)[S.-1]                     -0.3125
C(Aa, Sum)[S.-1]:C(Cc, Sum)[S.-1]                      2.5625
C(Bb, Sum)[S.-1]:C(Cc, Sum)[S.-1]                     -0.0625
C(Aa, Sum)[S.-1]:C(Bb, Sum)[S.-1]:C(Cc, Sum)[S.-1]    -0.4375
dtype: float64

In [3]:
# The response column df["Yy"] is already populated where df is built, so it
# is not re-assigned here: keeping a single copy of the measurements prevents
# two hand-typed literals from silently drifting apart.
fit = ols("Yy ~ C(Aa, Sum)*C(Bb, Sum)*C(Cc, Sum)", data=df).fit()
# ANOVA table for the full factorial model (shown as the cell result).
anova_lm(fit)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
"C(Aa, Sum)",1.0,138.0625,138.0625,41.679245,0.000197
"C(Bb, Sum)",1.0,85.5625,85.5625,25.830189,0.00095
"C(Cc, Sum)",1.0,351.5625,351.5625,106.132075,7e-06
"C(Aa, Sum):C(Bb, Sum)",1.0,1.5625,1.5625,0.471698,0.51162
"C(Aa, Sum):C(Cc, Sum)",1.0,105.0625,105.0625,31.716981,0.000492
"C(Bb, Sum):C(Cc, Sum)",1.0,0.0625,0.0625,0.018868,0.89414
"C(Aa, Sum):C(Bb, Sum):C(Cc, Sum)",1.0,3.0625,3.0625,0.924528,0.364446
Residual,8.0,26.5,3.3125,,


Haupteffekte A, B und C sowie der Wechselwirkungseffekt A:C sind signifikant (p < 0.05).

In [4]:
# Show the coefficients of the full factorial model again, next to their
# interpretation below. `fit` still holds exactly this model from the
# preceding ANOVA cell, so it is reused instead of being fitted a third time.
fit.params

Intercept                                             25.3125
C(Aa, Sum)[S.-1]                                       2.9375
C(Bb, Sum)[S.-1]                                       2.3125
C(Cc, Sum)[S.-1]                                       4.6875
C(Aa, Sum)[S.-1]:C(Bb, Sum)[S.-1]                     -0.3125
C(Aa, Sum)[S.-1]:C(Cc, Sum)[S.-1]                      2.5625
C(Bb, Sum)[S.-1]:C(Cc, Sum)[S.-1]                     -0.0625
C(Aa, Sum)[S.-1]:C(Bb, Sum)[S.-1]:C(Cc, Sum)[S.-1]    -0.4375
dtype: float64

Haupteffekt von A auf Stufe „tief“ ist also 2.9375  
Haupteffekt von A auf Stufe „hoch“ ist −2.9375  

## Beispiel: Reaktionsanalyse

In [5]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from patsy.contrasts import Treatment, Sum
from statsmodels.stats.anova import anova_lm
# Reaction data in natural units, one row per run (x1, x2, y):
# four corner runs followed by two centre runs.
reakt = pd.DataFrame(
    np.array([
        [120, 50, 52],
        [160, 50, 62],
        [120, 70, 60],
        [160, 70, 70],
        [140, 60, 63],
        [140, 60, 65],
    ]),
    columns=["x1", "x2", "y"],
)
# First-order least-squares fit of y on x1 and x2 (no interaction term).
fit = ols(formula="y~x1+x2", data=reakt).fit()
# Intercept and slopes in the units of the predictors (shown as cell result).
fit.params

Intercept    3.00
x1           0.25
x2           0.40
dtype: float64

Mit kodierten Variablen:

In [6]:
# Same six runs with the predictors in coded units (-1 / 0 / +1),
# one row per run (x1, x2, y).
reakt = pd.DataFrame(
    np.array([
        [-1, -1, 52],
        [1, -1, 62],
        [-1, 1, 60],
        [1, 1, 70],
        [0, 0, 63],
        [0, 0, 65],
    ]),
    columns=["x1", "x2", "y"],
)
# First-order least-squares fit of y on the coded predictors x1 and x2.
fit = ols(formula="y~x1+x2", data=reakt).fit()
# Coefficients in coded units (shown as the cell result).
fit.params

Intercept    62.0
x1            5.0
x2            4.0
dtype: float64

In [7]:
# Second design, one row per run (Zeit, Temp, y): four corner runs,
# four axial runs (one factor held at its centre value) and one centre run.
reakt = pd.DataFrame(
    np.array([
        [80, 195, 78],
        [80, 235, 76],
        [100, 195, 72],
        [100, 235, 75],
        [90, 187, 74],
        [90, 243, 76],
        [76, 215, 77],
        [104, 215, 72],
        [90, 215, 80],
    ]),
    columns=["Zeit", "Temp", "y"],
)
# Second-order model: linear terms, both squared terms and the Zeit*Temp
# product; I(...) makes the formula parser evaluate the arithmetic literally.
fit = ols(
    formula="y~Zeit+Temp+I(Temp*Temp)+I(Zeit*Zeit)+I(Zeit*Temp)",
    data=reakt,
).fit()
# Fitted coefficients (shown as the cell result).
fit.params

Intercept        -277.587423
Zeit                3.229716
Temp                2.024201
I(Temp * Temp)     -0.005960
I(Zeit * Zeit)     -0.026390
I(Zeit * Temp)      0.006250
dtype: float64

In [8]:
# Example: mice -- 16-run two-level design with six factors.

anti = pd.DataFrame(
    {
        # Four factors laid out in standard order (together a full 2^4 grid).
        'Anzahl': np.tile([-1, 1], 8),
        'VolPrs': np.tile(np.repeat([-1, 1], 2), 4),
        'Zeit': np.tile(np.repeat([-1, 1], 4), 2),
        'RadDos': np.repeat([-1, 1], 8),
        # Two further factors listed run by run. In this table Entw equals the
        # product Anzahl*VolPrs*Zeit and Inj2 equals VolPrs*Zeit*RadDos.
        'Entw': [-1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1],
        'Inj2': [-1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1],
    },
    dtype='category',  # factor levels are stored as pandas categoricals
)

# Response for each of the 16 runs, in the row order of the design.
anti['Yy'] = np.array(
    [70, 150, 34, 32, 137.5, 56, 123, 225,
     50, 2.7, 1.2, 12, 90, 2.1, 4, 15]
)

In [9]:
# Main-effects-only model on four of the six factors (Anzahl and VolPrs are
# not part of the formula), each coded with the Sum contrast.
fit = ols(
    formula='Yy~C(Zeit,Sum)+C(RadDos,Sum)+C(Entw,Sum)+C(Inj2,Sum)',
    data=anti,
).fit()
# Print the ANOVA table of that fit as plain text.
print(anova_lm(fit))

                  df        sum_sq       mean_sq          F    PR(>F)
C(Zeit, Sum)     1.0   5651.280625   5651.280625   3.102899  0.105885
C(RadDos, Sum)   1.0  26446.890625  26446.890625  14.520961  0.002890
C(Entw, Sum)     1.0   5863.730625   5863.730625   3.219547  0.100263
C(Inj2, Sum)     1.0   7314.525625   7314.525625   4.016122  0.070317
Residual        11.0  20034.196875   1821.290625        NaN       NaN


# Bsp Süssgetränke

In [4]:
# gegeben

import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from patsy.contrasts import Treatment, Sum 
from statsmodels.stats.anova import anova_lm

# 2^4 full factorial design (factors Aa, Bb, Cc, Dd at levels -1/+1) in
# standard order, replicated twice for 32 runs.
df = pd.DataFrame(
    {
        'Aa': np.tile([-1, 1], 16),
        'Bb': np.tile(np.repeat([-1, 1], 2), 8),
        'Cc': np.tile(np.repeat([-1, 1], 4), 4),
        'Dd': np.tile(np.repeat([-1, 1], 8), 2),
    },
    dtype='category',  # factor levels are stored as pandas categoricals
)

# Response per run: first 16 values are the first pass through the design,
# the remaining 16 the second pass.
df['Yy'] = np.array(
    [159, 168, 158, 166, 175, 179, 173, 179,
     164, 187, 163, 185, 168, 197, 170, 194,
     163, 175, 163, 168, 178, 183, 168, 182,
     159, 189, 159, 191, 174, 199, 174, 198]
)

# print(df)

### Fragestellung 1) Gibt es Variablen, die nicht relevant für die Gesamtpunktzahl sind (ohne Interaktion)?

In [5]:
# Additive model: the four main effects only (terms joined with `+`),
# each factor coded with the Sum contrast.
fit = ols(
    formula="Yy ~ C(Aa, Sum)+C(Bb, Sum)+C(Cc, Sum)+C(Dd, Sum)",
    data=df,
).fit()
# ANOVA table of the additive model (shown as the cell result).
anova_lm(fit)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
"C(Aa, Sum)",1.0,2312.0,2312.0,71.290792,4.687651e-09
"C(Bb, Sum)",1.0,21.125,21.125,0.651392,0.4266685
"C(Cc, Sum)",1.0,946.125,946.125,29.173876,1.038119e-05
"C(Dd, Sum)",1.0,561.125,561.125,17.302355,0.0002897778
Residual,27.0,875.625,32.430556,,


Wir werten nur die Haupteffekte der Variablen aus (ohne Interaktionen); dazu verwenden wir `+` in der Modellformel.  
Die Auswertung zeigt, dass Aa, Cc und Dd hoch signifikant sind. Bb scheint keinen signifikanten Einfluss zu haben. 

### Fragestellung 2) b) Gibt es statistisch signifikante Interaktionen? Können Sie diese begründen?

In [6]:
# Full factorial model: joining the factors with `*` adds every interaction
# among Aa, Bb, Cc and Dd on top of the main effects.
fit = ols(
    formula='Yy ~ C(Aa, Sum)*C(Bb, Sum)*C(Cc, Sum)*C(Dd, Sum)',
    data=df,
).fit()
# ANOVA table of the full model (shown as the cell result).
anova_lm(fit)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
"C(Aa, Sum)",1.0,2312.0,2312.0,241.7778,4.450669e-11
"C(Bb, Sum)",1.0,21.125,21.125,2.20915,0.1566329
"C(Cc, Sum)",1.0,946.125,946.125,98.94118,2.957845e-08
"C(Dd, Sum)",1.0,561.125,561.125,58.67974,9.692188e-07
"C(Aa, Sum):C(Bb, Sum)",1.0,0.125,0.125,0.0130719,0.9103968
"C(Aa, Sum):C(Cc, Sum)",1.0,3.125,3.125,0.3267974,0.5754947
"C(Bb, Sum):C(Cc, Sum)",1.0,0.5,0.5,0.05228758,0.8220265
"C(Aa, Sum):C(Dd, Sum)",1.0,666.125,666.125,69.66013,3.186635e-07
"C(Bb, Sum):C(Dd, Sum)",1.0,12.5,12.5,1.30719,0.2697232
"C(Cc, Sum):C(Dd, Sum)",1.0,12.5,12.5,1.30719,0.2697232


Wir werten nun die Interaktion aus, somit verwenden wir das * statt +.   
Die Variablen A (Art des Süssungsmittels) und D zeigen eine hochsignifikante Interaktion (A:D, p ≈ 3.2e-07); die Interaktion A:C (Süssungsmittel × Kohlensäuregehalt) ist dagegen nicht signifikant (p ≈ 0.58). Es scheint, dass eine Kombination von Süssungsmittel und Faktor D einen sehr grossen positiven Einfluss auf den Geschmack hat und eine andere einen sehr grossen negativen. 