# Aufgabe 2 Wein

## Data Cleaning

In [15]:
%pip install regex
import pandas as pd
import numpy as np
import regex as re



Note: you may need to restart the kernel to use updated packages.


In [16]:
df = pd.read_csv('wein.csv')
df = df.dropna()
df = df.drop_duplicates()
# convert all columns to strings to handle leading zeros and punctuation
df = df.applymap(str)
# drop the punct at the end of the string/number
df = df.applymap(lambda x: x.rstrip('.') if isinstance(x, str) else x)
# drop the 0 in the beginning of the string/number
df = df.applymap(lambda x: re.sub(r'^0+(?=\d)','',x) if isinstance(x, str) else x)
# convert columns back to appropriate numeric types
df = df.apply(pd.to_numeric, errors='coerce')
print(df.dtypes)
df
#df.to_csv('wein_clean.csv', index=False)
df.head()



Alkohol                    float64
Apfelsaeure                float64
Asche                      float64
Aschen_Alkanitaet          float64
Magnesium                    int64
Alle_Phenole               float64
Flavanoide                 float64
Nichtflavanoide_Phenole    float64
Proanthocyanide            float64
Farbintensitaet            float64
Farbwert                   float64
Proteinwert                float64
Prolinwert                   int64
dtype: object



DataFrame.applymap has been deprecated. Use DataFrame.map instead.


DataFrame.applymap has been deprecated. Use DataFrame.map instead.


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



Unnamed: 0,Alkohol,Apfelsaeure,Asche,Aschen_Alkanitaet,Magnesium,Alle_Phenole,Flavanoide,Nichtflavanoide_Phenole,Proanthocyanide,Farbintensitaet,Farbwert,Proteinwert,Prolinwert
0,14.23,1.71,2.43,15.06,127,2.08,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.02,1.78,2.14,11.02,100,2.65,2.76,0.26,1.28,4.38,1.05,3.04,1050
2,13.16,2.36,2.67,18.06,101,2.08,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.05,16.08,113,3.85,3.49,0.24,2.18,7.08,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.08,2.69,0.39,1.82,4.32,1.04,2.93,735


In [17]:
#df.info()
#df.describe()

## Linear Regression

### Quality values of the regression (R^2 score)
The R² score, also known as the coefficient of determination, is a statistical measure that indicates how well the independent variables explain the variability of the dependent variable in a regression model. It provides an indication of the goodness of fit of the model.  
- R² = 1: Indicates that the regression model perfectly explains the variability of the dependent variable.
- R² = 0: Indicates that the regression model does not explain any of the variability of the dependent variable.
- R² < 0: In some cases, R² can be negative, indicating that the model is worse than a horizontal line (mean of the dependent variable).

In [18]:
#sns.pairplot(df, kind = 'scatter')

In [19]:
#sns.lmplot(x='Proanthocyanide', y='Flavanoide', data =df, scatter_kws={'alpha':0.3})

In [None]:
%pip install scikit-learn dash plotly

import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from dash import Dash, dcc, html, Input, Output
import plotly.express as px

data = df

# Regressionen berechnen
def calculate_regressions(data):
    '''
    for each pair of columns in the data, calculate the R² of a linear regression
    param data: a pandas DataFrame
    return: a pandas DataFrame with columns 'X', 'Y', 'R²'
    '''
    dic_list = [
        {'X': x, 'Y': y, 'R^2': r2_score(data[y], LinearRegression().fit(data[[x]], data[y]).predict(data[[x]]))}
        for x, y in combinations(data.columns, 2)
    ]
    return pd.DataFrame(dic_list)

regression_results = calculate_regressions(data)

# Dash App 
app = Dash(__name__)
app.layout = html.Div([
    html.H1("Lineare Regression", style={'textAlign': 'center'}),
    
    # Dropdown für die Auswahl von R^2
    dcc.Dropdown(
        id='r2-threshold-dropdown',
        options=[
            {'label': 'R² > 0.1', 'value': 0.1},
            {'label': 'R² > 0.3', 'value': 0.3},
            {'label': 'R² > 0.5', 'value': 0.5},
            {'label': 'R² > 0.7', 'value': 0.7}, 
        ],
        value=0.3,  
        style={'width': '50%'}
    ),
    
    # Dropdown for selecting regression pairs
    dcc.Dropdown(id='xy-pairs',value='Alle_Phenole,Flavanoide',placeholder="Wähle ein Paar"),
    
    # Graph für die  Regression
    dcc.Graph(id='regression-plot'),
    
    html.Div(id='regression-stats')
])


@app.callback(
    [Output('xy-pairs', 'options'), Output('regression-plot', 'figure'), Output('regression-stats', 'children')],
    Input('r2-threshold-dropdown', 'value'),
    Input('xy-pairs', 'value')
)
def update_content(r2_threshold, selected_pair):
    # Filter für den Dropdown nach R²
    filtered = regression_results[regression_results['R^2'] > r2_threshold]
    options = [{'label': f"{row['X']} und {row['Y']}", 'value': f"{row['X']},{row['Y']}"} for _, row in filtered.iterrows()]

    #default_option = {'label': 'Alle_Phenole und Flavonoide', 'value': 'Alle_Phenole,Flavanoide'}

    if not selected_pair:
        return options, {}, "ein Paar auswählen."
    
    x, y = selected_pair.split(',')
    X, y_vals = data[[x]].values, data[y].values
    model = LinearRegression().fit(X, y_vals)
    r2, slope, intercept = r2_score(y_vals, model.predict(X)), model.coef_[0], model.intercept_

    # Plot erstellen
    fig = px.scatter(x=X.flatten(), y=y_vals, labels={'x': x, 'y': y})
    fig.add_scatter(x=X.flatten(), y=model.predict(X), mode='lines', name='Regressionslinie')

    # Statistiken
    stats = f"R²: {r2:.4f}, Steigung: {slope:.4f}, Achsenabschnitt: {intercept:.4f}"
    return options, fig, stats


if __name__ == '__main__':
    app.run_server(debug=True)





Note: you may need to restart the kernel to use updated packages.
