In [268]:
import dash
from dash import html, dcc
import dash_bootstrap_components as dbc
import ollama
import plotly.express as px
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import io
import base64
import json
import numpy as np
import shutil
import os
from io import StringIO

## loading metadata files

In [269]:
assets_folder = "metadata"

json_file_path_user_ip = os.path.join(assets_folder, "user_inputs.json")
with open(json_file_path_user_ip, 'r') as f:
    user_inputs = json.load(f)

json_file_path_user_dtype = os.path.join(assets_folder, "user_data_types.json")
with open(json_file_path_user_dtype, 'r') as f:
    user_data_types = json.load(f)

json_file_path_prepocess_report_ip = os.path.join(assets_folder, "preprocessing_report_inputs.json")
with open(json_file_path_prepocess_report_ip,'r') as f:
    preprocessing_report_inputs = json.load(f)

json_file_path_prepocess_ip = os.path.join(assets_folder, "preprocessing_inputs.json")    
with open(json_file_path_prepocess_ip,'r') as f:
    preprocessing_inputs = json.load(f)

## Meta data for report

In [270]:
user_inputs

{'Target Column': 'Class variable',
 'Remove Features': 'None',
 'Ordinal Degree': [],
 'Binning Columns': 'None',
 'Training Data Size': '70%',
 'K-Fold': '5',
 'Machine Learning Task': 'Classification',
 'Data Types': {'Number of times pregnant': 'integer dtype',
  'Plasma glucose concentration': 'int64',
  'Diastolic blood pressure (mm Hg)': 'int64',
  'Triceps skin fold thickness (mm)': 'int64',
  '2-Hour serum insulin (mu U/ml)': 'int64',
  'Body mass index (weight in kg/(height in m)^2)': 'float64',
  'Diabetes pedigree function': 'float64',
  'Age (years)': 'int64',
  'Class variable': 'int64'}}

In [271]:
preprocessing_report_inputs

{'raw_shape': [768, 9],
 'raw_dtype': {'Number of times pregnant': 'int64',
  'Plasma glucose concentration': 'int64',
  'Diastolic blood pressure (mm Hg)': 'int64',
  'Triceps skin fold thickness (mm)': 'int64',
  '2-Hour serum insulin (mu U/ml)': 'int64',
  'Body mass index (weight in kg/(height in m)^2)': 'float64',
  'Diabetes pedigree function': 'float64',
  'Age (years)': 'int64',
  'Class variable': 'int64'},
 'Remove_duplicate_shape': [768, 9],
 'col_drop_70_na': [768, 9],
 'na_summary': {'Number of times pregnant': 0,
  'Plasma glucose concentration': 0,
  'Diastolic blood pressure (mm Hg)': 0,
  'Triceps skin fold thickness (mm)': 0,
  '2-Hour serum insulin (mu U/ml)': 0,
  'Body mass index (weight in kg/(height in m)^2)': 0,
  'Diabetes pedigree function': 0,
  'Age (years)': 0,
  'Class variable': 0},
 'impute_graph_data': None,
 'impute_method': 'mean',
 'norm_report': {'scale_ratio': 347.823894725702,
  'range_ratio': 361.22971818958155,
  'mean_std_ratio': 256.1996241732

In [272]:
preprocessing_inputs

{'transformation_req': True,
 'normalization_req': True,
 'imput_method': 'mean',
 'outliers_req': False,
 'feature_sele_req': False,
 'multicollinearity_req': False,
 'target_imbalance_req': False,
 'transform_target_req': None}

## loading pickel file

In [273]:
assets_folder = "data"
pkl_file_path_raw = os.path.join(assets_folder, "df_dtype_converted.pkl")
df_raw = pd.read_pickle(pkl_file_path_raw)

In [274]:
final_data_types = df_raw.dtypes.to_dict()

## Correlation meta data

In [275]:
# Correlation

correlation_matrix = df_raw.drop(columns=[user_inputs['Target Column']]).select_dtypes(include = ['number']).corr()
np.fill_diagonal(correlation_matrix.values, 0)

In [276]:
pd.DataFrame(preprocessing_report_inputs['outliers_count'])

Unnamed: 0,Feature,Lower_count,Upper_count
0,Number of times pregnant,0,4
1,Plasma glucose concentration,5,0
2,Diastolic blood pressure (mm Hg),38,7
3,Triceps skin fold thickness (mm),0,1
4,2-Hour serum insulin (mu U/ml),0,34
5,Body mass index (weight in kg/(height in m)^2),11,8
6,Diabetes pedigree function,0,29
7,Age (years),0,9


## Llama2 Summary Generation

In [277]:
# Function to generate specific prompts for each stage
def generate_explanation(stage):
    prompts = {
        "Data Set Overview": f"Provide summary of data set which is {final_data_types} like mention feature,datatype, description with one example {df_raw.sample().to_dict(orient='records')}",
        #"Data Set Overview": f"Provide a summary of the dataset '{data}'. Include a list of features along with their data types and a brief description of each. Also, include a sample record from the dataset, for example: {df_raw.sample().to_dict(orient='records')}",
        "Dataset Shape": f"provide summary of Initial dataset shape: {preprocessing_report_inputs['raw_shape']}. After removing duplicate rows: {preprocessing_report_inputs['Remove_duplicate_shape']}. After column validation: {preprocessing_report_inputs['col_drop_70_na']}",
        "Missing_value":f"Provide detailed missing values summary of features based on {preprocessing_report_inputs['na_summary']}, if all values are zero mention no missing values",
        "Normalization_and_Transformation":(f"Provide a detailed justification for both feature transformation and normalization in the preprocessing pipeline. "
                                                                                 f"Feature Normalization is {preprocessing_inputs['normalization_req']}.Provide justication based on this test {preprocessing_report_inputs['norm_report']} "
                                                                                 f"Feature transformation is {preprocessing_inputs['transformation_req']}.Provide justication based on this test {preprocessing_report_inputs['transform_report']}."),
        "Multicollinearity":f"Multicollinearity is {preprocessing_inputs['multicollinearity_req']}.Provide Justification based on correlation matrix {correlation_matrix}. Identify any pairs of features with a correlation coefficient greater than 0.9. If such pairs exist, clearly state 'High multicollinearity detected' else 'not detected'.",
        "Outliers" : f"Provide Summary of Outliers in the dataset for each feature based on {preprocessing_report_inputs['outliers_count']}",
        "Feature_select" : f"""Feature Selection is {preprocessing_inputs['feature_sele_req']}. 

                                ### **Decision Rule**
                                If the number of columns in the dataset is **greater than 15** ({df_raw.shape[1]} > 15), 
                                then **Feature Selection is Required**.
                                Otherwise, Feature Selection **is NOT required**.

                                ### **📌 Conclusion**
                                - **Feature Selection Required**: {preprocessing_inputs['feature_sele_req']}
                                - **Dataset Columns**: {df_raw.shape[1]}

                                **Do NOT assume missing data or additional conditions. Only use the provided dataset.**""",
        #"Target Normality": f"Target Transformation is {transform_target_req} explain based on Shapiro-Wilk statistic{target_stat} and p value {target_p}.",
        "Target Normality": f"""You are a professional data scientist. 
                                Target Transfirnation is {preprocessing_inputs['transform_target_req']} Your task is to provide justification based on test results .

                                ### **📊 Target Transformation Analysis**
                                - **Transformation Required**: {preprocessing_inputs['transform_target_req']}
                                - **Shapiro-Wilk Test Statistic**: {preprocessing_report_inputs['target_transform_report']}

                                ### **🛠 Interpretation**
                                - Explain about Test 
                                - A **p-value < 0.05** suggests the target variable is **not normally distributed**.
                                - If **transformation is required**

                                ### **✅ Recommended Action**
                                Conclude,
                                - If {preprocessing_inputs['transform_target_req']} is **True**, apply an appropriate transformation.
                                - If {preprocessing_inputs['transform_target_req']} is **False**, proceed without transformation.

                                **(Limit response to 150 words)**""",

        "Target_imbalance" : f"Target Imbalance is {preprocessing_inputs['target_imbalance_req']} explain based on the taget value counts {df_raw[user_inputs['Target Column']].value_counts()}.",
        "numeric_imput" : f"Numeric Imputation is {preprocessing_report_inputs['impute_method']}.Provide justification based on {preprocessing_report_inputs['impute_graph_data']} "
        #"Feature Distribution": f"Feature skewness and kurtosis values are: {skewness_kurtosis}. Explain how skewness affects feature distributions and when transformations are necessary."
    }
    
    
    #messages = [
    #{"role": "system", "content": "You are a professional data scientist"},
    #{"role": "user", "content": prompts[stage]}
    #]
    #response = ollama.chat(model="llama2",llama2:chat messages=messages)

    response = ollama.chat(model="llama2:chat", messages=[{"role": "user", "content": prompts[stage]}])
    
    return response['message']['content']

In [278]:
# Generate explanations for each stage
explanations = {
    "Data Set Overview": generate_explanation("Data Set Overview"),
    "Dataset Shape": generate_explanation("Dataset Shape"),
    "Missing_value": generate_explanation("Missing_value"),
    "Normalization_and_Transformation": generate_explanation("Normalization_and_Transformation"),
    "Multicollinearity": generate_explanation("Multicollinearity"),
    "Outliers" : generate_explanation("Outliers"),
    "Feature_select":generate_explanation("Feature_select"),
    "Target Normality": generate_explanation("Target Normality"),
    "Target_imbalance" : generate_explanation("Target_imbalance"),
    "numeric_imput" : generate_explanation("numeric_imput"),
    #"Feature Distribution": generate_explanation("Feature Distribution", skewness_kurtosis)
}

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[267], line 150, in update_outlier_plot(x='sex', y='age')
    143 @app.callback(
    144     dash.dependencies.Output("outlier_plot", "figure"), 
    145     [dash.dependencies.Input("x-axis", "value"), 
   (...)
    148 def update_outlier_plot(x, y):
    149     # Create a box plot with categorical x-axis and numeric y-axis
--> 150     fig = px.box(df_raw, x=x, y=y,notched=True,points='all',title=f"Box Plot: {x} vs {y}")
        fig = Figure({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'Class variable=%{marker.color}<br>Count=%{y}<extra></extra>',
              'legendgroup': '0',
              'marker': {'color': array([0]), 'coloraxis': 'coloraxis', 'pattern': {'shape': ''}},
              'name': '0',
              'offsetgroup': '0',
              'orientation': 'v',
              'showlegend': Tr





In [279]:
features = df_raw.drop(columns=[user_inputs['Target Column']]).select_dtypes(include=['number']).columns
# Dropdown for selecting feature to visualize
feature_dropdown = dcc.Dropdown(
    id='feature-dropdown',
    options=[{'label': col, 'value': col} for col in features],
    value=features[0] if len(features) > 0 else None,
    clearable=False
)

## conditional web content

In [280]:
categorical_cols = list(df_raw.select_dtypes(include='category').columns)
numeric_cols = list(df_raw.select_dtypes(include='number').columns)

# Build outlier controls based on available columns
if categorical_cols:
    outlier_controls = html.Div([
        html.P("x-axis (Category):"),
        dcc.RadioItems(
            id='x-axis',
            options=[{'label': col, 'value': col} for col in categorical_cols],
            value=categorical_cols[0],
            inline=True
        ),
        html.P("y-axis (Numeric):"),
        dcc.RadioItems(
            id='y-axis',
            options=[{'label': col, 'value': col} for col in numeric_cols],
            value=numeric_cols[0],
            inline=True
        )
    ])
else:
    outlier_controls = html.Div([
        html.P("Select Numeric Column:"),
        dcc.Dropdown(
            id='single-numeric',
            options=[{'label': col, 'value': col} for col in numeric_cols],
            value=numeric_cols[0] if numeric_cols else None,
            clearable=False
        )
    ])

In [281]:
# Target Transform

target_transform = (dbc.Row([
        html.H4("Traget Transformation"),
        dcc.Graph(id ='Target_transform'),
        dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
        dbc.Row([dbc.Col(dcc.Markdown(explanations["Target Normality"])),])
    ]) if user_inputs['Machine Learning Task'] == 'Regression' else None )

In [282]:
# Target Imbalance(Classification)

target_imbalance = (dbc.Row([
        html.H4("Traget Imbalance"),
        dcc.Graph(id ='Target_imbalance'),
        dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
        dbc.Row([dbc.Col(dcc.Markdown(explanations["Target_imbalance"])),])
    ]) if user_inputs['Machine Learning Task'] == 'Classification' else None )

In [283]:
# Imutation Method

numeric_imputation = (dbc.Row([
        html.H4("Numeric Imputations"),
        dcc.Graph(id ='numeric_imput'),
        dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
        dbc.Row([dbc.Col(dcc.Markdown(explanations["numeric_imput"])),])
    ]) if df_raw.select_dtypes(include='number').isna().any().any() else None )

## dash App

In [284]:
# Dash Layout
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H2(f"ML Data Preprocessing Report - {user_inputs['Machine Learning Task']}" ), className="text-center my-3")),
    
    dbc.Row([html.H4("Data Set Overview"), dbc.Col(dcc.Markdown(explanations["Data Set Overview"])),]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["Dataset Shape"])),]),
    dbc.Row([
        dbc.Col(html.H4("Missing Values"))]),
    dbc.Row(
        dbc.Col(dcc.Graph(id = 'missing_value'))
        ),
    dcc.Interval(id='interval-component', interval=1*1000, n_intervals=0, max_intervals=1),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["Missing_value"])),]),
    numeric_imputation,
     # Feature Distribution Section
    dbc.Row(
        dbc.Col(html.H4("Feature Distribution"))
    ),
    dbc.Row(
        dbc.Col(feature_dropdown)
    ),
    dbc.Row(
        dbc.Col(dcc.Graph(id='feature-distribution'))
    ),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["Normalization_and_Transformation"])),]),
    dbc.Row([
        dbc.Col(html.H4("Multicolinearity"))]),
    dbc.Row(
        dbc.Col(dcc.Graph(id = 'feature-correlation'))
        ),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["Multicollinearity"])),]),
    
    dbc.Row([
        html.H4("Analysis of Outliers"),
        outlier_controls,
        dcc.Graph(id="outlier_plot")
    ]),
    dbc.Row([dbc.Col(dcc.Markdown(explanations["Outliers"])),]),      
    dbc.Row([html.H4("Feature Selection"), dbc.Col(dcc.Markdown(explanations["Feature_select"])),]),
    target_transform,
    #dbc.Row([dbc.Col(dcc.Markdown(explanations["Target Normality"])),]),
    target_imbalance,    
])       
             


# Callback for updating histogram
@app.callback(    
    dash.dependencies.Output('missing_value','figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

#def update_histogram(selected_feature):
#    fig = px.histogram(df_raw, x=selected_feature, nbins=20, title=f"{selected_feature} Distribution")
#    return fig


def update_missing_value(n_intervals):
    # Create the missing values matrix plot using Missingno and Matplotlib
    fig, ax = plt.subplots(figsize=(8, 4))
    msno.matrix(df_raw, ax=ax)
    
    # Save the plot to a buffer as a PNG image
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close(fig)  # Free memory
    buf.seek(0)
    
    # Encode the image in base64
    img_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    
    # Create a Plotly figure that embeds the image
    plotly_fig = {
        "data": [{
            "x": [0],
            
            "y": [0],
            "type": "scatter",
            "mode": "markers",
            "marker": {"opacity": 0}  # Invisible trace as a placeholder
        }],
        "layout": {
            "images": [{
                "source": "data:image/png;base64," + img_base64,
                "xref": "paper",
                "yref": "paper",
                "x": 0.5,
                "y": 0.5,
                "sizex": 1,
                "sizey": 1,
                "xanchor": "center",
                "yanchor": "middle"
            }],
            "xaxis": {"visible": False},
            "yaxis": {"visible": False},
            "margin": {"l": 0, "r": 0, "t": 0, "b": 0},
            "height": 400
        }
    }
    return plotly_fig

# Callback for updating the feature distribution plot with KDE overlay
@app.callback(
    dash.dependencies.Output('feature-distribution', 'figure'),
    [dash.dependencies.Input('feature-dropdown', 'value')]
)
def update_feature_distribution(selected_feature):
    # Prepare the data: drop missing values for the selected feature
    feature_data = [df_raw[selected_feature].dropna().tolist()]
    group_labels = [selected_feature]
    
    # Create a distribution plot (histogram with KDE)
    fig = ff.create_distplot(feature_data, group_labels, show_hist=True, show_rug=False)
    fig.update_layout(
        title=f"Distribution of {selected_feature}",
        xaxis_title=selected_feature,
        yaxis_title="Density"
    )
    return fig

@app.callback(
    dash.dependencies.Output('feature-correlation', 'figure'),
    [dash.dependencies.Input('interval-component', 'n_intervals')]
)

def feature_correlation(n_intervals):
    
    correlation_matrix = df_raw.drop(columns=[user_inputs['Target Column']]).select_dtypes(include = ['number']).corr()

    # Create an interactive heatmap of the correlation matrix
    fig = px.imshow(
        correlation_matrix,
        text_auto=True,  # Automatically display correlation coefficients
        color_continuous_scale='RdBu_r',  # Diverging color scale (red-blue reversed)
        title="Correlation Matrix for Features")

    fig.update_layout(width=800, height=800)
    
    return fig

if categorical_cols:
    @app.callback(
        dash.dependencies.Output("outlier_plot", "figure"), 
        [dash.dependencies.Input("x-axis", "value"), 
         dash.dependencies.Input("y-axis", "value")]
    )
    def update_outlier_plot(x, y):
        # Create a box plot with categorical x-axis and numeric y-axis
        fig = px.box(df_raw, x=x, y=y,notched=True,points='all',title=f"Box Plot: {x} vs {y}")
        fig.update_layout(height=600)
        return fig
else:
    @app.callback(
        dash.dependencies.Output("outlier_plot", "figure"), 
        [dash.dependencies.Input("single-numeric", "value")]
    )
    def update_outlier_plot(single_numeric):
        # When no categorical columns are available, show the distribution of the numeric column
        fig = px.box(df_raw, y=single_numeric,notched=True,points='all', title=f"Box Plot of {single_numeric}")
        fig.update_layout(height=600)
        return fig

if user_inputs['Machine Learning Task'] == 'Regression':
    
    @app.callback(
        dash.dependencies.Output('Target_transform', 'figure'),
        [dash.dependencies.Input('interval-component', 'n_intervals')]
    )

    def Target_transform(n_intervals):
            
        data = df_raw[user_inputs['Target Column']].dropna().tolist()

        fig =  px.histogram(x = data,marginal = 'box',
                            title=f"Target ({user_inputs['Target Column']}) Distribution", 
                            labels={'x': user_inputs['Target Column']})
        fig.update_layout(height = 600)
        return fig
    
elif user_inputs['Machine Learning Task'] == 'Classification':
    @app.callback(
        dash.dependencies.Output('Target_imbalance', 'figure'),
        [dash.dependencies.Input('interval-component', 'n_intervals')]
    )
    
    def target_imbalance(n_intervals):
        target_col = user_inputs['Target Column']

        # Aggregate the counts of each target class
        counts = df_raw[target_col].value_counts().reset_index()
        counts.columns = [target_col, 'Count']

        # Create a bar chart using the aggregated data
        fig = px.bar(counts,
                    x=target_col,
                    y='Count',
                    pattern_shape=target_col,
                    color=target_col,
                    title="Target Class Imbalance Check")
        return fig
        
if df_raw.select_dtypes(include='number').isna().any().any():
    @app.callback(
        dash.dependencies.Output('numeric_imput', 'figure'),
        [dash.dependencies.Input('interval-component', 'n_intervals')]
    )
    
    def numeric_imput(n_intervals):
        
        # Given string data with imputation methods and RMSE values
        data_str = preprocessing_report_inputs['impute_graph_data']

        # Strip any leading/trailing whitespace/newlines
        data_str = data_str.strip()

        # Convert the string to a DataFrame using a regex separator (2 or more spaces) and treat the first column as the index
        df = pd.read_csv(StringIO(data_str), sep=r'\s{2,}', engine='python', index_col=0)

        # Reset index to drop the original index column
        df.reset_index(drop=True, inplace=True)

        # Optional: inspect the DataFrame
        print(df)

        # Create a bar chart using Plotly Express
        fig = px.bar(df, 
                    x='Imputation Method', 
                    y='RMSE', 
                    title='RMSE by Imputation Method',
                    text='RMSE')

        # Format the text on the bars
        fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
        fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
        
        return fig
        
    
# Run App
if __name__ == "__main__":
    app.run_server(debug=True,port = 8085)


---------------------------------------------------------------------------
DuplicateIdError                          Traceback (most recent call last)
DuplicateIdError: Duplicate component id found in the initial layout: `interval-component`





















