Visual Python is a GUI-based Python code generator, developed as an extension for JupyterLab, Jupyter Notebook, and Google Colab. You can also use Visual Python through Visual Python Desktop.
Visual Python is an open-source project started to help students who struggle with coding in Python classes for data science.

In [2]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
# Load the outcomes CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv("../../data/yfv/YF_outcomes.csv")
# Last expression of the cell, so the notebook renders it: head() previews the first rows.
df.head()

Unnamed: 0,patient_id,age,sex,race,ethnicity,outcome
0,0A#D,42,F,White,Not Hispanic or Latino,0
1,0A1B,44,M,White,Not Hispanic or Latino,2
2,0A5F,38,F,White,Not Hispanic or Latino,1
3,0A7B,64,F,White,Not Hispanic or Latino,2
4,0A9B,47,M,White,Not Hispanic or Latino,1


In [4]:
# Visual Python: Data Analysis > File
# Reads the same CSV path as the previous cell and rebinds `df` to the fresh frame.
df = pd.read_csv('../../data/yfv/YF_outcomes.csv')
# Bare name as the cell's last expression: the notebook displays the frame.
df

Unnamed: 0,patient_id,age,sex,race,ethnicity,outcome
0,0A#D,42,F,White,Not Hispanic or Latino,0
1,0A1B,44,M,White,Not Hispanic or Latino,2
2,0A5F,38,F,White,Not Hispanic or Latino,1
3,0A7B,64,F,White,Not Hispanic or Latino,2
4,0A9B,47,M,White,Not Hispanic or Latino,1
...,...,...,...,...,...,...
25582,SxyK,26,M,White,Not Hispanic or Latino,0
25583,SxyL,46,M,White,Not Hispanic or Latino,0
25584,SxyM,50,F,White,Not Hispanic or Latino,0
25585,SxyN,56,M,White,Not Hispanic or Latino,0


In [5]:
# Visual Python: Correlation Analysis
def vp_confidence_interval_corr(x, y, method='pearson', alpha=0.05):
    """Return a correlation coefficient, its p-value and a confidence interval.

    The interval is built on the Fisher z scale: the coefficient is mapped
    with arctanh, a normal-quantile margin is subtracted/added, and both
    ends are mapped back with tanh.

    Parameters
    ----------
    x, y : array-like
        Paired observations; each is converted with ``pd.Series``.
    method : {'pearson', 'spearman', 'kendall'}, default 'pearson'
        Selects ``stats.pearsonr``, ``stats.spearmanr`` or
        ``stats.kendalltau``.
    alpha : float, default 0.05
        Two-sided significance level; the interval covers ``1 - alpha``.

    Returns
    -------
    tuple or float
        ``(corr, pvalue, corr_lower, corr_upper)``. Both bounds are NaN when
        the sample size does not exceed the method's offset (3 for Pearson
        and Spearman, 4 for Kendall), because the standard error is then
        undefined. ``np.nan`` alone is returned when ``x`` or ``y`` cannot
        be converted to a Series.

    Raises
    ------
    KeyError
        If ``method`` is not one of the three supported names.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having already run ``from scipy import stats``.
    from scipy import stats

    try:
        x = pd.Series(x)
        y = pd.Series(y)
    except Exception:
        # Best-effort contract kept from the original, but without trapping
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` does.
        return np.nan

    corr_func = {'pearson': stats.pearsonr,
                 'spearman': stats.spearmanr,
                 'kendall': stats.kendalltau}
    # Amount subtracted from the sample size in the standard-error denominator.
    se_diff = {'pearson': 3, 'spearman': 3, 'kendall': 4}
    # Numerator of the squared standard error on the z scale, per method.
    se_func = {'pearson': lambda corr: 1,
               'spearman': lambda corr: 1 + corr ** 2 / 2.,
               'kendall': lambda corr: .437}

    corr, pvalue = corr_func[method](x, y)

    dof = x.size - se_diff[method]
    if dof <= 0:
        # Too few observations: the original either divided by zero or took
        # the square root of a negative number here.
        return corr, pvalue, np.nan, np.nan

    # arctanh(r) == log((1 + r) / (1 - r)) / 2. It yields +/-inf at |r| == 1,
    # which tanh maps back to +/-1 instead of the NaN produced by inf / inf.
    with np.errstate(divide='ignore'):
        z = np.arctanh(corr)
    se = np.sqrt(se_func[method](corr) / dof)
    margin = stats.norm.ppf(1 - alpha / 2.) * se

    # tanh(z) == (exp(2z) - 1) / (exp(2z) + 1)
    corr_lower = np.tanh(z - margin)
    corr_upper = np.tanh(z + margin)

    return corr, pvalue, corr_lower, corr_upper

In [24]:
# Visual Python: Correlation Analysis
# NOTE(review): the previews of `df` in this notebook show 'sex' and 'race'
# holding strings ('F'/'M', 'White'), so neither column passes the numeric
# check below and both Pearson outputs come out empty - confirm the intended
# columns.
vp_df = df[['sex', 'race']].dropna().copy()

# Correlation Analysis
import itertools
from scipy import stats
from IPython.display import display, Markdown

# Fixed column order, so the result table keeps its header even when no
# numeric pair exists.
_columns = ['Variable1', 'Variable2', 'N', 'Correlation coefficient',
            'p-value', 'Lower(95%)', 'Upper(95%)']

# One row per unordered pair of numeric columns; rows are collected in a list
# and turned into a frame once instead of re-concatenating inside the loop.
_rows = []
for col1, col2 in itertools.combinations(vp_df.columns, 2):
    if not (pd.api.types.is_numeric_dtype(vp_df[col1])
            and pd.api.types.is_numeric_dtype(vp_df[col2])):
        continue
    _res = vp_confidence_interval_corr(vp_df[col1], vp_df[col2], method='pearson')
    _rows.append({'Variable1': col1, 'Variable2': col2, 'N': vp_df[col1].size,
                  'Correlation coefficient': _res[0], 'p-value': _res[1],
                  'Lower(95%)': _res[2], 'Upper(95%)': _res[3]})
_dfr = pd.DataFrame(_rows, columns=_columns)
display(Markdown('### Correlation Analysis: Pearson'))
display(_dfr)

# Correlation matrix: Pearson (non-numeric columns are excluded by numeric_only)
display(vp_df.corr(method='pearson', numeric_only=True).round(2))

### Correlation Analysis: Pearson

In [35]:
# Visual Python: Regression
# Simple linear regression
# dropna() with no arguments removes every row that has a missing value in any column.
vp_df = df.dropna().copy()

from IPython.display import display, Markdown
import statsmodels.formula.api as smf
# Model - Dependent variable ~ Independent variable
_model  = smf.ols('outcome ~ age', vp_df)
_result = _model.fit()
# Build the summary once; it is printed here and its coefficient table is
# reused below (the original rebuilt it three times).
_summary = _result.summary()
display(Markdown('### Model - Dependent variable ~ Independent variable'))
print(_summary)

# Multi-collinearity statistics
from statsmodels.stats.outliers_influence import variance_inflation_factor
# tables[1] is the coefficient table: first row is the header, the rest are
# data rows; the unnamed first column (term names) becomes the index.
_coef_rows = _summary.tables[1].data
_dfr = pd.DataFrame(_coef_rows[1:], columns=_coef_rows[0]).set_index('')
# exog_names[0] is skipped (presumably the intercept - confirm), so the
# design-matrix column index starts at 1.
for i, col in enumerate(_model.exog_names[1:], start=1):
    _vif = variance_inflation_factor(_model.exog, i)
    _dfr.loc[col,'Tolerance'] = 1/_vif
    _dfr.loc[col,'VIF'] = _vif
display(_dfr)

# Residual
from IPython.display import display, Markdown
from scipy import stats
import statsmodels.api as sm
# Fitted values and raw residuals, plus z-scored versions of both for the
# standardized plots further down.
_predict  = _result.predict(vp_df)
_residual = _result.resid
vp_residual = pd.DataFrame({'predict':_predict,'residual':_residual,
                            'predict_z':stats.zscore(_predict),'residual_z':stats.zscore(_residual)})
display(Markdown('### Residual'))
display(vp_residual)

# Residual Normality test (Shapiro-Wilk)
# NOTE(review): the frame preview earlier in this notebook shows row labels up
# to 25585; SciPy documents that the Shapiro-Wilk p-value may be inaccurate for
# N > 5000 - confirm this test is appropriate at this sample size.
_res = stats.shapiro(vp_residual['residual_z'])
display(Markdown('### Residual Normality test (Shapiro-Wilk)'))
# Row label spelling fixed ("Resisual" -> "Residual") to match the heading above.
display(pd.DataFrame(data={'Statistic':_res.statistic,'p-value':_res.pvalue},index=['Residual Normality test (Shapiro-Wilk)']))

import seaborn as sns
import warnings
# Draw the two residual diagnostics with all warnings silenced; the previous
# filter state is restored when the with-block exits.
with warnings.catch_warnings():
    # Install the filter before any plotting call so the first subplot() is
    # covered too (it used to run before the filter was set).
    warnings.simplefilter(action='ignore', category=Warning)

    # Residual histogram
    plt.subplot(2,2,1)
    sns.histplot(data=vp_residual, x='residual_z', kde=True)
    plt.title(f'Dependent variable: {_model.endog_names}')
    plt.xlabel('Regression Standardized residual')

    # Residual scatterplot
    plt.subplot(2,2,2)
    sns.scatterplot(data=vp_residual, x='predict_z', y='residual_z')
    plt.title(f'Dependent variable: {_model.endog_names}')
    plt.xlabel('Regression Standardized predicted value')
    plt.ylabel('Regression Standardized residual')

    plt.tight_layout()
    plt.show()

ModuleNotFoundError: No module named 'statsmodels'