In [57]:
import pandas as pd #data analysis and manipulation
import numpy as np #math equations
import plotly.graph_objects as g0 #for 3d visualization
import plotly.express as px #for 3d visualization
from scipy.stats import norm, skew #this tool is used for statisticall analysis
#norm - stands for normal distribution(used in histogram)
#skew - stands for skewness (will be used in histogram only)
import seaborn as sns #mix of matplotlib and pandas

In [58]:
#Dataset Loading and Preparation
df1 = sns.load_dataset('penguins').dropna()
df1.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [59]:
df2 = sns.load_dataset('iris').dropna()
df2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [60]:
df3 = sns.load_dataset('planets').dropna()
df3.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [61]:
#Selecting only numeric columns for penguins dataset  processing
numeric_cols1 = df1.select_dtypes(include='number').columns
print(numeric_cols1)

Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')


In [62]:
#Selecting only numeric columns for iris dataset processing
numeric_cols2 = df2.select_dtypes(include='number').columns
print(numeric_cols2)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')


In [63]:
#Selecting only numeric columns for planets dataset processing
numeric_cols3 = df3.select_dtypes(include='number').columns
print(numeric_cols3)

Index(['number', 'orbital_period', 'mass', 'distance', 'year'], dtype='object')


### **HISTOGRAM WITH NORMAL DISTRIBUTION AND ITS SKEWNESS**

In [64]:
#histogram for penguins dataset
for col in numeric_cols1:
  x1 = df1[col]
  mean, std = x1.mean(), x1.std()

  #plotting histogram with density curve
  hist = g0.Histogram(x = x1, histnorm= 'probability density', name='histogram', opacity=0.6)
  x1_range= np.linspace(x1.min(), x1.max(), 100)

  normal_curve = g0.Scatter(x = x1_range, y= norm.pdf(x1_range,mean,std), mode='lines', name='normal distribution')
  layout = g0.Layout(
      title = f'{col.capitalize()}Distribution<br>Skewness=(x.skew():.2f)',
      xaxis_title=col,
      yaxis_title='Density',
      bargap=0.2
)
  fig = g0.Figure(data= [hist, normal_curve], layout=layout)
  fig.show()

In [65]:
#histogram for iris dataset
for col in numeric_cols2:
  x2 = df2[col]
  mean, std = x2.mean(), x2.std()

  #plotting histogram with density curve
  hist = g0.Histogram(x = x2, histnorm= 'probability density', name='histogram', opacity=0.6)
  x2_range= np.linspace(x2.min(), x2.max(), 100)

  normal_curve = g0.Scatter(x = x2_range, y= norm.pdf(x2_range,mean,std), mode='lines', name='normal distribution')
  layout = g0.Layout(
      title = f'{col.capitalize()}Distribution<br>Skewness=(x.skew():.2f)',
      xaxis_title=col,
      yaxis_title='Density',
      bargap=0.2
)
  fig = g0.Figure(data= [hist, normal_curve], layout=layout)
  fig.show()

In [66]:
#histogram for planets dataset
for col in numeric_cols3:
  x3 = df3[col]
  mean, std = x3.mean(), x3.std()

  #plotting histogram with density curve
  hist = g0.Histogram(x = x3, histnorm= 'probability density', name='histogram', opacity=0.6)
  x3_range= np.linspace(x3.min(), x3.max(), 100)

  normal_curve = g0.Scatter(x = x3_range, y= norm.pdf(x3_range,mean,std), mode='lines', name='normal distribution')
  layout = g0.Layout(
      title = f'{col.capitalize()}Distribution<br>Skewness=(x.skew():.2f)',
      xaxis_title=col,
      yaxis_title='Density',
      bargap=0.2
)
  fig = g0.Figure(data= [hist, normal_curve], layout=layout)
  fig.show()

### **CORRELATION MATRIX (HeatMap)**

In [67]:
corr_matrix1 = df1[numeric_cols1].corr() #for penguin dataset
corr_matrix1

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
bill_length_mm,1.0,-0.228626,0.653096,0.589451
bill_depth_mm,-0.228626,1.0,-0.577792,-0.472016
flipper_length_mm,0.653096,-0.577792,1.0,0.872979
body_mass_g,0.589451,-0.472016,0.872979,1.0


In [68]:
corr_matrix2 = df2[numeric_cols2].corr() #for iris dataset
corr_matrix2

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [69]:
corr_matrix3 = df3[numeric_cols3].corr() #for planets dataset
corr_matrix3

Unnamed: 0,number,orbital_period,mass,distance,year
number,1.0,-0.059435,-0.249915,-0.288808,0.110399
orbital_period,-0.059435,1.0,0.184906,-0.035069,0.070186
mass,-0.249915,0.184906,1.0,0.274082,-0.137946
distance,-0.288808,-0.035069,0.274082,1.0,0.193087
year,0.110399,0.070186,-0.137946,0.193087,1.0


In [70]:
#For penguin dataset
fig = px.imshow(
    corr_matrix1,
    text_auto =True,
    color_continuous_scale ='RdBu',
    title="Correlation Matrix of Numerical Features",
    aspect ="auto"
)
fig.update_layout(margin=dict(l=60, r=60, t=50, b=50))
fig.show()

In [71]:
#For iris dataset
fig = px.imshow(
    corr_matrix2,
    text_auto =True,
    color_continuous_scale ='RdBu',
    title="Correlation Matrix of Numerical Features",
    aspect ="auto"
)
fig.update_layout(margin=dict(l=60, r=60, t=50, b=50))
fig.show()

In [72]:
#For planets dataset
fig = px.imshow(
    corr_matrix3,
    text_auto =True,
    color_continuous_scale ='RdBu',
    title="Correlation Matrix of Numerical Features",
    aspect ="auto"
)
fig.update_layout(margin=dict(l=60, r=60, t=50, b=50))
fig.show()

### **BOX-PLOT**


In [73]:
#For Penguins Dataset
categorical_cols1 = 'species'

In [74]:
#For Iris Dataset
categorical_cols2 = 'species'

In [75]:
#For Planets Dataset
categorical_cols3 = 'method'

In [76]:
for col in numeric_cols1:
    fig = px.box(df1, x=categorical_cols1, y=col, points="all", color=categorical_cols1)
    title= f'{col.capitalize()}BoxPlot grouped by{categorical_cols1.capitalize()}'
    notched=True
    fig.update_layout(yaxis_title = col, xaxis_title = categorical_cols1)
    fig.show()

In [77]:
for col in numeric_cols2:
    fig = px.box(df2, x=categorical_cols2, y=col, points="all", color=categorical_cols2)
    title= f'{col.capitalize()}BoxPlot grouped by{categorical_cols2.capitalize()}'
    notched=True
    fig.update_layout(yaxis_title = col, xaxis_title = categorical_cols2)
    fig.show()

In [78]:
for col in numeric_cols3:
    fig = px.box(df3, x=categorical_cols3, y=col, points="all", color=categorical_cols3)
    title= f'{col.capitalize()}BoxPlot grouped by{categorical_cols3.capitalize()}'
    notched=True
    fig.update_layout(yaxis_title = col, xaxis_title = categorical_cols3)
    fig.show()

In [79]:
#Creating dictionary of three datasets
datasets={
    "penguins":df1,
    "iris":df2,
    "planets":df3
}

In [80]:
# Initializing an empty comparison list
summary_stats = []

In [81]:
# Iterate over each dataset
for name, df in datasets.items():
    numeric_cols = df.select_dtypes(include='number')
    categorical_cols = df.select_dtypes(include='object')

    stat = {
        "Dataset": name,
        "Rows": df.shape[0],
        "Columns": df.shape[1],
        "Numeric Features": numeric_cols.shape[1],
        "Categorical Features": categorical_cols.shape[1],
        "Avg Skewness": round(numeric_cols.skew().mean(), 2),
        "Min Correlation": round(numeric_cols.corr().min().min(), 2) if numeric_cols.shape[1] > 1 else "N/A",
        "Max Correlation": round(numeric_cols.corr().max().max(), 2) if numeric_cols.shape[1] > 1 else "N/A"
    }

    summary_stats.append(stat)

# Convert to DataFrame
comparison_df = pd.DataFrame(summary_stats)
print(comparison_df)

    Dataset  Rows  Columns  Numeric Features  Categorical Features  \
0  penguins   333        7                 4                     3   
1      iris   150        5                 4                     1   
2   planets   498        6                 5                     1   

   Avg Skewness  Min Correlation  Max Correlation  
0          0.18            -0.58              1.0  
1          0.06            -0.43              1.0  
2          2.38            -0.29              1.0  
