# Statistical Analysis Demo

This notebook demonstrates various statistical methods and visualizations using Python. We'll explore:

- Descriptive statistics
- Probability distributions  
- Correlation analysis
- Hypothesis testing
- Data visualization techniques

Let's start by importing the necessary libraries and loading a sample dataset.

## 1. Import Required Libraries

We'll import the essential libraries for statistical analysis and visualization.

In [None]:
# Import essential libraries for data analysis and visualization

import pandas as pd            # For data manipulation and analysis (DataFrames)
import numpy as np             # For numerical operations and array handling
import matplotlib.pyplot as plt # For creating static visualizations/plots
import seaborn as sns          # For advanced statistical data visualization
import scipy.stats as stats    # For statistical functions and tests

# Import specific statistical test functions from scipy.stats
from scipy.stats import pearsonr, spearmanr, ttest_ind, f_oneway

import warnings                # For managing warning messages
# Silence only the noisy deprecation-style categories. A blanket
# warnings.filterwarnings('ignore') would also hide UserWarning and
# RuntimeWarning messages, which usually point at real problems
# (e.g. a plotting call that silently clears a figure, or a division by zero).
for noisy_category in (DeprecationWarning, PendingDeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

# Use matplotlib's default style for every figure in the notebook
plt.style.use('default')

# Use the "husl" colour palette for seaborn plots
sns.set_palette("husl")

print("Libraries imported successfully!")  # Confirmation message

## 2. Load Sample Dataset

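We'll use the Tips example dataset provided through Seaborn, which contains information about restaurant tips. Note that `sns.load_dataset` downloads the data from Seaborn's online example-data repository the first time it is called (and caches it locally afterwards), so an internet connection is needed on the first run.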

In [None]:
# Load the 'tips' example dataset through seaborn into a DataFrame
df = sns.load_dataset('tips')

# Shape is reported as (number of rows, number of columns)
print("Dataset shape:", df.shape)

# Column names together with their dtypes
print("\nColumn names and types:")
print(df.dtypes)

# Quick preview of the first 5 rows
print("\nFirst 5 rows:")
print(df.head())

# DataFrame.info() writes its report (non-null counts, dtypes, memory usage)
# straight to stdout and returns None, so it must be called directly:
# wrapping it in print() would append a stray "None" line to the output.
print("\nDataset info:")
df.info()

## 3. Descriptive Statistics

Let's calculate basic descriptive statistics for our numerical variables.

In [None]:
# Overview table: count, mean, std, min, quartiles and max of every numeric column
print("=== Descriptive Statistics ===")
print(df.describe())

print("\n=== Detailed Statistics for Key Variables ===")

# Numeric columns that get the detailed per-column report below
numerical_cols = ['total_bill', 'tip', 'size']

# Report line label -> function computing that measure from one column.
# Dict order is the order in which the lines are printed.
summary_measures = {
    "Mean": lambda s: s.mean(),
    "Median": lambda s: s.median(),
    "Mode": lambda s: s.mode().iloc[0],          # first of the most frequent values
    "Standard Deviation": lambda s: s.std(),
    "Variance": lambda s: s.var(),
    "Skewness": lambda s: stats.skew(s),         # asymmetry of the distribution
    "Kurtosis": lambda s: stats.kurtosis(s),     # tailedness of the distribution
    "Range": lambda s: s.max() - s.min(),
    "IQR": lambda s: s.quantile(0.75) - s.quantile(0.25),
}

# One labelled section per column, every measure printed with 3 decimals
for column_name in numerical_cols:
    print(f"\n--- {column_name.upper()} ---")
    series = df[column_name]
    for label, measure in summary_measures.items():
        print(f"{label}: {measure(series):.3f}")

## 4. Correlation Analysis

Let's examine the relationships between numerical variables using different correlation methods.

In [None]:
# Correlation analysis
print("=== Correlation Matrix ===")

# Pairwise Pearson correlations between the numeric columns
# (numerical_cols is defined in the descriptive-statistics cell)
corr_matrix = df[numerical_cols].corr()
print("Pearson Correlation Matrix:")
print(corr_matrix)

# Individual coefficients together with their p-values
print("\n=== Detailed Correlation Analysis ===")

# P-values are printed in scientific notation: with a fixed ".3f" format any
# p-value below 0.0005 is displayed as "0.000", which hides its magnitude.

# Pearson: strength of the linear relationship
pearson_corr, pearson_p = pearsonr(df['total_bill'], df['tip'])
print(f"Pearson correlation (total_bill vs tip): {pearson_corr:.3f}, p-value: {pearson_p:.3e}")

# Spearman: rank-based, captures any monotonic relationship
spearman_corr, spearman_p = spearmanr(df['total_bill'], df['tip'])
print(f"Spearman correlation (total_bill vs tip): {spearman_corr:.3f}, p-value: {spearman_p:.3e}")

# Annotated heatmap of the correlation matrix, colour scale centred on 0
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

## 5. Hypothesis Testing

Let's perform various hypothesis tests to compare groups and test statistical significance.

In [None]:
# Three hypothesis tests on the tips data, each judged at the same level
print("=== Hypothesis Testing ===")

# Significance threshold shared by all tests in this cell
alpha = 0.05

# --- 1. Independent two-sample t-test: tip amount, smokers vs non-smokers ---
smoker_tips = df.loc[df['smoker'] == 'Yes', 'tip']
non_smoker_tips = df.loc[df['smoker'] == 'No', 'tip']

t_stat, t_p_value = ttest_ind(smoker_tips, non_smoker_tips)
t_verdict = 'Significant' if t_p_value < alpha else 'Not significant'

print("1. Independent T-test: Tips by Smoking Status")
print(f"   Smokers (n={len(smoker_tips)}): Mean tip = ${smoker_tips.mean():.2f}")
print(f"   Non-smokers (n={len(non_smoker_tips)}): Mean tip = ${non_smoker_tips.mean():.2f}")
print(f"   T-statistic: {t_stat:.3f}")
print(f"   P-value: {t_p_value:.3f}")
print(f"   Result: {t_verdict} difference")

# --- 2. One-way ANOVA: total bill across the days present in the data ---
# unique() yields the days in order of first appearance; the same order is
# used for the test input and for the group listing below.
observed_days = df['day'].unique()
day_groups = [df.loc[df['day'] == day_label, 'total_bill'] for day_label in observed_days]
f_stat, anova_p_value = f_oneway(*day_groups)
anova_verdict = 'Significant' if anova_p_value < alpha else 'Not significant'

print("\n2. One-way ANOVA: Total Bills by Day")
print(f"   F-statistic: {f_stat:.3f}")
print(f"   P-value: {anova_p_value:.3f}")
print(f"   Result: {anova_verdict} difference between days")

# Mean bill and number of rows for each day, reusing the series built above
print("   Group means:")
for day_label, day_bills in zip(observed_days, day_groups):
    print(f"   {day_label}: ${day_bills.mean():.2f} (n={len(day_bills)})")

# --- 3. Chi-square test of independence: smoker status vs sex ---
contingency_table = pd.crosstab(df['smoker'], df['sex'])
chi2, chi2_p, dof, expected = stats.chi2_contingency(contingency_table)
relationship = 'dependent' if chi2_p < alpha else 'independent'

print("\n3. Chi-square Test: Independence between Smoking and Gender")
print("   Contingency Table:")
print(contingency_table)
print(f"   Chi-square statistic: {chi2:.3f}")
print(f"   P-value: {chi2_p:.3f}")
print(f"   Degrees of freedom: {dof}")
print(f"   Result: Variables are {relationship}")

## 6. Data Visualization: Histograms

Histograms help us understand the distribution of numerical variables.

In [None]:
# 2x2 grid of histograms, one panel per numeric quantity
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution of Numerical Variables', fontsize=16, fontweight='bold')
bill_ax, tip_ax, size_ax, pct_ax = axes.ravel()  # row-major: top-left ... bottom-right

# The two dollar-valued columns share one layout:
# 20-bin histogram plus dashed markers for the mean (red) and median (green).
dollar_panels = [
    (bill_ax, 'total_bill', 'skyblue', 'Total Bill Distribution', 'Total Bill ($)'),
    (tip_ax, 'tip', 'lightcoral', 'Tip Distribution', 'Tip ($)'),
]
for panel, column, fill_color, heading, x_label in dollar_panels:
    values = df[column]
    mean_value = values.mean()
    median_value = values.median()
    panel.hist(values, bins=20, alpha=0.7, color=fill_color, edgecolor='black')
    panel.axvline(mean_value, color='red', linestyle='--',
                  label=f'Mean: ${mean_value:.2f}')
    panel.axvline(median_value, color='green', linestyle='--',
                  label=f'Median: ${median_value:.2f}')
    panel.set_title(heading)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.legend()

# Party size is discrete: integer bin edges 1..7 with left alignment put
# each bar directly over its integer value.
size_ax.hist(df['size'], bins=range(1, 8), alpha=0.7, color='lightgreen',
             edgecolor='black', align='left')
size_ax.set_title('Party Size Distribution')
size_ax.set_xlabel('Party Size')
size_ax.set_ylabel('Frequency')
size_ax.set_xticks(range(1, 7))

# Tip as a percentage of the bill. The column is stored on df because the
# boxplot and scatter-plot cells further down read df['tip_percentage'].
df['tip_percentage'] = df['tip'].div(df['total_bill']).mul(100)
mean_pct = df['tip_percentage'].mean()
pct_ax.hist(df['tip_percentage'], bins=20, alpha=0.7, color='gold', edgecolor='black')
pct_ax.axvline(mean_pct, color='red', linestyle='--', label=f'Mean: {mean_pct:.1f}%')
pct_ax.set_title('Tip Percentage Distribution')
pct_ax.set_xlabel('Tip Percentage (%)')
pct_ax.set_ylabel('Frequency')
pct_ax.legend()

fig.tight_layout()
plt.show()

# Additional: Overlaid histograms by category
#
# All groups drawn on one panel use the SAME bin edges, computed from the
# full column. Passing bins=15 to each hist() call separately would give
# every group its own edges, so overlapping bars would cover different value
# ranges and could not be compared bar-for-bar.
overlay_fig, (smoker_ax, gender_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: tip amounts split by smoker status
tip_bin_edges = np.histogram_bin_edges(df['tip'], bins=15)
for smoker in df['smoker'].unique():
    subset = df.loc[df['smoker'] == smoker, 'tip']
    smoker_ax.hist(subset, bins=tip_bin_edges, alpha=0.6, label=f'Smoker: {smoker}')
smoker_ax.set_xlabel('Tip ($)')
smoker_ax.set_ylabel('Frequency')
smoker_ax.set_title('Tip Distribution by Smoking Status')
smoker_ax.legend()

# Right panel: total bill split by the 'sex' column
bill_bin_edges = np.histogram_bin_edges(df['total_bill'], bins=15)
for gender in df['sex'].unique():
    subset = df.loc[df['sex'] == gender, 'total_bill']
    gender_ax.hist(subset, bins=bill_bin_edges, alpha=0.6, label=gender)
gender_ax.set_xlabel('Total Bill ($)')
gender_ax.set_ylabel('Frequency')
gender_ax.set_title('Total Bill Distribution by Gender')
gender_ax.legend()

overlay_fig.tight_layout()
plt.show()

## 7. Data Visualization: Boxplots

Boxplots show the distribution of data through quartiles and help identify outliers.

In [None]:
# Create boxplots to show distributions and outliers
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Boxplot Analysis', fontsize=16, fontweight='bold')

# Side-by-side boxplots of the two dollar-valued columns.
# The tick labels are set with set_xticklabels() instead of boxplot's
# `labels=` keyword: that keyword is deprecated since Matplotlib 3.9
# (renamed to `tick_labels`), whereas set_xticklabels() works on every version.
axes[0, 0].boxplot([df['total_bill'], df['tip']])
axes[0, 0].set_xticklabels(['Total Bill', 'Tip'])
axes[0, 0].set_title('Total Bill vs Tip Distribution')
axes[0, 0].set_ylabel('Amount ($)')

# Tips per day of the week
sns.boxplot(data=df, x='day', y='tip', ax=axes[0, 1])
axes[0, 1].set_title('Tip Distribution by Day')
axes[0, 1].set_ylabel('Tip ($)')
axes[0, 1].tick_params(axis='x', rotation=45)

# Total bill per 'sex' category, split further by smoker status via hue
sns.boxplot(data=df, x='sex', y='total_bill', hue='smoker', ax=axes[1, 0])
axes[1, 0].set_title('Total Bill by Gender and Smoking Status')
axes[1, 0].set_ylabel('Total Bill ($)')

# Tip percentage per value of the 'time' column.
# Requires df['tip_percentage'], which is added in the histogram cell.
sns.boxplot(data=df, x='time', y='tip_percentage', ax=axes[1, 1])
axes[1, 1].set_title('Tip Percentage by Time of Day')
axes[1, 1].set_ylabel('Tip Percentage (%)')

plt.tight_layout()
plt.show()

# Violin plots: same grouping idea as the boxplots, but showing the full
# estimated shape of each distribution. Drawn on explicit axes.
violin_fig, (day_ax, time_ax, overall_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Total bill per day
sns.violinplot(data=df, x='day', y='total_bill', ax=day_ax)
day_ax.set_title('Total Bill Distribution by Day (Violin Plot)')
day_ax.tick_params(axis='x', rotation=45)

# Tip per value of 'time', split by smoker status
sns.violinplot(data=df, x='time', y='tip', hue='smoker', ax=time_ax)
time_ax.set_title('Tip Distribution by Time and Smoking Status')

# Tip percentage over all rows (reads the df['tip_percentage'] column)
sns.violinplot(data=df, y='tip_percentage', ax=overall_ax)
overall_ax.set_title('Overall Tip Percentage Distribution')

violin_fig.tight_layout()
plt.show()

# Flag total_bill outliers with the 1.5 * IQR rule (the same rule that
# defines the whisker limits of a standard boxplot).
bill_values = df['total_bill']
Q1, Q3 = bill_values.quantile(0.25), bill_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# A row is an outlier when its bill lies strictly outside [lower_bound, upper_bound]
is_outlier = bill_values.lt(lower_bound) | bill_values.gt(upper_bound)
outliers = df[is_outlier]

print("\nOutlier Analysis for Total Bill:")
print(f"Number of outliers: {len(outliers)}")
print(f"Outlier threshold: < ${lower_bound:.2f} or > ${upper_bound:.2f}")
if not outliers.empty:
    # to_string() prints every outlier row without truncation
    print("Outlier values:")
    print(outliers[['total_bill', 'tip', 'day', 'time']].to_string())

## 8. Data Visualization: Scatter Plots

Scatter plots help visualize relationships between two numerical variables.

In [None]:
# 2x2 grid of scatter plots exploring pairwise relationships
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Scatter Plot Analysis', fontsize=16, fontweight='bold')
(bill_tip_ax, by_smoker_ax), (party_ax, pct_ax) = axes

bill, tip = df['total_bill'], df['tip']

# Top-left: every bill/tip pair plus a dashed first-degree least-squares fit
bill_tip_ax.scatter(bill, tip, alpha=0.6, color='blue')
bill_tip_ax.set_xlabel('Total Bill ($)')
bill_tip_ax.set_ylabel('Tip ($)')
bill_tip_ax.set_title('Total Bill vs Tip')
trend_line = np.poly1d(np.polyfit(bill, tip, 1))
bill_tip_ax.plot(bill, trend_line(bill), "r--", alpha=0.8)

# Top-right: same axes, one colour per smoker status
# (statuses are paired with colours in order of first appearance)
status_colors = zip(df['smoker'].unique(), ['red', 'blue'])
for status, point_color in status_colors:
    group = df[df['smoker'] == status]
    by_smoker_ax.scatter(group['total_bill'], group['tip'],
                         alpha=0.6, label=f'Smoker: {status}', color=point_color)
by_smoker_ax.set_xlabel('Total Bill ($)')
by_smoker_ax.set_ylabel('Tip ($)')
by_smoker_ax.set_title('Total Bill vs Tip by Smoking Status')
by_smoker_ax.legend()

# Bottom-left: party size against total bill, integer ticks 1..6
party_ax.scatter(df['size'], bill, alpha=0.6, color='green')
party_ax.set_xlabel('Party Size')
party_ax.set_ylabel('Total Bill ($)')
party_ax.set_title('Party Size vs Total Bill')
party_ax.set_xticks(range(1, 7))

# Bottom-right: total bill against tip percentage
# (reads df['tip_percentage'], added in the histogram cell)
pct_ax.scatter(bill, df['tip_percentage'], alpha=0.6, color='purple')
pct_ax.set_xlabel('Total Bill ($)')
pct_ax.set_ylabel('Tip Percentage (%)')
pct_ax.set_title('Total Bill vs Tip Percentage')

fig.tight_layout()
plt.show()

# Advanced scatter plots using seaborn
#
# The scatter matrix is drawn in its own figure (below) rather than inside
# one panel of this one: it needs an n-by-n grid of axes, and when
# pandas.plotting.scatter_matrix is handed a single Axes via `ax=` it clears
# the entire figure that owns that Axes, wiping any panel drawn before it.
fig, (reg_ax, bubble_ax, hex_ax) = plt.subplots(1, 3, figsize=(20, 6))

# Panel 1: points coloured by 'time' and marked by 'smoker', with a single
# overall regression line and its confidence band on top
sns.scatterplot(data=df, x='total_bill', y='tip', hue='time', style='smoker',
                s=100, ax=reg_ax)
sns.regplot(data=df, x='total_bill', y='tip', scatter=False, color='black', ax=reg_ax)
reg_ax.set_title('Total Bill vs Tip (by Time and Smoking Status)')

# Panel 2: bubble chart - marker area scales with party size, colour with
# tip percentage (reads df['tip_percentage'], added in the histogram cell)
scatter = bubble_ax.scatter(df['total_bill'], df['tip'], s=df['size'] * 50,
                            c=df['tip_percentage'], cmap='viridis', alpha=0.6)
fig.colorbar(scatter, ax=bubble_ax, label='Tip Percentage (%)')
bubble_ax.set_xlabel('Total Bill ($)')
bubble_ax.set_ylabel('Tip ($)')
bubble_ax.set_title('Bubble Chart: Bill vs Tip (Size=Party Size, Color=Tip%)')

# Panel 3: hexbin density - colour encodes the number of points per hexagon
hexes = hex_ax.hexbin(df['total_bill'], df['tip'], gridsize=20, cmap='Blues')
fig.colorbar(hexes, ax=hex_ax, label='Count')
hex_ax.set_xlabel('Total Bill ($)')
hex_ax.set_ylabel('Tip ($)')
hex_ax.set_title('Density Plot: Total Bill vs Tip')

fig.tight_layout()
plt.show()

# Scatter matrix of the numeric columns in a separate figure.
# A 50-row sample keeps the panels readable; random_state fixes the sample
# so that re-running the notebook reproduces the same figure.
numeric_df = df[['total_bill', 'tip', 'size']].sample(50, random_state=42)
pd.plotting.scatter_matrix(numeric_df, alpha=0.6, figsize=(6, 6))
plt.suptitle('Scatter Matrix (Sample)')
plt.show()

# Numeric summary of the relationships shown in the scatter plots.
# Series.corr() computes the Pearson coefficient by default.
bill_tip_corr = df['total_bill'].corr(df['tip'])
size_bill_corr = df['size'].corr(df['total_bill'])
bill_pct_corr = df['total_bill'].corr(df['tip_percentage'])

print("=== Relationship Analysis ===")
print(f"Correlation between Total Bill and Tip: {bill_tip_corr:.3f}")
print(f"Correlation between Party Size and Total Bill: {size_bill_corr:.3f}")
print(f"Correlation between Total Bill and Tip Percentage: {bill_pct_corr:.3f}")

# First-degree least-squares fit: polyfit returns [slope, intercept].
# For a simple linear regression, R-squared is the squared Pearson coefficient.
slope, intercept = np.polyfit(df['total_bill'], df['tip'], 1)
r_squared = bill_tip_corr ** 2
print(f"\nLinear Regression: Tip = {slope:.3f} × Total_Bill + {intercept:.3f}")
print(f"R-squared: {r_squared:.3f}")