In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro

# Read the CSV into a DataFrame -- swap in the real path of your dataset.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# 1. Structural overview (df.info() prints its own report)
print("Dataset Info:")
df.info()

# 2. Peek at the leading rows
print("\nFirst 5 rows of the dataset:", df.head(), sep="\n")

# 3. Descriptive statistics as produced by DataFrame.describe()
print("\nSummary Statistics:", df.describe(), sep="\n")

# 4. Missing values: number of nulls in every column
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

# Nothing is imputed or dropped here. Typical follow-ups:
#   - fill numeric columns with the mean and categorical columns with the mode,
#     e.g. df['column_name'] = df['column_name'].fillna(df['column_name'].mean())
#   - or remove incomplete rows/columns with df.dropna()

# 5. Distribution of Data (Central Tendency and Dispersion)
# Restrict the reductions to numeric columns: with pandas >= 2.0,
# DataFrame.mean()/median()/std()/var() raise TypeError when the frame holds
# string/object columns instead of silently skipping them.
numeric_data = df.select_dtypes(include=[np.number])

print("\nCentral Tendency and Dispersion:")
# pandas defaults apply: missing values are skipped, and std/var are the
# sample estimates (ddof=1).
mean = numeric_data.mean()
median = numeric_data.median()
std_dev = numeric_data.std()
variance = numeric_data.var()
print("Mean:\n", mean)
print("\nMedian:\n", median)
print("\nStandard Deviation:\n", std_dev)
print("\nVariance:\n", variance)

# 6. Visualization: Histograms and Box Plots for Distribution
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Palette for the histograms; it is cycled when there are more numeric
# columns than colors.
colors = ['skyblue', 'lightgreen', 'pink', 'orange', 'lightcoral', 'lightyellow']  # Add more colors if needed

# Plot one histogram per numeric column, each in its own color.
# The palette cannot be handed to DataFrame.hist as color=[...]: the keyword is
# forwarded unchanged to every subplot, each subplot holds a single dataset,
# and matplotlib raises ValueError when one dataset is given several colors.
# So draw the grid first and recolor the bars of each subplot afterwards.
hist_axes = df[numeric_columns].hist(figsize=(10, 8), bins=15, edgecolor='black')
for position, ax in enumerate(np.ravel(hist_axes)):
    bar_color = colors[position % len(colors)]
    # Unused (hidden) grid cells have no patches, so they are skipped naturally.
    for bar in ax.patches:
        bar.set_facecolor(bar_color)
plt.suptitle('Histograms of Numerical Features')
plt.show()

# Box Plots for detecting outliers (one box per numeric column on a shared axis)
plt.figure(figsize=(12, 8))
sns.boxplot(data=df[numeric_columns])
plt.title('Box Plots of Numerical Features')
plt.xticks(rotation=90)  # vertical labels so long column names do not overlap
plt.show()

# 7. Outlier Detection using the Z-Score method
# nan_policy='omit' computes each column's mean/std from its non-missing
# values. With the default ('propagate') a single NaN turns every z-score of
# that column into NaN, and missing values are not imputed earlier in this script.
z_scores = np.abs(stats.zscore(df[numeric_columns], nan_policy='omit'))
print("\nZ-scores of Numerical Features:")
print(z_scores)

# Flag observations lying more than 3 standard deviations from the column mean.
# NaN compares as False, so missing values are never counted as outliers.
outlier_counts = pd.Series((np.asarray(z_scores) > 3).sum(axis=0), index=numeric_columns)
print("\nNumber of outliers per feature (|z| > 3):")
print(outlier_counts)

# 8. Correlation Analysis
# Correlate numeric columns only: with pandas >= 2.0, DataFrame.corr() no
# longer drops text columns silently and raises ValueError when it meets one.
print("\nCorrelation Matrix:")
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
print(correlation_matrix)

# Plotting the correlation matrix heatmap (each cell annotated to 2 decimals)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()

# 9. Pairwise relationships: a grid of plots over every pair of numeric features
numeric_frame = df[numeric_columns]
sns.pairplot(numeric_frame)
# y=1.02 places the figure title slightly above the top of the grid
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

# 10. Assumptions Testing: Shapiro-Wilk test for normality
# Runs on a single column; change column_to_test and re-run for other columns.

column_to_test = 'column_name'  # Replace with your column name

# Missing values are removed before the test is run.
observed_values = df[column_to_test].dropna()
stat, p_value = shapiro(observed_values)

print(f"\nShapiro-Wilk Normality Test for '{column_to_test}':")
print(f"Statistic: {stat}, P-value: {p_value}")

# Decision at the 5% significance level (null hypothesis: data is normal).
normality_verdict = (
    "The data seems to follow a normal distribution (fail to reject null hypothesis)."
    if p_value > 0.05
    else "The data does not follow a normal distribution (reject null hypothesis)."
)
print(normality_verdict)

# 11. Testing Homoscedasticity (Residual plot for Linear Regression)
# Example for one variable 'X' vs 'Y', replace with actual variables in your dataset.
# residplot regresses Y on X and scatters the residuals around a zero line;
# a roughly even vertical spread across X is consistent with constant variance,
# while a funnel shape points to heteroscedasticity. A regression-line plot
# such as lmplot shows the fit over the raw data, not the residuals.
# NOTE: robust=True fits a robust regression and requires statsmodels.
plt.figure(figsize=(8, 6))
sns.residplot(x='X', y='Y', data=df, robust=True)
plt.title('Residual Plot: X vs Y')
plt.show()

# 12. Distribution of a single feature, with a KDE curve enabled via kde=True
# Swap 'column_name' for the feature you want to inspect.
feature_values = df['column_name']
sns.displot(feature_values, kde=True)
plt.title('Distribution of Column')
plt.show()
