# Exploratory Data Analysis - Example
---
Below you see an example of an EDA.
- Import packages
- Read the dataset
- Display the first few rows of the DataFrame
- Get information about the DataFrame
- Create summary statistics of the numerical data
- Count non-null values
- Show the correlation between variables
- Visualise the data with heatmaps, histograms, boxplots, scatter plots and a pair plot

Run the code cells one after the other and note what each step teaches you about your data.

In [None]:
# import packages 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the exercise dataset from its CSV file into a DataFrame
dataset_path = '../datasets/eda_exercise.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the top of the DataFrame (5 rows, the pandas default)
df.head(n=5)

In [None]:
# Get information about the DataFrame:
# prints a concise summary (columns, non-null counts, dtypes, memory usage)
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numerical columns
summary_statistics = df.describe()
summary_statistics

In [None]:
# Number of non-null entries in every column
non_null_counts = df.count()
non_null_counts

In [None]:
# Correlation matrix
# Restrict the computation to numeric columns: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# a ValueError if the frame contains e.g. text columns. For an all-numeric
# frame this argument changes nothing.
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix

In [None]:
# Visualise the correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")

In [None]:
# Draw one histogram (10 bins) per numerical column, each in its own figure
numerical_variables = [
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]
for column_name in numerical_variables:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(df[column_name], bins=10)
    ax.set_title(f"Histogram of {column_name}")
    ax.set_xlabel(column_name)
    ax.set_ylabel("Frequency")

In [None]:
# Draw one box plot per numerical column, each in its own figure
for column_name in numerical_variables:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column_name], ax=ax)
    ax.set_title(f"Boxplot of {column_name}")

In [None]:
# Scatter plot of yearly spending (x-axis) against time spent on the app (y-axis)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='Yearly Amount Spent', y='Time on App', ax=ax)

# Render the figure
plt.show()

In [None]:
# Seaborn's pairplot lets us see scatter plots and histograms for all numerical variables in one visualisation
sns.pairplot(df)
# pairplot draws a grid of subplots, so plt.title() would only label the
# current axes (a single subplot). suptitle() titles the whole figure;
# y slightly above 1 keeps it clear of the top row of plots.
plt.suptitle('Pairwise plot', y=1.02)