In [1]:
import sqlite3
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply the white-grid seaborn theme to every figure in the notebook.
# set_theme() is the preferred spelling; sns.set() is only a legacy alias of it.
sns.set_theme(style="whitegrid")

In [2]:
# Location of the SQLite database, relative to the notebook's working directory
DB_PATH = Path('mydatabase.db')

# sqlite3.connect() silently creates a new, empty database when the file does not
# exist, which would only surface later as "no such table: main_table".
# Fail fast with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(
        f"SQLite database not found: {DB_PATH.resolve()} (check the working directory)"
    )

# Open the connection used by the query cell(s) below
conn = sqlite3.connect(DB_PATH)

In [3]:
# Pull every row and column of main_table into a DataFrame for the exploratory analysis
query = """
SELECT *
FROM main_table
"""
df = pd.read_sql_query(sql=query, con=conn)

# Further queries (e.g. narrower subsets for a specific analysis) can be added here


In [None]:
# Distribution of the 'age' column: 30-bin histogram with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='age', bins=30, kde=True, color='skyblue', ax=ax)
ax.set(title='Distribution of Age', xlabel='Age', ylabel='Frequency')
plt.show()

# Spread and outliers of the 'balance' column as a horizontal box plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='balance', ax=ax)
ax.set(title='Box Plot of Balance', xlabel='Balance')
plt.show()


In [None]:
# Scatter plot of 'age' against 'balance'; points are coloured by the 'y' column
plt.figure(figsize=(10, 6))
sns.scatterplot(x='age', y='balance', data=df, hue='y')
plt.title('Age vs Balance Scatter Plot')
plt.xlabel('Age')
plt.ylabel('Balance')
plt.show()

# Bar chart of 'balance' per 'job' (barplot aggregates with the mean by default)
plt.figure(figsize=(14, 7))
# Passing `palette` without `hue` is deprecated since seaborn 0.13 (removal announced
# for 0.14). Mapping `hue` to the same column as `x` keeps one colour per job, and
# dodge=False keeps every bar at full width, centred on its tick.
ax = sns.barplot(x='job', y='balance', hue='job', data=df, palette='viridis', dodge=False)
# Older seaborn releases draw a legend for this redundant hue mapping; drop it if present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title('Average Balance by Job Type')
plt.xticks(rotation=45)  # job labels are long; tilt them so they do not overlap
plt.xlabel('Job')
plt.ylabel('Average Balance')
plt.show()


In [None]:
# Correlation heatmap of the numeric columns
numeric_df = df.select_dtypes(include=[np.number])  # np.number covers all numeric dtypes
corr_matrix = numeric_df.corr()

plt.figure(figsize=(10, 8))
# Pin the colour scale to the full correlation range [-1, 1]. Without this the scale
# is fitted to the observed min/max, so the neutral midpoint of the diverging
# 'coolwarm' map would not correspond to zero correlation.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Keep only the numeric columns; describe() and corr() below operate on these
numeric_df = df.select_dtypes(include=[np.number])

# Each report is a (heading, callable) pair so the table is computed only
# right before it is printed, directly after its heading.
reports = (
    ("Basic Descriptive Statistics:", numeric_df.describe),
    ("\nCorrelation Coefficients:", numeric_df.corr),
)
for heading, compute in reports:
    print(heading)
    print(compute())

In [None]:
# List the column labels of the loaded DataFrame
print(df.columns)