# Exploratory Data Analysis (EDA) for Ames Housing Dataset

## Exploring a dataset of housing sales in Ames, Iowa. The target variable in this analysis is the sale price (`SalePrice`).



## 1. Import Libraries and Data Structure


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
import numpy as np


In [None]:
# Location of the Ames Housing CSV inside the workspace
csv_path = "/Workspace/Users/iakidwell@uchicago.edu/AmesHousing.csv"

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_path)

# Sanity check: show the first five rows
print(df.head(n=5))

In [None]:
# Descriptive statistics for df. The result is not printed explicitly;
# presumably the notebook renders it as the cell's last expression.
df.describe()

In [None]:
# Print a concise summary of df (column names, non-null counts, dtypes).
df.info()

In [None]:
# Split the columns by dtype so numeric and categorical features can be
# explored separately. If the numeric features perform well in AutoML, a
# regression model will be built from the numeric features alone.

# Columns stored as int64 or float64 are treated as numeric features
numeric_features = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Columns stored as object are treated as categorical features
categorical_features = list(df.select_dtypes(include=['object']).columns)

# Preview the first ten names of each group
print("Numeric features (first 10):", numeric_features[:10])
print("Categorical features (first 10):", categorical_features[:10])


## 2. Check for Missing Values

In [None]:
# Number of missing entries in each column
missing_values = df.isna().sum()
print(missing_values)

# The same counts expressed as a percentage of all rows
missing_percent = missing_values / len(df) * 100

# Combine counts and percentages into one table
missing_df = pd.DataFrame({
    'MissingValues': missing_values,
    'PercentMissing': missing_percent
})

# Keep only the columns that actually have gaps, worst offenders first
has_missing = missing_df['MissingValues'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='PercentMissing', ascending=False)
print(missing_df)


## 3. Summary Statistics

In [None]:
# Descriptive statistics for the numeric features, transposed so each
# feature is one row, plus a 'range' column holding max minus min.
numeric_summary = df[numeric_features].describe().transpose()
numeric_summary['range'] = numeric_summary['max'].sub(numeric_summary['min'])
numeric_summary


## 4. Distribution Analysis

In [None]:
# High-level scan of every numeric feature: one histogram (with a KDE overlay)
# per feature on a grid three plots wide, to see if there is anything we
# should go into more depth on.
num_cols = len(numeric_features)

# Ceiling division gives exactly as many rows as the features need;
# `num_cols // 3 + 1` would add an empty row (and wasted figure height)
# whenever num_cols is a multiple of 3. Keep at least one row so the figure
# never ends up with zero height.
n_rows = max(1, (num_cols + 2) // 3)
plt.figure(figsize=(15, n_rows * 4))

# Subplot positions are 1-based, hence enumerate(..., 1)
for i, col in enumerate(numeric_features, 1):
    plt.subplot(n_rows, 3, i)
    sns.histplot(df[col], kde=True, bins=30, color='skyblue')
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()


In [None]:

# Taking a look at outliers: one boxplot per numeric feature on a grid three
# plots wide.

# Derive the row count here instead of relying on a value left over from
# another cell, so this cell still works when cells are run out of order.
# Ceiling division avoids an empty trailing row; keep at least one row so the
# figure never has zero height.
n_rows = max(1, (len(numeric_features) + 2) // 3)
plt.figure(figsize=(15, n_rows * 4))

# Subplot positions are 1-based, hence enumerate(..., 1)
for i, col in enumerate(numeric_features, 1):
    plt.subplot(n_rows, 3, i)
    sns.boxplot(x=df[col], color='lightgreen')
    plt.title(f'Boxplot of {col}')

plt.tight_layout()
plt.show()


## 5. Summary Statistics for Categorical Variables

In [None]:
# Build a one-row-per-feature summary table for the categorical columns
cat_frame = df[categorical_features]
cat_summary = pd.DataFrame(index=categorical_features)

# How many distinct (non-missing) levels each feature has
cat_summary['UniqueValues'] = cat_frame.nunique()

# The most common level of each feature (first row of the per-column modes)
cat_summary['TopCategory'] = cat_frame.mode().iloc[0]

# How many rows carry that most common level
cat_summary['TopCategoryFreq'] = cat_frame.apply(lambda column: column.value_counts().iloc[0])

# Share of ALL rows (rows with a missing value included in the denominator)
# that carry the most common level, rounded to two decimals
top_share = cat_summary['TopCategoryFreq'] / len(df) * 100
cat_summary['TopCategoryPercent'] = top_share.round(2)

cat_summary


In [None]:


# Use seaborn's white-grid look for the plots
sns.set(style="whitegrid")

# One bar chart per categorical feature, with the bars ordered by how often
# each level occurs
for col in categorical_features:
    level_order = df[col].value_counts().index
    plt.figure(figsize=(8, 4))
    sns.countplot(data=df, x=col, order=level_order, color="skyblue")
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


## 6. Relationship between Features and Target

In [None]:
# --- Numeric features correlation with SalePrice ---
numeric_df = df.select_dtypes(include=['int64','float64'])  # select numeric features

# Correlation of every numeric column with the target, strongest positive
# first. This series still contains SalePrice itself, so anything that builds
# on it (e.g. a heatmap of the top entries) keeps the target.
correlations = numeric_df.corr()['SalePrice'].sort_values(ascending=False)

# For the printed rankings, leave out the target's self-correlation (always
# 1.0, not a feature) and undefined (NaN) correlations, which sort last and
# would otherwise be listed as the "most negative" ones.
feature_correlations = correlations.drop(labels='SalePrice').dropna()

print("Top 10 features most positively correlated with SalePrice:\n", feature_correlations.head(10))
print("\nTop 10 features most negatively correlated with SalePrice:\n", feature_correlations.tail(10))



In [None]:
# Optional: annotated heatmap of the pairwise correlations among the ten
# entries at the head of `correlations`
top_corr_features = correlations.head(10).index
top_corr_matrix = df[top_corr_features].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(top_corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Top 10 Numeric Features with SalePrice")
plt.show()

In [None]:
# --- Categorical features vs SalePrice ---
# Rebuild the list of object-dtype columns so the cell does not depend on an
# earlier definition
categorical_features = list(df.select_dtypes(include=['object']).columns)

# One figure per categorical feature: SalePrice boxplot for each level
for col in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=col, y='SalePrice', data=df)
    plt.title(f'SalePrice Distribution by {col}')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Subset numeric features from the dataframe
numeric_df = df[numeric_features]  # numeric_features is a list of column names

# Compute the pairwise correlation matrix of all numeric features
corr_matrix = numeric_df.corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. The mask is built from the matrix SHAPE, not its
# values: passing np.triu(corr_matrix) directly would leave any cell whose
# correlation is exactly 0 unmasked.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(16, 12))

# Draw the heatmap
sns.heatmap(
    corr_matrix,
    annot=True,          # show correlation values
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    square=True,
    mask=upper_triangle,  # cells where the mask is True are not drawn
)

plt.title("Correlation Matrix of Numeric Features", fontsize=18)
plt.show()

## 7. Identifying Outliers and Flagging Future Issues

In [None]:
# Section 7: Flagging Potential Data Quality Issues
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Duplicate rows: report how many rows repeat an earlier row
duplicates = df.loc[df.duplicated()]
print(f"Number of duplicate rows: {len(duplicates)}")

# 2. Impossible values: report numeric columns that contain negative values
#    (only negatives are checked here; zeros are not flagged)
negative_cols = [col for col in numeric_features if df[col].lt(0).any()]
for col in negative_cols:
    print(f"Column {col} has negative values")

# 3. Summary statistics of the numeric features, to eyeball potential outliers
numeric_df = df[numeric_features]
print("\nSummary statistics to spot potential outliers:")
print(numeric_df.describe())

# 4. All numeric features as boxplots on a single shared axis
plt.figure(figsize=(16, 8))
sns.boxplot(data=numeric_df)
plt.xticks(rotation=90)
plt.title("Boxplot of Numeric Features - Potential Outliers")
plt.show()

# 5. List the distinct values of each categorical feature so inconsistent
#    labels can be spotted by eye
for col in categorical_features:
    print(f"\nUnique values in {col}: {df[col].unique()}")

# Optional: flag columns with at most one distinct non-missing value.
# nunique() ignores NaN, so an all-missing column is flagged as well.
# Missingness itself is not evaluated here.
unique_counts = df.nunique()
low_variance_cols = unique_counts.index[unique_counts <= 1].tolist()
print(f"\nColumns with low variance (1 unique value): {low_variance_cols}")



## We might need to log-transform SalePrice to make its distribution closer to normal
