In [None]:
!unzip /content/housing.csv.zip


In [None]:
import pandas as pd

# Read the extracted housing CSV into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='/content/housing.csv')

# Preview the top five rows (head's default) as this cell's output.
df.head(n=5)


In [None]:
# Count fully duplicated rows (every column equal to an earlier row).
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Only rebuild the frame when there is something to drop; the first
# occurrence of each duplicated row is kept.
if duplicates > 0:
    df = df.loc[~df.duplicated()]



In [None]:
# Report how many null entries each column contains.
missing_values = df.isna().sum()
print("Missing values in each column:")
print(missing_values)

# Impute 'total_bedrooms' with its own median (median() skips nulls),
# writing the filled column back into df.
bedrooms_median = df['total_bedrooms'].median()
df['total_bedrooms'] = df['total_bedrooms'].fillna(value=bedrooms_median)



In [None]:
# Re-count nulls per column to confirm the imputation above took effect.
print(df.isna().sum())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns stored as float64 or int64 are treated as the numerical features.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# One figure per numerical column: a horizontal boxplot to eyeball
# spread and outliers.
for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()


In [None]:
# Distribution of 'median_income': 30-bin histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['median_income'], bins=30, kde=True, ax=ax)
ax.set(
    title='Histogram of Median Income',
    xlabel='Median Income',
    ylabel='Frequency',
)
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler

# Column left out of scaling. The original cell standardized every numeric
# column, including this one, so the later plots labelled
# "Median House Value" were showing z-scores instead of house values.
TARGET_COLUMN = 'median_house_value'

# Initialize the StandardScaler (zero mean, unit variance per column).
scaler = StandardScaler()

# All float64/int64 columns, kept under the same name as before.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Columns that actually get standardized: every numerical column except the
# target. errors='ignore' keeps the cell working if the target is absent.
feature_columns = numerical_columns.drop(TARGET_COLUMN, errors='ignore')

# Standardize the feature columns in place.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split is added later, fit on the training portion only to avoid leakage.
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Display the first few rows to confirm
df.head()



In [None]:
import pandas as pd

# One-hot encode the categorical 'ocean_proximity' column. The result goes
# into a new frame, so `df` keeps the original column for the categorical
# plots further down.
df_encoded = pd.get_dummies(df, columns=['ocean_proximity'])

# Show the column names of the ENCODED frame so the new
# 'ocean_proximity_*' indicator columns can be verified. The original cell
# printed df.columns, which never contains them.
print(df_encoded.columns)

# Preview the encoded frame. This must be the last expression in the cell,
# because Jupyter only renders the value of the final expression.
df_encoded.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of 'median_house_value' from the encoded frame, drawn as a
# histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_encoded['median_house_value'], kde=True, ax=ax)
ax.set_title('Distribution of Median House Value')
ax.set_xlabel('Median House Value')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Horizontal bar chart of how many rows fall in each 'ocean_proximity'
# category, with bars ordered from most to least frequent.
category_order = df['ocean_proximity'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y='ocean_proximity', data=df, order=category_order, ax=ax)
ax.set(
    title='Frequency of Ocean Proximity Categories',
    xlabel='Count',
    ylabel='Ocean Proximity',
)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Relationship between 'median_income' (x) and 'median_house_value' (y),
# one point per row of the encoded frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_encoded,
    x='median_income',
    y='median_house_value',
    ax=ax,
)
ax.set_title('Median Income vs. Median House Value')
ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')
plt.show()


In [None]:
# Compare the spread of 'median_house_value' across 'ocean_proximity'
# categories. This uses `df` (not the encoded frame) because it still has
# the original categorical column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='ocean_proximity', y='median_house_value', ax=ax)
ax.set(
    title='Median House Value by Ocean Proximity',
    xlabel='Ocean Proximity',
    ylabel='Median House Value',
)
plt.show()