# Data Engineering and Data Science

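This notebook walks through a basic data engineering and data science workflow in Python: loading a real estate sales dataset, cleaning it, engineering a feature, and training and evaluating a linear regression model.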

In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


## Load Dataset

In [None]:

# Read the aggregated real estate sales CSV (example dataset for demonstration)
# from the notebook's working directory into a DataFrame.
sales_csv_path = 'aggregated_real_estate_sales.csv'
real_estate_sales = pd.read_csv(sales_csv_path)

# Preview the first five rows to sanity-check the load
real_estate_sales.head(5)


## Data Cleaning and Preprocessing

In [None]:

# Report how many values are missing in each column
print("Missing Values:")
print(real_estate_sales.isnull().sum())

# Fill gaps in numeric columns with that column's median (example strategy).
# numeric_only=True is required: since pandas 2.0, DataFrame.median() raises a
# TypeError when the frame contains non-numeric columns (e.g. a string-typed
# 'Date' column) instead of silently skipping them.
# The frame is rebound rather than mutated with inplace=True so the cell stays
# safe to re-run.
numeric_medians = real_estate_sales.median(numeric_only=True)
real_estate_sales = real_estate_sales.fillna(numeric_medians)

# Re-check the counts. Only numeric columns are filled above; any missing
# values in non-numeric columns will still be reported here.
print("Missing Values after filling:")
print(real_estate_sales.isnull().sum())


## Feature Engineering

In [None]:

# Example feature: Price per Unit = Total_Sales / Sales_Units.
# The 'Date' column is then removed because it is not used as a model input.
#
# The result is rebound instead of mutated in place, and errors='ignore' keeps
# the cell re-runnable: on a second execution 'Date' is already gone and a
# plain drop would raise KeyError.
# NOTE(review): rows where Sales_Units is 0 would yield inf/NaN here and break
# model fitting later — confirm the dataset contains no such rows.
real_estate_sales = (
    real_estate_sales
    .assign(Price_per_Unit=lambda df: df['Total_Sales'] / df['Sales_Units'])
    .drop(columns=['Date'], errors='ignore')
)

# Display the transformed dataset
real_estate_sales.head()


## Train-Test Split

In [None]:

# Predictors (X): unit volume and the engineered per-unit price.
# Target (y): the average price.
# NOTE(review): Price_per_Unit is Total_Sales / Sales_Units; if Average_Price is
# computed the same way in the source data, this feature leaks the target —
# confirm against the dataset definition.
feature_columns = ['Sales_Units', 'Price_per_Unit']
X = real_estate_sales[feature_columns]
y = real_estate_sales['Average_Price']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


## Build and Train Model

In [None]:

# Create a Linear Regression estimator and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Show the learned parameters: one coefficient per feature column, plus the intercept
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)


## Model Evaluation

In [None]:

# Predict the target for the held-out test features
y_pred = model.predict(X_test)

# Score the predictions against the true test targets:
# mean squared error and the coefficient of determination (R-squared)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


## Summary and Insights

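- The notebook demonstrates a basic workflow for data engineering and data science: loading, cleaning, feature engineering, model training, and evaluation.
- A Linear Regression model was built to predict the average price of properties from `Sales_Units` and the engineered `Price_per_Unit` feature, and was evaluated on a held-out test set using Mean Squared Error and R-squared.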