# Setup

In [48]:
"""
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install matplotlib
%pip install seaborn
"""

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import plot_tree
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the housing data from a CSV file located relative to the current
# working directory; the cells below read from `df`.
df = pd.read_csv('California_Housing.csv')

# Evaluation

In [49]:
def evaluation(y_test, y_pred):
    """Print and return regression metrics for a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        dict: The scores stored under the keys ``'MAE'`` (mean absolute
        error), ``'MSE'`` (mean squared error) and ``'R2'`` (R-squared).
    """
    # Compute every score up front, in the same order they are reported.
    scores = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }

    # Report the scores. For R-squared, the closer the value is to 1,
    # the better the model fits the data.
    report_lines = (
        ('Mean Absolute Error:', 'MAE'),
        ('Mean Squared Error:', 'MSE'),
        ('R-squared:', 'R2'),
    )
    for label, key in report_lines:
        print(label, scores[key])

    return scores

# Data Preparation

## 1) Get an overview of the dataset

Try to get an overview of your data. Find out what kinds of objects and data types you're working with. Are there any missing values and/or duplicate entries? How could you fix these anomalies and improve your dataset?

In [None]:
# Preview the first rows of the DataFrame (head() returns 5 rows by default)
df.head()

## 2) Data cleaning

Try to clean up your data. Delete all entries with missing values and try to use only numerical data types in your dataset.

## 3) Correlations

Create a plot with all correlations from this dataset.

Hint: Use the pandas `DataFrame.corr()` method and create a heatmap in either seaborn or matplotlib.

## 4) Data visualization
Create a new scatterplot using matplotlib or seaborn and visualize the entries on a map. Create different scatterplots for the actual house prices, average income and the ocean proximity.

# Machine Learning Models

In [61]:
# Separate the column to predict (target) from the remaining columns (features)
y = df['median_house_value']
X = df.drop(columns='median_house_value')

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible between runs.
# Can this value change the results of your model?
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Decision Tree

In [None]:
# Build a regression tree limited to 4 levels; the fixed seed keeps runs reproducible
dt = DecisionTreeRegressor(random_state=42, max_depth=4)

# Fit the tree on the training split
dt.fit(X_train, y_train)

# Predict house values for the held-out test split
y_pred_dt = dt.predict(X_test)

# Print MAE, MSE and R-squared for the tree's predictions
evaluation(y_test, y_pred_dt)

# Draw the fitted tree, save it as a PNG file and display it
plt.figure(dpi=300, figsize=(20, 10))
plot_tree(dt, feature_names=X.columns.tolist(), filled=True)
plt.savefig('decision_tree.png', dpi=300)
plt.show()

## Linear Regression

In [None]:
# Build an ordinary least-squares regression model
lr = LinearRegression()

# Fit the model on the training split
lr.fit(X_train, y_train)

# Predict house values for the held-out test split
y_pred_lr = lr.predict(X_test)

# Print MAE, MSE and R-squared for the linear model's predictions
evaluation(y_test, y_pred_lr)

# Scatter the predicted prices against the actual prices
plt.scatter(y_test, y_pred_lr, alpha=0.5)

# Dashed red reference line where predicted == actual, spanning the range
# of the actual prices; points on this line are perfect predictions.
diagonal = np.linspace(np.min(y_test), np.max(y_test))
plt.plot(diagonal, diagonal, '--r')

plt.xlabel("Actual House Prices")
plt.ylabel("Predicted House Prices")
plt.title("Actual vs. Predicted House Prices")
plt.show()