# California Housing Prices

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Directory holding the course data sets. Set the PANDAS_DATA_PATH environment
# variable to point the notebook at another location; when it is unset the
# original local path is used, so existing setups keep working.
pandas_data_path = Path(
    os.environ.get(
        "PANDAS_DATA_PATH",
        r"C:\Users\tc\Programming\Python\Courses\Own\python-programmierer\Data\Pandas",
    )
)
california_housing_csv_path = pandas_data_path / "california-housing.csv"

## Loading the data set

Load the CSV file `california_housing_csv_path` as a
pandas DataFrame. Do not import any columns that carry no information.

In [None]:
# index_col=0 turns the first CSV column into the index instead of a data
# column (presumably it is just a saved row counter — verify against the file).
df = pd.read_csv(california_housing_csv_path, index_col=0)

In [None]:
# Display the loaded DataFrame.
df

How many rows does the DataFrame have?

In [None]:
# Number of rows in the DataFrame.
len(df)

Which columns does the DataFrame have?
Are there columns in the DataFrame that contain undefined values (NA)?

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Count the NA entries per column; any non-zero count marks a column that
# contains undefined values.
df.isna().sum()

What are the minimum, maximum, mean and standard deviation of each column?

In [None]:
# Per-column summary statistics of the DataFrame.
df.describe()

Generate histograms of the individual columns.
Are there any features that stick out?

In [None]:
# Histograms of the columns, 30 bins each, drawn on a 12x8 figure.
df.hist(bins=30, figsize=(12, 8))

Create a new DataFrame containing only the rows whose `Target`
value is greater than 4. How many entries does this DataFrame have?

In [None]:
# Keep only the rows whose Target value exceeds 4.
df_expensive = df.loc[df["Target"].gt(4)]

In [None]:
# Display the filtered DataFrame.
df_expensive

Plot longitude vs. latitude as a scatterplot. What can you infer from this
plot?

The `c` keyword argument allows you to specify a column name whose
values determine the color of each point. With `cmap` you can select the colormap
used to map these values to colors. With the keyword argument `alpha` you can set the transparency of the points.

Experiment with these values to make the output more informative.

In [None]:
# Geographic scatter plot of all rows: each point is coloured by its Target
# value using the "hot" colormap and drawn partially transparent.
df.plot.scatter(
    x="Longitude",
    y="Latitude",
    c="Target",
    cmap="hot",
    alpha=0.4,
    figsize=(10, 8),
)

What does the corresponding output look like for the DataFrame that only
contains expensive houses?

In [None]:
# Same geographic scatter plot, restricted to the rows in df_expensive.
df_expensive.plot.scatter(
    x="Longitude",
    y="Latitude",
    c="Target",
    cmap="hot",
    alpha=0.4,
    figsize=(10, 8),
)

If you have installed seaborn you can use `seaborn.pairplot()` to
create a grid of scatterplots with all possible combinations of columns.
With the keyword argument `hue` you can specify a column whose values
determine the color of the points.

Which of the plots provide interesting information? Which columns might be interesting choices for `hue`?

*Note:* It is advisable to decrease the number of rows being plotted to e.g. 500 or 1000 to limit the time it takes to generate the plots.

In [None]:
import seaborn as sns

# Disabled by default — presumably because rendering the grid takes a while;
# uncomment to draw the pair plot for the first 500 rows.
# sns.pairplot(df.iloc[:500])

In [None]:
# sns.pairplot(df.iloc[:500], hue="Target")

In [None]:
# sns.pairplot(df.iloc[:500], hue="MedInc")

In [None]:
# sns.pairplot(df_expensive)

In [None]:
# Column labels again, as a reference for selecting the model inputs.
df.columns

In [None]:
# Show NumPy floats with two digits of precision in printed arrays.
np.set_printoptions(precision=2)

In [None]:
# Columns used as model inputs; "Target" is the value the models predict.
feature_columns = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "Latitude",
    "Longitude",
]
x = df.loc[:, feature_columns].to_numpy()
y = df.loc[:, "Target"].to_numpy()
x.shape, y.shape

In [None]:
# The first NUM_TRAINING_SAMPLES rows are used for training; the rest for testing.
NUM_TRAINING_SAMPLES = 15_000

In [None]:
# Positional split: the first NUM_TRAINING_SAMPLES rows train the models and
# the remaining rows are held out for evaluation.
# NOTE(review): rows are not shuffled, so any ordering present in the CSV
# carries over into the split — confirm this is intended.
train_rows = slice(None, NUM_TRAINING_SAMPLES)
test_rows = slice(NUM_TRAINING_SAMPLES, None)
x_train, y_train = x[train_rows], y[train_rows]
x_test, y_test = x[test_rows], y[test_rows]
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Fit a linear regression on the training rows; the trailing expression
# shows the fitted estimator as the cell output.
lr_model = LinearRegression().fit(x_train, y_train)
lr_model

In [None]:
# Predict the targets of the held-out test rows with the linear model.
lr_pred = lr_model.predict(x_test)

In [None]:
def print_score(y_true, y_pred):
    """Print summary error metrics for a set of regression predictions.

    Prints, one per line: the mean of ``y_true``, the mean absolute error
    relative to that mean (as a percentage), the mean absolute error, and
    the mean squared error.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mean_true = np.mean(y_true)
    # The relative error is undefined when the targets average to zero.
    relative_mae = mae / mean_true if mean_true != 0 else float("nan")
    print(f"Mean: {mean_true:.3f}")
    # Format as an actual percentage so the value matches its "MAE%" label.
    print(f"MAE%: {relative_mae:.1%}")
    print(f"MAE:  {mae:.3f}")
    print(f"MSE:  {mse:.3f}")

In [None]:
# Report the error metrics of the linear model on the test rows.
print_score(y_test, lr_pred)

In [None]:
# Fix the seed so the fitted tree (and therefore its score) is reproducible
# across kernel restarts.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(x_train, y_train)

In [None]:
# Predict the targets of the held-out test rows with the decision tree.
dt_pred = dt_model.predict(x_test)

In [None]:
# Report the error metrics of the decision tree on the test rows.
print_score(y_test, dt_pred)

In [None]:
# n_jobs=-1 uses all available cores instead of a machine-specific count;
# the fixed seed makes the fitted forest reproducible across runs.
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42)
rf_model.fit(x_train, y_train)

In [None]:
# Predict the targets of the held-out test rows with the random forest.
rf_pred = rf_model.predict(x_test)

In [None]:
# Report the error metrics of the random forest on the test rows.
print_score(y_test, rf_pred)

In [None]:
# Peek at the first three training rows.
x_train[:3]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Derive the scaled arrays from the untouched feature matrix `x` rather than
# from `x_train`/`x_test` themselves, so re-running this cell never refits
# `scaler` on already-standardized data. The scaler is fit on the training
# rows only and then applied to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x[:NUM_TRAINING_SAMPLES])
x_test = scaler.transform(x[NUM_TRAINING_SAMPLES:])

In [None]:
# Peek at the first three training rows after scaling.
x_train[:3]

In [None]:
# Refit the linear regression on the current (scaled) training rows; the
# trailing expression shows the fitted estimator as the cell output.
lr_model = LinearRegression().fit(x_train, y_train)
lr_model

In [None]:
# Predict the test targets with the refitted linear model.
lr_pred = lr_model.predict(x_test)

In [None]:
# Report the error metrics of the refitted linear model.
print_score(y_test, lr_pred)

In [None]:
# Fix the seed so the fitted tree (and therefore its score) is reproducible
# across kernel restarts.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(x_train, y_train)

In [None]:
# Predict the test targets with the refitted decision tree.
dt_pred = dt_model.predict(x_test)

In [None]:
# Report the error metrics of the refitted decision tree.
print_score(y_test, dt_pred)

In [None]:
# n_jobs=-1 uses all available cores instead of a machine-specific count;
# the fixed seed makes the fitted forest reproducible across runs.
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42)
rf_model.fit(x_train, y_train)

In [None]:
# Predict the test targets with the refitted random forest.
rf_pred = rf_model.predict(x_test)

In [None]:
# Report the error metrics of the refitted random forest.
print_score(y_test, rf_pred)