In [8]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the housing dataset
housing = fetch_california_housing()

In [9]:

X = pd.DataFrame(housing.data, columns=housing.feature_names) 
y = pd.Series(housing.target, name='med_house_value')

In [10]:
#Part 1
# Display the first five rows of the dataset (5 pts)
print("The first five rows of the dataset are:")
print(X.head())

# Print feature names and check for missing values (5 pts)
print("\nFeature names:")
print(X.columns.tolist())

print("\nMissing values in each column:")
print(X.isnull().sum())

# Generate summary statistics (10 pts)
print("\nSummary Statistics:")
print(X.describe())


The first five rows of the dataset are:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

Feature names:
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

Missing values in each column:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

Summary Statistics:
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  2064

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Split the dataset into training and test sets (80% training, 20% testing) (5 pts)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model on the unscaled data (5 pts)
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set (5 pts)
y_pred = model.predict(X_test)

# Evaluate model performance (15 pts)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error (MSE): {MSE:.4f}")
print(f"Root Mean Squared Error (RMSE): {RMSE:.4f}")
print(f"R² Score: {R2:.4f}")


Model Performance:
Mean Squared Error (MSE): 0.5559
Root Mean Squared Error (RMSE): 0.7456
R² Score: 0.5758




#### What does the R² score tell us about model performance?
The R² score, which is also called the coefficient of determination, allows us insight into how well the model calculates variance in the desired variable. An R² score of 0.5758 means that our model is only able to explain 57.58% of the variance in hosuehold income. While this suggests a moderate fit, there is still a large portion of variance that the model does not capturesuggesting that this dataset might have a non-linear relationship and be dependent on some other factors. Indisutry standard for acceptable models differ by industry, and because resal estate developers acknowledge the volatile anture of the market, this can be on th elow end of a reliable predictor of variance. 

#### Which features seem to have the strongest impact on predictions based on the model’s coefficients?
To determine which features have the strongest impact, one can look at the model's coefficients. To do this I ran the code below to rank the maginitude of importance of each feature. The higher the absolute value, the hgiher the impact on the prediction, which seems to be Average Bedrooms and Median Income. The psitive and negatives indicate if it impacts it positvely or negatively, but the absoltely value determines the magnitude of the impact on predictions. 

#### How well do the predicted values match the actual values?
The RMSE of 0.7456 provides an estimate of the average prediction error in the same scale as the target variable. This means that the predictions produced by this model are $74,560 as the unties are in 100,000.


In [12]:
# Get feature names and their corresponding coefficients
feature_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})

# Sort features by absolute value of coefficient to see strongest impacts
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False)

# Display the sorted feature importance
print(feature_importance[['Feature', 'Coefficient']])

      Feature  Coefficient
3   AveBedrms     0.783145
0      MedInc     0.448675
7   Longitude    -0.433708
6    Latitude    -0.419792
2    AveRooms    -0.123323
1    HouseAge     0.009724
5    AveOccup    -0.003526
4  Population    -0.000002


In [13]:
# Select the three chosen features
selected_features = ['AveBedrms', 'MedInc', 'Longitude']
X_selected = X[selected_features]

# Split into training and test sets (same 80/20 split)
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train a new linear regression model
model_sel = LinearRegression()
model_sel.fit(X_train_sel, y_train_sel)

# Make predictions on the test set
y_pred_sel = model_sel.predict(X_test_sel)

# Compute new performance metrics
MSE_sel = mean_squared_error(y_test_sel, y_pred_sel)
RMSE_sel = np.sqrt(MSE_sel)
R2_sel = r2_score(y_test_sel, y_pred_sel)

print("\nSimplified Model Performance:")
print(f"Mean Squared Error (MSE): {MSE_sel:.4f}")
print(f"Root Mean Squared Error (RMSE): {RMSE_sel:.4f}")
print(f"R² Score: {R2_sel:.4f}")



Simplified Model Performance:
Mean Squared Error (MSE): 0.7071
Root Mean Squared Error (RMSE): 0.8409
R² Score: 0.4604


I chose the three models Avg. Bedrooms, Income, and Longitude because they were shown to have the most influence on the predictions and I (wrongly) predicted that it would have more reliable results when it came to variance. It yielded the R² Score lower than the original, meaning that this model does capture some of the important relationships, at least eh most impactful ones, more of the variables povide a more compelte picture. I would not use the simplified model in practice because I would want a more accurate predictor which more coefficents is evidenced to lend itself to. 

In [14]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler and apply it to the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Split the scaled data
X_train_scaled, X_test_scaled, _, _ = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model on scaled data
lin_reg_scaled = LinearRegression()
lin_reg_scaled.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_scaled = lin_reg_scaled.predict(X_test_scaled)

# Evaluate model performance
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)
rmse_scaled = RMSE_sel(y_test, y_pred_sel)

print("\nScaled Data Model:")
print(f"Mean Squared Error: {mse_scaled:.2f}")
print(f"Root Mean Squared Error: {rmse_scaled:.2f}")
print(f"R² Score: {r2_scaled:.2f}")
print("Model Coefficients (Scaled):")
print(pd.Series(lin_reg_scaled.coef_, index=X.columns))

TypeError: 'numpy.float64' object is not callable