<a href="https://colab.research.google.com/github/insoucyant/MachineLearning/blob/main/PricePredictiounUsingPyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d camnugent/california-housing-prices
!unzip california-housing-prices.zip

Dataset URL: https://www.kaggle.com/datasets/camnugent/california-housing-prices
License(s): CC0-1.0
Archive:  california-housing-prices.zip
  inflating: housing.csv             


In [2]:
# https://www.kaggle.com/code/realshivarv/simple-housing-price-predictor-using-pytorch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
# What is teh difference between Label Encoder and Onehotencoder?
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Read the unzipped Kaggle CSV into a DataFrame and preview its first five rows.
housing_data = pd.read_csv(filepath_or_buffer='housing.csv')
housing_data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Quick structural overview: dimensions, dtypes / non-null counts, and the
# number of distinct values in every column.
print(housing_data.shape)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
housing_data.info()
for column in housing_data.columns:
  # dropna=False counts NaN as a distinct value, matching len(Series.unique()).
  n_unique = housing_data[column].nunique(dropna=False)
  print(f'Column name is: {column} and it has {n_unique} unique values')

(20640, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
Column name is: longitude and it has 844unique values
Column name is: latitude and it has 862unique values
Column name is: housing_median_age and it has 52unique values
Column name is: total_rooms and it has 5926unique values
Column n

Categorical Variables

In [5]:
# Exploratory preview of the dummy (one-hot) columns produced from the single
# categorical feature. The result is only displayed, not stored.
# A manual alternative would concat these dummy columns onto housing_data and
# drop 'ocean_proximity'; it is not used here because the preprocessing
# pipeline applies OneHotEncoder to this column later on.
housing_data['ocean_proximity'].pipe(pd.get_dummies).head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,False,False,False,True,False
1,False,False,False,True,False
2,False,False,False,True,False
3,False,False,False,True,False
4,False,False,False,True,False


In [6]:
# Column groups used throughout the notebook: continuous inputs, the single
# categorical input, and the regression target.
cont_features = ['housing_median_age', 'total_rooms',
                 'total_bedrooms', 'population', 'households', 'median_income']
cat_features = ['ocean_proximity']
y_feature = ['median_house_value']
X = housing_data[cont_features + cat_features]
y = housing_data[y_feature]

# Split on the raw prices first so the normalizer below can be fitted on the
# training portion only. Fitting it on every row before splitting would leak
# evaluation-set statistics (mean / std of the target) into training.
y_values = y.to_numpy().reshape(-1, 1)
X_train, X_eval, y_train_raw, y_eval_raw = train_test_split(
    X, y_values, test_size=0.2, random_state=0)

# Price Normalizer: standardizes the target using training prices only.
price_normalizer = StandardScaler().fit(y_train_raw)
y_train = price_normalizer.transform(y_train_raw)
y_eval = price_normalizer.transform(y_eval_raw)
# Normalized prices for the full dataset, kept so that any code relying on
# the `y_norm` name keeps working.
y_norm = price_normalizer.transform(y_values)

In [7]:
# Set hyperparameters (one per line so each can be tuned independently).
EPOCHS = 1000
BATCH_SIZE = 16        # mini-batch size handed to the DataLoaders
LEARNING_RATE = 0.01
L1_LAMBDA = 0.01       # presumably the L1 regularization weight - confirm where it is used

# Seed PyTorch's global RNG for reproducibility; the call returns the Generator.
torch.manual_seed(0)

<torch._C.Generator at 0x7bd8f96a2830>

In [8]:
# Build data preprocessing pipeline
#   - categorical columns -> one-hot encoding
#   - continuous columns  -> median imputation, then standardization
# The imputer is required because 'total_bedrooms' contains missing values and
# StandardScaler keeps NaN in its output, which would otherwise end up in the
# tensors built from this pipeline's result.
pytorch_model_preprocessing_pipeline = Pipeline([
    ('transformer', ColumnTransformer([
        # handle_unknown='ignore': a category absent from the training split
        # is encoded as all zeros at transform time instead of raising.
        ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('normalizer', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]), cont_features),
    ],
    remainder='passthrough',
    # sparse_threshold=0 forces a dense ndarray result, which is what
    # torch.tensor(...) needs downstream.
    sparse_threshold=0))
])

In [9]:
# Fit the preprocessing pipeline on the training split, then apply the fitted
# transform unchanged to the evaluation split.
X_train_torch = pytorch_model_preprocessing_pipeline.fit_transform(X_train)
X_eval_torch = pytorch_model_preprocessing_pipeline.transform(X_eval)

# Training split: float32 (features, prices) tensors -> TensorDataset ->
# shuffled PyTorch DataLoader.
train_tensorset = TensorDataset(*(
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train_torch, y_train)
))
train_tensordataset = DataLoader(
    dataset=train_tensorset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation split: same packing, but batches keep their original order.
eval_tensorset = TensorDataset(*(
    torch.tensor(array, dtype=torch.float32)
    for array in (X_eval_torch, y_eval)
))
# NOTE: `eval_tensorset` is rebound here from the TensorDataset to its DataLoader.
eval_tensorset = DataLoader(
    dataset=eval_tensorset, batch_size=BATCH_SIZE, shuffle=False)

# Build the Feed-Forward Neural Network

In [None]:
# Build Neural Net Model Class
class NeuralNet(nn.Module):
  """Fully connected feed-forward regressor.

  Layer widths: input_size -> 16 -> 32 -> 64 -> 32 -> 16 -> 8 -> 1.
  ReLU follows the input layer and every hidden layer; the output layer is
  linear and produces one value per sample.

  Args:
    input_size: Number of features in each input row.
    **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
  """

  def __init__(self, input_size: int, **kwargs):
    super().__init__(**kwargs)
    # Named `inputLayer` to match `forward` (and the camelCase of
    # `hiddenLayers` / `outputLayer`); a differing name here makes every
    # forward pass fail with AttributeError.
    self.inputLayer = nn.Linear(input_size, 16)
    # ModuleDict keeps insertion order, which defines the order in which
    # `forward` applies the hidden layers.
    self.hiddenLayers = nn.ModuleDict({
        'hlayer1': nn.Linear(16, 32),
        'hlayer2': nn.Linear(32, 64),
        'hlayer3': nn.Linear(64, 32),
        'hlayer4': nn.Linear(32, 16),
        'hlayer5': nn.Linear(16, 8)
    })
    self.outputLayer = nn.Linear(8, 1)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Run a forward pass.

    Args:
      x: Tensor whose last dimension equals ``input_size``.

    Returns:
      Tensor with the same leading dimensions as ``x`` and a last dimension of 1.
    """
    x = F.relu(self.inputLayer(x))
    for h_layer in self.hiddenLayers.values():
      x = F.relu(h_layer(x))
    x = self.outputLayer(x)

    return x

  @staticmethod





