<a href="https://colab.research.google.com/github/hikmatfarhat-ndu/pytorch/blob/main/custom_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import pandas as pd
import numpy as np

In [2]:
!git clone https://github.com/emanhamed/Houses-dataset

Cloning into 'Houses-dataset'...
remote: Enumerating objects: 2166, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 2166 (delta 0), reused 0 (delta 0), pack-reused 2165[K
Receiving objects: 100% (2166/2166), 176.26 MiB | 27.71 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [39]:
df=pd.read_csv("Houses-dataset/Houses Dataset/HousesInfo.txt",header=None,delim_whitespace=True,
               names=["bedrooms","bathrooms","size","zipcode","price"])

In [40]:
type(df)

pandas.core.frame.DataFrame

In [41]:
def cleanData(df):
    # compute the number of entries per zipcode
    zipcodes=df['zipcode'].value_counts().keys().tolist()
    counts=df['zipcode'].value_counts().tolist()
    #discard all zipcodes ocurring less than 20 times
    for count,zipcode in zip(counts,zipcodes):
      if count<20:
        idx=df[df['zipcode']==zipcode].index
        df.drop(idx,inplace=True)
    return df

In [42]:
dataset=cleanData(df)

In [43]:
type(dataset)

pandas.core.frame.DataFrame

In [44]:
dataset['price'].max()

5858000

In [45]:
import shutil
from_prefix="Houses-dataset/Houses Dataset/"
to_prefix="images/"
suffix="_frontal.jpg"
!mkdir -p images
for newidx,oldidx in enumerate(df.index.tolist()):
  oldname=from_prefix+str(oldidx)+suffix
  newname=to_prefix+str(newidx)+suffix
  #print("moving from %s to %s"%(oldname,newname))
  shutil.copy(oldname,newname)

In [46]:
dataset.to_csv("cleansedDataset.csv",index=False)

In [47]:
dataset['price']

30     789000
32     365000
39     455000
80     599000
81     529800
        ...  
530    399900
531    460000
532    407000
533    419000
534    615000
Name: price, Length: 384, dtype: int64

In [53]:
class CustomDataset(Dataset):
  def __init__(self,csvFile,imgDir):
    dataset=pd.read_csv(csvFile)
    dummy=pd.get_dummies(dataset['zipcode'])
    price=dataset['price']
    self.max_price=price.max()
    df=dataset.drop(['price','zipcode'],axis=1)
    self.data=pd.concat([df,dummy,price],axis=1)
  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    #
    return self.data.iloc[idx,:-1].to_numpy(dtype=np.float32),np.float32(self.data.iloc[idx,-1]/self.max_price)

In [54]:
dataset=CustomDataset("cleansedDataset.csv","images")

In [55]:
dataset.data

Unnamed: 0,bedrooms,bathrooms,size,91901,92276,92677,92880,93446,93510,94501,94531,price
0,5,3.0,2520,0,0,0,0,1,0,0,0,789000
1,3,2.0,1802,0,0,0,0,1,0,0,0,365000
2,3,3.0,2146,0,0,0,0,1,0,0,0,455000
3,4,2.5,2464,1,0,0,0,0,0,0,0,599000
4,2,2.0,1845,1,0,0,0,0,0,0,0,529800
...,...,...,...,...,...,...,...,...,...,...,...,...
379,5,2.0,2066,0,0,0,0,0,0,0,1,399900
380,4,3.5,9536,0,0,0,0,0,0,0,1,460000
381,3,2.0,2014,0,0,0,0,0,0,0,1,407000
382,4,3.0,2312,0,0,0,0,0,0,0,1,419000


In [56]:
train_loader=DataLoader(dataset,batch_size=32,shuffle=True)

In [57]:
itr=iter(train_loader)

In [58]:
x=next(itr)

In [59]:
len(x)

2

In [60]:
x[1].size()

torch.Size([32])

In [65]:
class Net(nn.Module):
  def __init__(self):
    super().__init__()
    self.relu=nn.ReLU()
    self.fc1=nn.Linear(in_features=11,out_features=32)
    self.fc2=nn.Linear(in_features=32,out_features=16)
    self.fc3=nn.Linear(in_features=16,out_features=1)
  def forward(self,x):
    x=self.fc1(x)
    x=self.relu(x)
    x=self.fc2(x)
    x=self.relu(x)
    x=self.fc3(x)
    return x

In [66]:
model=Net()
x[0].size()

torch.Size([32, 11])

In [67]:
from torch.optim import SGD
from torch.nn import MSELoss


In [70]:
#TODO the loss starts off HUGE why?
model=Net()
optimizer=SGD(model.parameters(),lr=0.001,momentum=0.9)
loss_fn=MSELoss()
epochs=100
for epoch in range(epochs):
  for input,price in train_loader:
    output=model(input)
    optimizer.zero_grad()
    loss=loss_fn(output.squeeze(),price)
    loss.backward()
    optimizer.step()
    
  print(loss)
  



tensor(9844.6699, grad_fn=<MseLossBackward0>)
tensor(26752.8574, grad_fn=<MseLossBackward0>)
tensor(23782.1836, grad_fn=<MseLossBackward0>)
tensor(15797.3066, grad_fn=<MseLossBackward0>)
tensor(9314.6416, grad_fn=<MseLossBackward0>)
tensor(5190.4106, grad_fn=<MseLossBackward0>)
tensor(2813.0498, grad_fn=<MseLossBackward0>)
tensor(1502.1604, grad_fn=<MseLossBackward0>)
tensor(794.1629, grad_fn=<MseLossBackward0>)
tensor(419.7673, grad_fn=<MseLossBackward0>)
tensor(219.6024, grad_fn=<MseLossBackward0>)
tensor(116.1132, grad_fn=<MseLossBackward0>)
tensor(60.9112, grad_fn=<MseLossBackward0>)
tensor(32.3119, grad_fn=<MseLossBackward0>)
tensor(16.8330, grad_fn=<MseLossBackward0>)
tensor(8.8459, grad_fn=<MseLossBackward0>)
tensor(4.6676, grad_fn=<MseLossBackward0>)
tensor(2.4499, grad_fn=<MseLossBackward0>)
tensor(1.2700, grad_fn=<MseLossBackward0>)
tensor(0.6529, grad_fn=<MseLossBackward0>)
tensor(0.3650, grad_fn=<MseLossBackward0>)
tensor(0.1974, grad_fn=<MseLossBackward0>)
tensor(0.1024, g