In [1]:
import numpy as np
import pandas as pd
import os

from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms

from sklearn.linear_model import LinearRegression

In [2]:
# Load the training table and the precomputed CNN feature matrix from disk.
# NOTE(review): rows of cnn_features.npy are assumed to be in the same order as
# the rows of train_with_images.csv — nothing in this cell checks that; confirm.
train_df = pd.read_csv("../data/train_with_images.csv")
X_cnn_train = np.load("../data/cnn_features.npy")

# Tabular predictors and the regression target.
tab_columns_train = ["bedrooms", "bathrooms", "sqft_living", "lat", "long"]
X_tab_train = train_df[tab_columns_train].to_numpy()
y_train = train_df["price"].to_numpy()

# Place the tabular columns first, followed by the CNN feature columns.
X_train_combined = np.hstack((X_tab_train, X_cnn_train))

(X_train_combined.shape, y_train.shape)

((21, 2053), (21,))

In [3]:
# Ordinary least-squares fit on the stacked tabular + CNN feature matrix.
# NOTE(review): the shape recorded for X_train_combined in this notebook is
# (21, 2053), i.e. far more columns than rows, so this fit is underdetermined.
# Consider a regularised model and/or more training rows before trusting the
# predictions.
model = LinearRegression()
model.fit(X=X_train_combined, y=y_train)

In [4]:
# Read the raw test sheet; the trailing expression shows its (rows, columns).
test_path = "../data/test2.xlsx"
test_df = pd.read_excel(test_path)
test_df.shape

(5404, 20)

In [5]:
# Build the feature extractor: take the pretrained ResNet-50, keep every child
# module except the last one, and chain the remaining modules back together.
pretrained = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
feature_layers = list(pretrained.children())[:-1]
resnet = torch.nn.Sequential(*feature_layers)
# Switch to evaluation mode before extracting features.
resnet.eval()

Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [6]:
# Per-channel statistics handed to Normalize (the values commonly used with
# ImageNet-pretrained torchvision models).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing applied to every image: resize to 224x224, convert to a tensor,
# then normalise each channel.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [7]:
# Collect the ids that have a satellite image on disk.
# os.path.splitext strips only the trailing extension (str.replace would remove
# every ".png" substring in the name), and the suffix check keeps non-PNG
# directory entries such as hidden files out of the id set. The check is
# case-sensitive on purpose: the image loader later builds "<id>.png" paths.
image_dir = "../data/test_satellite_images"
existing_images = {
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(image_dir)
    if file_name.endswith(".png")
}

# Keep only the test rows whose id has a matching image. Ids are compared as
# strings because the file stems are strings.
# NOTE(review): per the recorded outputs this shrinks the test set from 5404 to
# 983 rows, so every row without an image gets no prediction downstream —
# confirm that is acceptable for the final submission.
has_image = test_df["id"].astype(str).isin(existing_images)
test_df = test_df[has_image].reset_index(drop=True)

test_df.shape

(983, 20)

In [8]:
# Embed every test image with the headless ResNet, one image per forward pass,
# in the same order as the rows of test_df.
features = []

for img_id in test_df["id"]:
    path = f"../data/test_satellite_images/{img_id}.png"
    # Open inside a context manager so the file handle is released
    # deterministically; convert() returns a converted copy, which stays usable
    # after the underlying file is closed.
    with Image.open(path) as raw_img:
        img = raw_img.convert("RGB")
    # unsqueeze(0) adds the leading batch axis the network expects.
    img_t = transform(img).unsqueeze(0)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        # squeeze() drops the size-1 axes, leaving a 1-D feature vector.
        feat = resnet(img_t).squeeze().numpy()

    features.append(feat)

# Stack the per-image vectors into one (n_images, n_features) matrix.
X_cnn_test = np.array(features)
X_cnn_test.shape

(983, 2048)

In [9]:
# Same tabular columns, in the same order, as used for the training matrix;
# tabular columns come first and the CNN features follow.
tab_columns_test = ["bedrooms", "bathrooms", "sqft_living", "lat", "long"]
X_tab_test = test_df[tab_columns_test].to_numpy()

X_test_combined = np.hstack((X_tab_test, X_cnn_test))
X_test_combined.shape

(983, 2053)

In [10]:
# Score the combined test matrix and attach the predictions to the test frame,
# then preview the two columns that make up the submission.
predicted = model.predict(X_test_combined)
test_df["predicted_price"] = predicted
test_df.loc[:, ["id", "predicted_price"]].head()

Unnamed: 0,id,predicted_price
0,2591820310,317648.9
1,7974200820,611448.6
2,7701450110,709483.7
3,9522300010,1139926.0
4,9510861140,662731.6


In [11]:
# Floor the predictions at zero so no negative price is written out.
non_negative_prices = test_df["predicted_price"].clip(lower=0)
test_df["predicted_price"] = non_negative_prices

In [12]:
# Persist only the id and prediction columns, without the DataFrame index.
# NOTE(review): "enrollno" in the file name looks like a placeholder — confirm
# the intended output name.
submission = test_df[["id", "predicted_price"]]
submission.to_csv("../data/enrollno_final.csv", index=False)

In [13]:
# Read the submission back from disk to confirm what was actually written.
# pandas is already imported as `pd` in the notebook's import cell, so the
# duplicate mid-notebook import has been dropped.
df = pd.read_csv("../data/enrollno_final.csv")
df.head()

Unnamed: 0,id,predicted_price
0,2591820310,317648.9
1,7974200820,611448.6
2,7701450110,709483.7
3,9522300010,1139926.0
4,9510861140,662731.6


In [14]:
# Summary statistics of the reloaded submission. The numeric `id` column is
# summarised as well, but only the predicted_price statistics are informative.
df.describe()

Unnamed: 0,id,predicted_price
count,983.0,983.0
mean,4524858000.0,455241.9
std,2873154000.0,262156.5
min,16000020.0,0.0
25%,2125554000.0,266741.0
50%,3856901000.0,427066.5
75%,7227651000.0,600648.9
max,9839301000.0,1637295.0
