In [108]:
import torch
import torch.nn as nn
from torch.optim import Adam, RMSprop
from torch.utils.data import TensorDataset, DataLoader

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [135]:
# Load the training and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [137]:
from sklearn.preprocessing import LabelEncoder

In [138]:
# Single encoder instance; it is re-fitted for each categorical column it is applied to.
le = LabelEncoder()

In [139]:
# Integer-encode propertyType: fit the mapping on train, then reuse it on test.
# LabelEncoder.transform raises if test contains a label that was not seen in train.
train['propertyType'] = le.fit_transform(train['propertyType'])
test['propertyType'] = le.transform(test['propertyType'])

In [140]:
def convert_SN(x) :
    """Return the canonical spelling of a suburb name.

    'West Delhi' maps to 'Delhi West' and 'North Delhi' maps to
    'Delhi North'; any other value is handed back unchanged.
    """
    renames = (
        ('West Delhi', 'Delhi West'),
        ('North Delhi', 'Delhi North'),
    )
    for old_name, new_name in renames :
        if x == old_name :
            return new_name
    return x

In [141]:
# Unify the reversed Delhi suburb spellings in both splits.
train['suburbName'] = train['suburbName'].apply(convert_SN)
test['suburbName'] = test['suburbName'].apply(convert_SN)

In [142]:
# Integer-encode suburbName. Re-fitting `le` here replaces whatever mapping it held before.
train['suburbName'] = le.fit_transform(train['suburbName'])
test['suburbName'] = le.transform(test['suburbName'])

In [143]:
# Every train column whose name contains the substring 'distance'.
dist_cols = train.columns[train.columns.str.contains('distance')]

In [144]:
# Row-wise skewness across the distance columns, added as a feature to both splits.
train['dist_skew'] = train[dist_cols].skew(axis = 1)
test['dist_skew'] = test[dist_cols].skew(axis = 1)

In [145]:
from sklearn.cluster import KMeans

In [146]:
# 8-cluster KMeans with a fixed seed.
km = KMeans(n_clusters = 8, random_state = 42)

In [147]:
# Cluster id over the distance columns: fitted on train, then assigned to test rows.
train['dist_cls'] = km.fit_predict(train[dist_cols])
test['dist_cls'] = km.predict(test[dist_cols])

In [148]:
# Rebind `km` to a fresh 6-cluster KMeans with a fixed seed.
km = KMeans(n_clusters = 6, random_state = 42)

In [149]:
# Property-level columns (type, bedroom count, area) grouped under one name.
infra_cols = ['propertyType', 'bedrooms', 'area(square_meters)']

In [150]:
# Cluster id over the property-level columns: fitted on train, then assigned to test rows.
train['infra_cls'] = km.fit_predict(train[infra_cols])
test['infra_cls'] = km.predict(test[infra_cols])

In [151]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

In [152]:
# 10-fold shuffled splitter with a fixed seed, so fold membership is reproducible.
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [319]:
from sklearn.preprocessing import StandardScaler

In [320]:
# Scaler used to standardise the feature columns.
ss = StandardScaler()

In [321]:
# Feature frame: every train column except 'ID' and the rent column.
X = train.drop(['ID', 'monthlyRent(us_dollar)'], axis = 1)

In [322]:
# Fit the scaler on the training features and standardise them in one step.
scaled_X = ss.fit_transform(X)

In [323]:
# Model inputs as float32 tensors.
# NOTE: X is rebound here to a tensor, so it no longer carries column names.
X = torch.tensor(scaled_X, dtype = torch.float)
# Go through a NumPy array so the conversion does not depend on how the
# Series index is laid out and torch receives one contiguous buffer.
y = torch.tensor(train['monthlyRent(us_dollar)'].to_numpy(), dtype = torch.float)

In [325]:
# Pick the test features by name, in the same order as the training features
# (all train columns except 'ID' and the rent column), rather than by position,
# so a different column order in test.csv cannot silently misalign the scaler.
feature_cols = train.columns.drop(['ID', 'monthlyRent(us_dollar)'])
target = ss.transform(test[feature_cols])

In [326]:
# Rebind `target` to a float32 tensor holding the same values.
target = torch.tensor(target, dtype = torch.float)

In [327]:
from torch.utils.data import TensorDataset, DataLoader

In [354]:
class Net(nn.Module) :
    """Fully connected regressor: in_features -> 128 -> 32 -> 1.

    Both hidden Linear layers are followed by BatchNorm1d and LeakyReLU; the
    last Linear layer emits one value per input row.

    Parameters
    ----------
    in_features : int, default 13
        Number of input columns. The default reproduces the previously
        hard-coded architecture, so ``Net()`` behaves as before.
    """

    def __init__(self, in_features = 13) :
        super(Net, self).__init__()
        self.layer_1 = nn.Linear(in_features, 128)
        self.bn_1 = nn.BatchNorm1d(128)
        self.relu_1 = nn.LeakyReLU()

        # Attribute names skip "2" on purpose: they are kept unchanged so
        # existing state_dict keys stay valid.
        self.layer_3 = nn.Linear(128, 32)
        self.bn_3 = nn.BatchNorm1d(32)
        self.relu_3 = nn.LeakyReLU()

        self.layer_4 = nn.Linear(32, 1)

    def forward(self, x) :
        """Map a (batch, in_features) float tensor to a (batch, 1) tensor.

        In training mode BatchNorm1d needs more than one row per batch.
        """
        x = self.relu_1(self.bn_1(self.layer_1(x)))
        x = self.relu_3(self.bn_3(self.layer_3(x)))
        return self.layer_4(x)

In [355]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [356]:
# L1 (absolute error) loss with its default reduction.
criterion = nn.L1Loss()

In [357]:
import torch.optim as optim
from tqdm import tqdm_notebook

In [358]:
%%time

# K-fold training of Net with early stopping.
# For every fold a fresh model is trained for up to 50 epochs. Whenever the
# validation MAE improves, the test set is re-predicted with the current
# weights; the best epoch's test predictions (already divided by n_splits) are
# summed over folds into nn_pred, giving the fold-averaged prediction.

nn_pred = np.zeros((len(test), ))
final_mae = 0

# Number of consecutive non-improving epochs after which a fold stops.
patience_limit = 7

for i, (tr_idx, val_idx) in enumerate(tqdm_notebook(kf.split(X, y))) :

    # Seed per fold so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(42 + i)

    tr_x, tr_y = X[tr_idx], y[tr_idx]
    val_x, val_y = X[val_idx], y[val_idx]

    # drop_last on the training loader keeps a trailing 1-row batch away from
    # BatchNorm1d, which cannot normalise a single sample in training mode.
    tr_loader = DataLoader(TensorDataset(tr_x, tr_y), shuffle = True, drop_last = True, batch_size = 32)
    val_loader = DataLoader(TensorDataset(val_x, val_y), shuffle = False, drop_last = False, batch_size = 32)
    # The test loader only carries features; there is no target to pair with.
    te_loader = DataLoader(TensorDataset(target), shuffle = False, drop_last = False, batch_size = 32)

    model = Net().to(device)
    adam = optim.Adam(model.parameters(), lr = 0.003)

    # Start from +inf so the first epoch always counts as an improvement and
    # fold_pred is always produced by the current fold's model.
    score_standard = float('inf')
    stale_epochs = 0
    fold_pred = None

    print(f"Fold {i + 1}...!")

    for epoch in tqdm_notebook(range(50)) :

        model.train()

        for xx, yy in tr_loader :

            xx, yy = xx.to(device), yy.to(device)
            adam.zero_grad()
            # squeeze(-1) removes only the output-feature axis, so the result
            # stays 1-D even if a batch holds a single row.
            pred = model(xx).squeeze(-1)

            loss = criterion(pred, yy)
            loss.backward()
            adam.step()

        model.eval()

        with torch.no_grad() :

            predictions = []
            actuals = []

            for xx, yy in val_loader :

                pred = model(xx.to(device)).squeeze(-1)
                predictions += pred.tolist()
                actuals += yy.tolist()

            score = mean_absolute_error(actuals, predictions)

            if score < score_standard :

                score_standard = score
                stale_epochs = 0

                # Snapshot this epoch's test predictions, pre-divided by the
                # fold count so that summing over folds yields the mean.
                fold_pred = []

                for (xx, ) in te_loader :

                    pred = model(xx.to(device)).squeeze(-1) / kf.n_splits
                    fold_pred += pred.tolist()
            else :
                stale_epochs += 1

        if stale_epochs >= patience_limit :

            print(f'Early Stopping...')
            break

    if fold_pred is None :
        raise RuntimeError(f"Fold {i + 1} produced no test predictions")

    print(f"{i + 1} Fold Best Val MAE : {round(score_standard, 4)}\n")
    final_mae += score_standard / kf.n_splits
    nn_pred += np.asarray(fold_pred)
print(f"\nNN AVG of MAE : {final_mae}")

0it [00:00, ?it/s]

Fold 1...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
1 Fold Best Val MAE : 64.8775

Fold 2...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
2 Fold Best Val MAE : 60.8053

Fold 3...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
3 Fold Best Val MAE : 61.2306

Fold 4...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
4 Fold Best Val MAE : 55.2893

Fold 5...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
5 Fold Best Val MAE : 57.9369

Fold 6...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
6 Fold Best Val MAE : 63.817

Fold 7...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
7 Fold Best Val MAE : 64.6542

Fold 8...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
8 Fold Best Val MAE : 62.4968

Fold 9...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
9 Fold Best Val MAE : 60.747

Fold 10...!


  0%|          | 0/50 [00:00<?, ?it/s]

Early Stopping...
10 Fold Best Val MAE : 62.8633


NN AVG of MAE : 61.47180096809251
CPU times: user 1min 3s, sys: 8 s, total: 1min 11s
Wall time: 1min 8s


In [294]:
# Load the submission template from the working directory.
submission = pd.read_csv('sample_submission.csv')

In [334]:
# Round each value in nn_pred to an integer and store it in the rent column.
submission['monthlyRent(us_dollar)'] = [int(round(v, 0)) for v in nn_pred]

In [336]:
# Write the submission to nn.csv without the DataFrame index.
submission.to_csv('nn.csv', index = False)