In [76]:
# import pytorch libraries
%matplotlib inline
import numpy as np
import pickle
import pandas as pd
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
import spacy
import matplotlib.pyplot as plt


In [77]:
pickle_file_path = 'project-group-6/train.pickle'

In [78]:
with open(pickle_file_path, 'rb') as file:
    # Load the data from the file
    training_data = pickle.load(file)
training_data = pd.DataFrame(training_data)
training_data.head(5)

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,320.000 €,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...
1,335.000 €,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,330.000 €,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de..."
3,435.000 €,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,410.000 €,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin..."


In [79]:
test_pickle_file_path = 'project-group-6/test_kaggle.pickle'

with open(test_pickle_file_path, 'rb') as file:
    test_data = pickle.load(file)
test_data = pd.DataFrame(test_data)
# test_data['price'] = "0.00 €"
test_data.insert(0, 'price', "0.00 €")
test_data = test_data.drop(columns=['id', 'description'])
test_data.head(5)

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,0.00 €,Piso Carrer de llull. Piso con 4 habitaciones ...,Barcelona - El Parc i la Llacuna del Poblenou,,"[87 m2, 4 hab., 1 baño]",FLAT,FLAT,SECOND_HAND,Contactar con Camila 7. 3.\n\nLa Casa Agency E...
1,0.00 €,Piso Diagonal. Luminoso piso de 4 habitaciones...,Barcelona - Poblenou,,"[78 m2, 4 hab., 1 baño]",FLAT,FLAT,SECOND_HAND,¡Un gran piso a reformar es una gran oportunid...
2,0.00 €,Piso Carrer del consell de cent. Piso amueblad...,Barcelona - L´Antiga Esquerra de l´Eixample,,"[65 m2, 1 hab., 1 baño]",FLAT,FLAT,SECOND_HAND,"AUREA INMOBILIARIA PRESENTA, ACOGEDOR APARTAME..."
3,0.00 €,Piso Castanys. Carrer castanys,Barcelona - Poblenou,,"[88 m2, 3 hab., 1 baño]",FLAT,FLAT,SECOND_HAND,"Piso en pleno centro de Poblenou, techos altos..."
4,0.00 €,Piso Carrer de casanova. Piso con 2 habitacion...,Barcelona - Sant Antoni,,"[82 m2, 2 hab., 1 baño]",FLAT,FLAT,SECOND_HAND,Punt Zona Franca presenta esta fantástica vivi...


In [80]:
training_data = pd.concat([training_data, test_data], ignore_index=True)

In [81]:
training_data['type'].unique()

array(['FLAT', 'STUDIO', 'GROUND_FLOOR', 'PENTHOUSE', 'APARTMENT', 'LOFT',
       'DUPLEX'], dtype=object)

In [82]:
training_data['selltype'].unique()

array(['SECOND_HAND'], dtype=object)

In [83]:
dummies = pd.get_dummies(training_data['type'])
training_data
training_data_with_dummies = pd.concat([training_data, dummies], axis=1)
training_data_with_dummies.drop('type', axis=1, inplace=True)

In [84]:
row = training_data_with_dummies['features'][0]

In [85]:
print(row)

['85 m2', '2 hab.', '1 baño', '3.647 €/m2']


In [86]:
training_data.head(5)

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,320.000 €,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...
1,335.000 €,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,330.000 €,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de..."
3,435.000 €,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,410.000 €,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin..."


In [87]:

def clean_features(lst):
    # Initialize all variables with NaN to handle missing values
    square_m = hab = bano = np.nan
    
    # Process each element in the list if it exists
    if len(lst) > 0 and lst[0]:
        square_m, _ = lst[0].split(' ')
        square_m = float(square_m)
    
    if len(lst) > 1 and lst[1]:
        hab, _ = lst[1].split(' ')
        hab = float(hab)
    
    if len(lst) > 2 and lst[2]:
        # Check if 'bano' information is available or it's the price per square meter
        if 'baño' in lst[2]:
            bano, _ = lst[2].split(' ')
            bano = float(bano)
        # else:
        #     # If 'baño' is not in the string, it might be the price info in the 'bano' slot
        #     price_per_s_m, _ = lst[2].split(' ')
        #     price_per_s_m = float(price_per_s_m)
    
    # if len(lst) > 3 and lst[3]:
    #     price_per_s_m, _ = lst[3].split(' ')
    #     price_per_s_m = float(price_per_s_m)
    
    return square_m, hab, bano


In [88]:
def fix_euros(element):
    price, _ = element.split(' ')
    return float(price)

In [89]:
square_m, hab, bano = clean_features(row)

In [90]:
square_m_list , hab_list, bano_list, price_list= [], [], [], []
for index, row in enumerate(training_data_with_dummies['features']):
    square_m , hab, bano =clean_features(row)
    square_m_list.append(square_m)
    hab_list.append(hab)
    bano_list.append(bano)
    # price_per_s_m_list.append(price_per_s_m)
for row in training_data_with_dummies['price']:
    price = fix_euros(row)
    price_list.append(price)

In [91]:
training_data_with_dummies['price'] = price_list
training_data_with_dummies['square_m'] = square_m_list
training_data_with_dummies['hab'] = hab_list
training_data_with_dummies['bano'] = bano_list
# training_data_with_dummies['price_per_sm'] = price_per_s_m_list
training_data_with_dummies.drop('features', axis=1, inplace=True)
training_data_with_dummies.drop('selltype', axis=1, inplace=True)

training_data_with_dummies.head()

Unnamed: 0,price,title,loc_string,loc,subtype,desc,APARTMENT,DUPLEX,FLAT,GROUND_FLOOR,LOFT,PENTHOUSE,STUDIO,square_m,hab,bano
0,320.0,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,FLAT,Piso en última planta a reformar en calle Tall...,0,0,1,0,0,0,0,85.0,2.0,1.0
1,335.0,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,FLAT,"Ubicado en la zona del Camp de l’Arpa, cerca d...",0,0,1,0,0,0,0,65.0,2.0,1.0
2,330.0,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,FLAT,"En pleno centro de Barcelona, justo al lado de...",0,0,1,0,0,0,0,77.0,2.0,1.0
3,435.0,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,FLAT,"Vivienda espaciosa en Sant Antoni, cerca de Pl...",0,0,1,0,0,0,0,96.0,3.0,2.0
4,410.0,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,FLAT,"En el corazón de Barcelona, en una hermosa fin...",0,0,1,0,0,0,0,84.0,2.0,1.0


In [92]:
# def citys_neighbhoods(text):
#     sliced = text.split(' - ') 
#     city = [sliced[0]]
#     neighborhoods = sliced[1]
#     return city, neighborhoods

In [93]:
df = pd.DataFrame()

In [94]:
# city_list, neighborhoods_list = [], []
# for i , row in enumerate(training_data_with_dummies['loc_string']):
#     city, neighborhoods, = citys_neighbhoods(row)
#     city_list.append(city)
#     neighborhoods_list.append(neighborhoods)
# df['city'] = pd.DataFrame(city_list)
# df['neighborhood'] = neighborhoods_list


In [95]:
# df['City_encoded'] = df['city'].astype('category').cat.codes
# df['Neighborhood_encoded'] = df['neighborhood'].astype('category').cat.codes
# training_data_with_dummies['City_encoded'] = df['City_encoded'] 
# training_data_with_dummies['Neighborhood_encoded'] = df['Neighborhood_encoded']
# train_set = pd.get_dummies(training_data_with_dummies, columns=['City_encoded', 'Neighborhood_encoded'])
# train_set.head(5)

In [96]:
# train_set.drop('loc_string', axis=1, inplace=True)
# train_set.drop('title', axis=1, inplace=True)
# train_set.drop('loc', axis=1, inplace=True)
# train_set.drop('desc', axis=1, inplace=True)
# train_set.drop('subtype', axis=1, inplace=True)
train_set = training_data_with_dummies
train_set.shape

(998, 16)

In [97]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         998 non-null    float64
 1   title         998 non-null    object 
 2   loc_string    998 non-null    object 
 3   loc           103 non-null    object 
 4   subtype       998 non-null    object 
 5   desc          998 non-null    object 
 6   APARTMENT     998 non-null    uint8  
 7   DUPLEX        998 non-null    uint8  
 8   FLAT          998 non-null    uint8  
 9   GROUND_FLOOR  998 non-null    uint8  
 10  LOFT          998 non-null    uint8  
 11  PENTHOUSE     998 non-null    uint8  
 12  STUDIO        998 non-null    uint8  
 13  square_m      998 non-null    float64
 14  hab           998 non-null    float64
 15  bano          988 non-null    float64
dtypes: float64(4), object(5), uint8(7)
memory usage: 77.1+ KB


In [98]:
df = train_set
train_set.shape

(998, 16)

In [99]:
train_set.columns

Index(['price', 'title', 'loc_string', 'loc', 'subtype', 'desc', 'APARTMENT',
       'DUPLEX', 'FLAT', 'GROUND_FLOOR', 'LOFT', 'PENTHOUSE', 'STUDIO',
       'square_m', 'hab', 'bano'],
      dtype='object')

In [100]:
train_set['bano'].unique()

array([ 1.,  2., nan,  3.,  4.])

In [101]:
columns_with_nan = df.columns[df.isna().any()].tolist()
print(columns_with_nan)

['loc', 'bano']


In [102]:
df = df.drop(columns=['loc'])

In [103]:
nan_count_banos = df['bano'].isna().sum()
print(f'Number of NaN values in "banos": {nan_count_banos}')
df['bano'] = df['bano'].fillna(0)

Number of NaN values in "banos": 10


In [104]:
df.shape

(998, 15)

In [105]:
df = df.dropna()

In [106]:
df.shape

(998, 15)

In [107]:
df['subtype'].isna().sum()

0

In [108]:
df.head(5)

Unnamed: 0,price,title,loc_string,subtype,desc,APARTMENT,DUPLEX,FLAT,GROUND_FLOOR,LOFT,PENTHOUSE,STUDIO,square_m,hab,bano
0,320.0,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,FLAT,Piso en última planta a reformar en calle Tall...,0,0,1,0,0,0,0,85.0,2.0,1.0
1,335.0,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,FLAT,"Ubicado en la zona del Camp de l’Arpa, cerca d...",0,0,1,0,0,0,0,65.0,2.0,1.0
2,330.0,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,FLAT,"En pleno centro de Barcelona, justo al lado de...",0,0,1,0,0,0,0,77.0,2.0,1.0
3,435.0,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,FLAT,"Vivienda espaciosa en Sant Antoni, cerca de Pl...",0,0,1,0,0,0,0,96.0,3.0,2.0
4,410.0,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,FLAT,"En el corazón de Barcelona, en una hermosa fin...",0,0,1,0,0,0,0,84.0,2.0,1.0


In [109]:
df['subtype'].unique()

array(['FLAT', 'STUDIO', 'GROUND_FLOOR', 'PENTHOUSE', 'APARTMENT', 'LOFT',
       'DUPLEX'], dtype=object)

In [110]:
df['loc_string'].unique()

array(['Barcelona - Sant Antoni', 'Barcelona - Dreta de l´Eixample',
       'Barcelona - Sagrada Família', 'Barcelona - Fort Pienc',
       'Barcelona - L´Antiga Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - Dreta de l´Eixample\nVer mapa',
       'Barcelona - Poblenou',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - Poblenou\nVer mapa',
       'Barcelona - El Camp de l´Arpa del Clot',
       'Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Provençals del Poblenou', 'Barcelona - El Clot',
       'Barcelona - Navas', 'Barcelona - Sagrada Família\nVer mapa'],
      dtype=object)

## Continuing encoding subtype and loc_string

In [111]:
df = pd.get_dummies(df, columns=['subtype', 'loc_string'])

In [112]:
df2 = df
df2

Unnamed: 0,price,title,desc,APARTMENT,DUPLEX,FLAT,GROUND_FLOOR,LOFT,PENTHOUSE,STUDIO,...,loc_string_Barcelona - La Nova Esquerra de l´Eixample\nVer mapa,loc_string_Barcelona - La Vila Olímpica del Poblenou,loc_string_Barcelona - L´Antiga Esquerra de l´Eixample,loc_string_Barcelona - Navas,loc_string_Barcelona - Poblenou,loc_string_Barcelona - Poblenou\nVer mapa,loc_string_Barcelona - Provençals del Poblenou,loc_string_Barcelona - Sagrada Família,loc_string_Barcelona - Sagrada Família\nVer mapa,loc_string_Barcelona - Sant Antoni
0,320.0,Piso Tallers. Piso con 2 habitaciones con asce...,Piso en última planta a reformar en calle Tall...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,335.0,Piso C/ de valència. Piso reformado en venta d...,"Ubicado en la zona del Camp de l’Arpa, cerca d...",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,330.0,Piso en Dreta de l´Eixample. Acogedor piso al ...,"En pleno centro de Barcelona, justo al lado de...",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,435.0,"Piso Barcelona - corts catalanes. Soleado, cén...","Vivienda espaciosa en Sant Antoni, cerca de Pl...",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,410.0,"Piso en Carrer de sardenya 271. Alto, reformad...","En el corazón de Barcelona, en una hermosa fin...",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0.0,Piso todo exterior con balcones en sant antoni,Encantador piso alto todo exterior con 2 balco...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,0.0,Piso Avila,Estupenda oportunidad en el Parc i La Llacuna ...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
995,0.0,Piso en Sagrada Família. Piso carrer de mallorca,¡OPORTUNIDAD EN SAGRADA FAMILIA!\n\nPiso situa...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
996,0.0,Apartamento en Poblenou. Apartamento con 3 hab...,"Piso en venta, Barcelona, Poblenou, carrer del...",1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [113]:
df2.drop('title', axis=1, inplace=True)
df2.drop('desc', axis=1, inplace=True)

In [114]:
df2.shape

(998, 37)

In [115]:
df2.head(5)

Unnamed: 0,price,APARTMENT,DUPLEX,FLAT,GROUND_FLOOR,LOFT,PENTHOUSE,STUDIO,square_m,hab,...,loc_string_Barcelona - La Nova Esquerra de l´Eixample\nVer mapa,loc_string_Barcelona - La Vila Olímpica del Poblenou,loc_string_Barcelona - L´Antiga Esquerra de l´Eixample,loc_string_Barcelona - Navas,loc_string_Barcelona - Poblenou,loc_string_Barcelona - Poblenou\nVer mapa,loc_string_Barcelona - Provençals del Poblenou,loc_string_Barcelona - Sagrada Família,loc_string_Barcelona - Sagrada Família\nVer mapa,loc_string_Barcelona - Sant Antoni
0,320.0,0,0,1,0,0,0,0,85.0,2.0,...,0,0,0,0,0,0,0,0,0,1
1,335.0,0,0,1,0,0,0,0,65.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,330.0,0,0,1,0,0,0,0,77.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,435.0,0,0,1,0,0,0,0,96.0,3.0,...,0,0,0,0,0,0,0,0,0,1
4,410.0,0,0,1,0,0,0,0,84.0,2.0,...,0,0,0,0,0,0,0,1,0,0


In [116]:
train_df = df2[df2['price'] != 0.0]
test_df = df2[df2['price'] == 0.0]

In [117]:
train_df.columns

Index(['price', 'APARTMENT', 'DUPLEX', 'FLAT', 'GROUND_FLOOR', 'LOFT',
       'PENTHOUSE', 'STUDIO', 'square_m', 'hab', 'bano', 'subtype_APARTMENT',
       'subtype_DUPLEX', 'subtype_FLAT', 'subtype_GROUND_FLOOR',
       'subtype_LOFT', 'subtype_PENTHOUSE', 'subtype_STUDIO',
       'loc_string_Barcelona - Besòs - Maresme',
       'loc_string_Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'loc_string_Barcelona - Dreta de l´Eixample',
       'loc_string_Barcelona - Dreta de l´Eixample\nVer mapa',
       'loc_string_Barcelona - El Camp de l´Arpa del Clot',
       'loc_string_Barcelona - El Clot',
       'loc_string_Barcelona - El Parc i la Llacuna del Poblenou',
       'loc_string_Barcelona - Fort Pienc',
       'loc_string_Barcelona - La Nova Esquerra de l´Eixample',
       'loc_string_Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'loc_string_Barcelona - La Vila Olímpica del Poblenou',
       'loc_string_Barcelona - L´Antiga Esquerra de l´Eixample',
   

In [118]:
train_df.shape, test_df.shape

((866, 37), (132, 37))

In [119]:
train_df

Unnamed: 0,price,APARTMENT,DUPLEX,FLAT,GROUND_FLOOR,LOFT,PENTHOUSE,STUDIO,square_m,hab,...,loc_string_Barcelona - La Nova Esquerra de l´Eixample\nVer mapa,loc_string_Barcelona - La Vila Olímpica del Poblenou,loc_string_Barcelona - L´Antiga Esquerra de l´Eixample,loc_string_Barcelona - Navas,loc_string_Barcelona - Poblenou,loc_string_Barcelona - Poblenou\nVer mapa,loc_string_Barcelona - Provençals del Poblenou,loc_string_Barcelona - Sagrada Família,loc_string_Barcelona - Sagrada Família\nVer mapa,loc_string_Barcelona - Sant Antoni
0,320.0,0,0,1,0,0,0,0,85.0,2.0,...,0,0,0,0,0,0,0,0,0,1
1,335.0,0,0,1,0,0,0,0,65.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,330.0,0,0,1,0,0,0,0,77.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,435.0,0,0,1,0,0,0,0,96.0,3.0,...,0,0,0,0,0,0,0,0,0,1
4,410.0,0,0,1,0,0,0,0,84.0,2.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,342.0,0,0,1,0,0,0,0,115.0,3.0,...,0,0,0,1,0,0,0,0,0,0
862,315.0,0,0,1,0,0,0,0,82.0,3.0,...,0,0,0,1,0,0,0,0,0,0
863,360.0,0,0,1,0,0,0,0,79.0,4.0,...,0,0,0,1,0,0,0,0,0,0
864,270.0,0,0,1,0,0,0,0,63.0,1.0,...,0,0,0,1,0,0,0,0,0,0


In [120]:
X = train_df.iloc[:, 1:].values 
y = train_df.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [121]:
test_df = test_df.drop('price', axis=1)

In [122]:
test_df

Unnamed: 0,APARTMENT,DUPLEX,FLAT,GROUND_FLOOR,LOFT,PENTHOUSE,STUDIO,square_m,hab,bano,...,loc_string_Barcelona - La Nova Esquerra de l´Eixample\nVer mapa,loc_string_Barcelona - La Vila Olímpica del Poblenou,loc_string_Barcelona - L´Antiga Esquerra de l´Eixample,loc_string_Barcelona - Navas,loc_string_Barcelona - Poblenou,loc_string_Barcelona - Poblenou\nVer mapa,loc_string_Barcelona - Provençals del Poblenou,loc_string_Barcelona - Sagrada Família,loc_string_Barcelona - Sagrada Família\nVer mapa,loc_string_Barcelona - Sant Antoni
866,0,0,1,0,0,0,0,87.0,4.0,1.0,...,0,0,0,0,0,0,0,0,0,0
867,0,0,1,0,0,0,0,78.0,4.0,1.0,...,0,0,0,0,1,0,0,0,0,0
868,0,0,1,0,0,0,0,65.0,1.0,1.0,...,0,0,1,0,0,0,0,0,0,0
869,0,0,1,0,0,0,0,88.0,3.0,1.0,...,0,0,0,0,1,0,0,0,0,0
870,0,0,1,0,0,0,0,82.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0,0,1,0,0,0,0,89.0,3.0,1.0,...,0,0,0,0,0,0,0,0,0,0
994,0,0,1,0,0,0,0,65.0,3.0,1.0,...,0,0,0,0,0,0,0,0,0,0
995,0,0,1,0,0,0,0,75.0,4.0,1.0,...,0,0,0,0,0,0,0,1,0,0
996,1,0,0,0,0,0,0,75.0,3.0,2.0,...,0,0,0,0,1,0,0,0,0,0


In [123]:
X_train

array([[-0.1537956 , -0.1081125 ,  0.26744499, ..., -0.47823281,
         0.        , -0.34065502],
       [-0.1537956 , -0.1081125 ,  0.26744499, ..., -0.47823281,
         0.        , -0.34065502],
       [-0.1537956 , -0.1081125 ,  0.26744499, ..., -0.47823281,
         0.        ,  2.93552107],
       ...,
       [-0.1537956 , -0.1081125 ,  0.26744499, ..., -0.47823281,
         0.        ,  2.93552107],
       [-0.1537956 , -0.1081125 ,  0.26744499, ..., -0.47823281,
         0.        , -0.34065502],
       [-0.1537956 , -0.1081125 ,  0.26744499, ..., -0.47823281,
         0.        , -0.34065502]])

In [124]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=5)

rf_regressor.fit(X_train, y_train)

y_pred = rf_regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')

R-squared: 0.4175
Mean Squared Error: 3167.8763


In [125]:
X_test_new = test_df.values

X_test_new_scaled = scaler.transform(X_test_new)

In [126]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

rf_regressor.fit(X_train, y_train)

scores = cross_val_score(rf_regressor, X_train, y_train, cv=5, scoring='r2')

y_test_new_pred = rf_regressor.predict(X_test_new_scaled)

print("R-squared scores for each fold:", scores)
print("Average R-squared:", np.mean(scores))


R-squared scores for each fold: [0.48517918 0.38064532 0.60397579 0.49901514 0.47277148]
Average R-squared: 0.4883173811910545


In [127]:
y_test_new_pred.shape

(132,)

In [128]:
# Assuming 'df2' is your DataFrame
# Separate features and target
X = df2.drop('price', axis=1).values
y = df2['price'].values

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (LASSO works better when features are on the same scale)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the LASSO regression object with an alpha value (regularization strength)
lasso = Lasso(alpha=0.1)

# Fit the model on the training data
lasso.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = lasso.predict(X_test_scaled)

# Calculate the R-squared and Mean Squared Error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'R-squared: {r2}')
print(f'Mean Squared Error: {mse}')

R-squared: 0.018307136387151313
Mean Squared Error: 18820.654018601417


In [129]:
y_test_new_pred

array([326.39833333, 347.535     , 290.67071429, 382.5684    ,
       317.67233333, 375.28016667, 315.47716667, 238.92      ,
       309.61266667, 296.39804   , 380.6239    , 371.29871429,
       344.7425    , 324.863     , 375.28016667, 253.052     ,
       294.8       , 364.8869339 , 416.69833333, 296.45      ,
       329.4845    , 337.11      , 316.08666667, 429.75208333,
       253.15342857, 267.99583333, 375.28016667, 348.07      ,
       369.76666667, 407.76856667, 381.14433333, 374.09106667,
       368.90165385, 368.        , 423.77279304, 294.47185   ,
       384.72785714, 276.75      , 423.91916667, 341.78666667,
       414.92516667, 368.90165385, 339.287     , 371.6835    ,
       375.28016667, 430.5672    , 337.49486667, 431.45      ,
       383.739     , 429.75208333, 324.119     , 364.8615    ,
       305.064     , 282.6425    , 399.134     , 252.0415    ,
       433.11      , 369.76666667, 344.32166667, 397.99      ,
       278.046     , 362.46163167, 379.5475    , 316.08

In [130]:
class CustomDataset(Dataset):
    """Custom dataset for loading features and targets."""
    def __init__(self, features, targets):
        """
        Initialize the dataset with features and targets.
        
        Args:
            features (DataFrame or np.ndarray or torch.Tensor): The input features.
            targets (DataFrame or np.ndarray or torch.Tensor): The target values.
        """
        if isinstance(features, pd.DataFrame):
            features = features.reset_index(drop=True).values
        if isinstance(targets, pd.DataFrame) or isinstance(targets, pd.Series):
            targets = targets.reset_index(drop=True).values
        
        if isinstance(features, np.ndarray):
            features = torch.from_numpy(features).float()
        if isinstance(targets, np.ndarray):
            targets = torch.from_numpy(targets).float()
        
        self.features = features
        self.targets = targets

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        target = self.targets[idx]
        return feature, target

In [131]:
dataset_train = CustomDataset(features=X_train, targets=y_train)
dataset_test = CustomDataset(features=X_test, targets=y_test)

# Create a DataLoader to handle batching and shuffling
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=True)

In [132]:
class SimpleNeuralNetwork(torch.nn.Module):

    def __init__(self):
        super().__init__() # First call the constructor for the parent class
        self.dense1 = torch.nn.Linear(36,100)
        self.dense8 = torch.nn.Linear(100,1)


        self.relu = torch.nn.ReLU()
    def forward(self, x): 
        x = self.relu(self.dense1(x))
        x = self.dense8(x)

        return x

In [133]:
class SimpleNeuralNetwork3(torch.nn.Module):

    def __init__(self):
        super().__init__() # First call the constructor for the parent class
        self.dense1 = torch.nn.Linear(36,6)
        self.dense8 = torch.nn.Linear(6,1)


        self.relu = torch.nn.ReLU()
    def forward(self, x): 
        x = self.relu(self.dense1(x))
        x = self.dense8(x)

        return x

In [134]:
class SimpleNeuralNetwork2(torch.nn.Module):

    def __init__(self):
        super().__init__() # First call the constructor for the parent class
        self.dense1 = torch.nn.Linear(36,36)
        self.dense8 = torch.nn.Linear(36,1)


        self.relu = torch.nn.ReLU()
    def forward(self, x): 
        x = self.relu(self.dense1(x))
        x = self.dense8(x)

        return x

In [135]:
criterion = nn.MSELoss()

# Example model
model1 = SimpleNeuralNetwork()
model2 = SimpleNeuralNetwork2()
model3 = SimpleNeuralNetwork3()
optimizer = torch.optim.AdamW(model1.parameters(), lr=.002) 
num_epochs = 400

In [136]:
from sklearn.metrics import r2_score
model = model1
N_train = len(dataset_train)
N_val = len(dataset_test)

avg_loss_per_epoch_train = []
avg_loss_per_epoch_val = []

for ep in range(num_epochs):

    model.train() # Put the model in "training mode"
    total_loss = []

    for x_batch, y_batch in dataloader_train:

    # These lines move the current batch of data to the GPU, if we're using the GPU.


        outputs = model(x_batch)
        outputs = outputs.squeeze()  # This removes any singleton dimensions
# Each item in the current batch is fed into the neural network
        avg_loss_for_this_batch = criterion(outputs, y_batch)

        model.zero_grad() # Wipe the slate clean, preparing for a new gradient calculation
        avg_loss_for_this_batch.backward() # Compute a gradient using the current batch
                                        # A more logical name for this method might be "compute_gradient"

    # Now we take one step of stochastic gradient descent or Adam.
    # (or whichever optimization algorithm we're using)
    # This short line of code updates all of the weights in our neural network.
        optimizer.step() 

    # We can see how powerful PyTorch is now.  In the previous two lines of code,
    # PyTorch did a very complicated gradient calculation for us,
    # shielding us from the details.
    # Then PyTorch updated the neural network weights, again shielding us from the details.
        outputs = model(x_batch)
        loss = r2_score( y_batch.detach().numpy(),outputs.detach().numpy())
        total_loss.append(loss)
    avg_training = np.mean(total_loss) 


    # We just finished one epoch of training.
    # Let's check how well our model is performing on the validation dataset.
    model.eval()
    total_loss = []
    for x_batch, y_batch in dataloader_test:


        with torch.no_grad(): # This line tells PyTorch it doesn't need to worry 
                            # about computing any gradients, for the moment
            outputs = model(x_batch)
            outputs = outputs.squeeze()  # This removes any singleton dimensions

            loss = r2_score( y_batch.detach().numpy(), outputs.detach().numpy())
            total_loss.append(loss)
    avg = np.mean(total_loss)
    
    print('epoch is: ', ep, ' Avg. loss (training data): ', avg_training, ' Avg. loss (validation data): ', avg.item())


epoch is:  0  Avg. loss (training data):  -5.148355745932344  Avg. loss (validation data):  -3.877066814190474
epoch is:  1  Avg. loss (training data):  -3.8437539231858495  Avg. loss (validation data):  -2.5463397177209726
epoch is:  2  Avg. loss (training data):  -2.0267432358654087  Avg. loss (validation data):  -1.5537336922955987
epoch is:  3  Avg. loss (training data):  -0.7543017835790298  Avg. loss (validation data):  -0.33355498616177687
epoch is:  4  Avg. loss (training data):  -0.09473628991178064  Avg. loss (validation data):  -0.05040141633798805
epoch is:  5  Avg. loss (training data):  0.05612348873879301  Avg. loss (validation data):  -0.047182697407507135
epoch is:  6  Avg. loss (training data):  0.023951091273459048  Avg. loss (validation data):  -0.0990797475538635
epoch is:  7  Avg. loss (training data):  0.031603650344674945  Avg. loss (validation data):  -0.04130730612456563
epoch is:  8  Avg. loss (training data):  0.05448103889328755  Avg. loss (validation data)

In [137]:
pickle_file_path = 'project-group-6/test_kaggle.pickle'
with open(pickle_file_path, 'rb') as file:
    # Load the data from the file
    test_data = pickle.load(file)
test_data = pd.DataFrame(test_data)
X_test = test_df.values  # Convert the relevant feature columns to a NumPy array
X_test_tensor = torch.tensor(X_test, dtype=torch.float) 
pred = model(X_test_tensor)
pred1 = pred.detach().numpy()

In [138]:
from sklearn.metrics import r2_score
model = model2
N_train = len(dataset_train)
N_val = len(dataset_test)

avg_loss_per_epoch_train = []
avg_loss_per_epoch_val = []

for ep in range(num_epochs):

    model.train() # Put the model in "training mode"
    total_loss = []

    for x_batch, y_batch in dataloader_train:

    # These lines move the current batch of data to the GPU, if we're using the GPU.


        outputs = model(x_batch)
        outputs = outputs.squeeze()  # This removes any singleton dimensions
# Each item in the current batch is fed into the neural network
        avg_loss_for_this_batch = criterion(outputs, y_batch)

        model.zero_grad() # Wipe the slate clean, preparing for a new gradient calculation
        avg_loss_for_this_batch.backward() # Compute a gradient using the current batch
                                        # A more logical name for this method might be "compute_gradient"

    # Now we take one step of stochastic gradient descent or Adam.
    # (or whichever optimization algorithm we're using)
    # This short line of code updates all of the weights in our neural network.
        optimizer.step() 

    # We can see how powerful PyTorch is now.  In the previous two lines of code,
    # PyTorch did a very complicated gradient calculation for us,
    # shielding us from the details.
    # Then PyTorch updated the neural network weights, again shielding us from the details.
        outputs = model(x_batch)
        loss = r2_score( y_batch.detach().numpy(),outputs.detach().numpy())
        total_loss.append(loss)
    avg_training = np.mean(total_loss) 


    # We just finished one epoch of training.
    # Let's check how well our model is performing on the validation dataset.
    model.eval()
    total_loss = []
    for x_batch, y_batch in dataloader_test:


        with torch.no_grad(): # This line tells PyTorch it doesn't need to worry 
                            # about computing any gradients, for the moment
            outputs = model(x_batch)
            outputs = outputs.squeeze()  # This removes any singleton dimensions

            loss = r2_score( y_batch.detach().numpy(), outputs.detach().numpy())
            total_loss.append(loss)
    avg = np.mean(total_loss)
    
    print('epoch is: ', ep, ' Avg. loss (training data): ', avg_training, ' Avg. loss (validation data): ', avg.item())


epoch is:  0  Avg. loss (training data):  -5.589963077268203  Avg. loss (validation data):  -7.8368581909575425
epoch is:  1  Avg. loss (training data):  -5.566495903756947  Avg. loss (validation data):  -9.666287221668316
epoch is:  2  Avg. loss (training data):  -5.6792456717317075  Avg. loss (validation data):  -4.870285552013639
epoch is:  3  Avg. loss (training data):  -5.636748088362592  Avg. loss (validation data):  -5.599286931789622
epoch is:  4  Avg. loss (training data):  -5.285063565488253  Avg. loss (validation data):  -4.824305523176963
epoch is:  5  Avg. loss (training data):  -5.6556723339072095  Avg. loss (validation data):  -6.911743312860092
epoch is:  6  Avg. loss (training data):  -5.494496242575103  Avg. loss (validation data):  -4.871034360616192
epoch is:  7  Avg. loss (training data):  -5.926229439788617  Avg. loss (validation data):  -8.030767845935399
epoch is:  8  Avg. loss (training data):  -5.762673551392588  Avg. loss (validation data):  -8.76266980968571

In [139]:
pickle_file_path = 'project-group-6/test_kaggle.pickle'
with open(pickle_file_path, 'rb') as file:
    # Load the data from the file
    test_data = pickle.load(file)
test_data = pd.DataFrame(test_data)
X_test = test_df.values  # Convert the relevant feature columns to a NumPy array
X_test_tensor = torch.tensor(X_test, dtype=torch.float) 
pred = model(X_test_tensor)
pred2 = pred.detach().numpy()

In [140]:
from sklearn.metrics import r2_score
model = model3
N_train = len(dataset_train)
N_val = len(dataset_test)

avg_loss_per_epoch_train = []
avg_loss_per_epoch_val = []

for ep in range(num_epochs):

    model.train() # Put the model in "training mode"
    total_loss = []

    for x_batch, y_batch in dataloader_train:

    # These lines move the current batch of data to the GPU, if we're using the GPU.


        outputs = model(x_batch)
        outputs = outputs.squeeze()  # This removes any singleton dimensions
# Each item in the current batch is fed into the neural network
        avg_loss_for_this_batch = criterion(outputs, y_batch)

        model.zero_grad() # Wipe the slate clean, preparing for a new gradient calculation
        avg_loss_for_this_batch.backward() # Compute a gradient using the current batch
                                        # A more logical name for this method might be "compute_gradient"

    # Now we take one step of stochastic gradient descent or Adam.
    # (or whichever optimization algorithm we're using)
    # This short line of code updates all of the weights in our neural network.
        optimizer.step() 

    # We can see how powerful PyTorch is now.  In the previous two lines of code,
    # PyTorch did a very complicated gradient calculation for us,
    # shielding us from the details.
    # Then PyTorch updated the neural network weights, again shielding us from the details.
        outputs = model(x_batch)
        loss = r2_score( y_batch.detach().numpy(),outputs.detach().numpy())
        total_loss.append(loss)
    avg_training = np.mean(total_loss) 


    # We just finished one epoch of training.
    # Let's check how well our model is performing on the validation dataset.
    model.eval()
    total_loss = []
    for x_batch, y_batch in dataloader_test:


        with torch.no_grad(): # This line tells PyTorch it doesn't need to worry 
                            # about computing any gradients, for the moment
            outputs = model(x_batch)
            outputs = outputs.squeeze()  # This removes any singleton dimensions

            loss = r2_score( y_batch.detach().numpy(), outputs.detach().numpy())
            total_loss.append(loss)
    avg = np.mean(total_loss)
    
    print('epoch is: ', ep, ' Avg. loss (training data): ', avg_training, ' Avg. loss (validation data): ', avg.item())


epoch is:  0  Avg. loss (training data):  -5.87597062573529  Avg. loss (validation data):  -5.1480358729370606
epoch is:  1  Avg. loss (training data):  -7.104398193790626  Avg. loss (validation data):  -6.539245351049806
epoch is:  2  Avg. loss (training data):  -6.061277516511614  Avg. loss (validation data):  -5.142361612075407
epoch is:  3  Avg. loss (training data):  -5.283286077415467  Avg. loss (validation data):  -5.119171813029573
epoch is:  4  Avg. loss (training data):  -6.0080451686956495  Avg. loss (validation data):  -4.574335950509266
epoch is:  5  Avg. loss (training data):  -5.998786380114113  Avg. loss (validation data):  -8.392686706517066
epoch is:  6  Avg. loss (training data):  -6.0970100888230965  Avg. loss (validation data):  -5.465769848880639
epoch is:  7  Avg. loss (training data):  -5.811396625947307  Avg. loss (validation data):  -7.394546634881863
epoch is:  8  Avg. loss (training data):  -5.915698165520176  Avg. loss (validation data):  -7.959017077130116

In [145]:
pickle_file_path = 'project-group-6/test_kaggle.pickle'
with open(pickle_file_path, 'rb') as file:
    # Load the data from the file
    test_data = pickle.load(file)
test_data = pd.DataFrame(test_data)
X_test = test_df.values  # Convert the relevant feature columns to a NumPy array
X_test_tensor = torch.tensor(X_test, dtype=torch.float) 
pred = model(X_test_tensor)
pred3 = pred.detach().numpy()

In [151]:
pred3.shape

(132, 1)

In [152]:
df_predictions = pd.DataFrame({'Column1': pred1.flatten(), 'Column2': pred2.flatten(), 'Column3': pred3.flatten()})


In [153]:
df_predictions['avg_pred'] = df_predictions.mean(axis=1)
predic = df_predictions['avg_pred']

In [154]:
test_data['price'] = predic
predictions = test_data[['id','price']]
predictions.to_csv('prediction2.csv',index=False)