In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
import glob
import zipfile

In [41]:
# Unpack every zip archive found in the working directory into apartments_pl/
filenames = glob.glob('*zip')
print(filenames)
for archive_path in filenames:
    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(archive_path, mode="r") as archive:
        archive.extractall(path="apartments_pl")

['apartments_pl_2024_02.csv.zip', 'apartments_pl_2023_09.csv.zip', 'apartments_pl_2023_10.csv.zip', 'apartments_pl_2023_11.csv.zip', 'apartments_pl_2023_08.csv.zip', 'apartments_pl_2024_01.csv.zip', 'apartments_pl_2023_12.csv.zip']


In [45]:
#load files
def snapshot_date(path):
    """Return the first day of the month encoded in a snapshot file name.

    The name is expected to end in ``_<YYYY>_<MM>.csv`` (for example
    ``apartments_pl/apartments_pl_2024_02.csv``). Splitting from the right
    keeps the result independent of underscores in the directory part.

    Raises ValueError if the name has fewer than two underscores or the
    year/month text is not a valid date.
    """
    year, month_part = path.rsplit('_', 2)[1:]
    month = month_part.split('.')[0]
    return pd.to_datetime(year + month + '01', format='%Y%m%d')


# sorted() makes the row order reproducible; glob returns files in arbitrary order
filenames = sorted(glob.glob('apartments_pl/*csv'))
frames = []
for file in filenames:
    df_temp = pd.read_csv(file)
    #make date value (a pandas Timestamp, so the column dtype is datetime64)
    date = snapshot_date(file)
    #add date column with the same snapshot date for each row
    df_temp["date"] = date
    frames.append(df_temp)

# Concatenate once instead of growing the frame inside the loop, and renumber
# the rows so index labels are unique across the monthly files.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.shape[0]

117259

In [50]:
#drop observations
# dropna() with default arguments removes every row that has a missing value
# in any column.
df = df.dropna()

#convert to numerical data
yes_no = {'no': 0, 'yes': 1}
encodings = {
    'condition': {'low': 0, 'premium': 1},
    'type': {'apartmentBuilding': 1, 'blockOfFlats': 2, 'tenement': 3},
    'hasParkingSpace': yes_no,
    'hasBalcony': yes_no,
    'hasElevator': yes_no,
    'hasSecurity': yes_no,
    'hasStorageRoom': yes_no,
}
for col, mapping in encodings.items():
    encoded = df[col].map(mapping)
    # Missing values were dropped above, so a NaN here can only come from a
    # label that is absent from the mapping. Fail with a readable message
    # instead of letting astype() raise an opaque int-cast error.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, col].unique().tolist()
        raise ValueError(
            f"column {col!r} contains values not covered by its mapping: {unexpected}"
        )
    df[col] = encoded.astype(np.int8)

#drop features
toDrop = ['id', 'ownership'] #buildMaterial, city
df = df.drop(columns=toDrop)


In [None]:
#drop low variance features
# Columns the analysis still needs (longitude, latitude, buildYear); they are
# kept regardless of the outcome of the variance test.
protected = ['latitude', 'longitude', 'buildYear']

# Variance is only defined for numeric data, so string and datetime columns
# are left out of the selector and are never dropped by this cell.
numeric = df.select_dtypes(include='number')

sel = VarianceThreshold(threshold=.05)
# Dividing by the column mean puts columns with different units on a
# comparable scale before the threshold is applied.
sel.fit(numeric / numeric.mean())
mask = sel.get_support()  # boolean array aligned with numeric.columns

low_variance = [col for col in numeric.columns[~mask] if col not in protected]
df = df.drop(columns=low_variance)

In [46]:
# Column dtypes and non-null counts (printed by info()), followed by summary
# statistics (describe() is the last expression, so it is the cell's output).
# NOTE(review): the saved output lists 29 columns including `id`, i.e. this
# cell ran (In [46]) before the cleaning cell (In [50]); after Restart & Run
# All it will describe the cleaned frame instead.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 117259 entries, 0 to 16360
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   id                    117259 non-null  object        
 1   city                  117259 non-null  object        
 2   type                  91264 non-null   object        
 3   squareMeters          117259 non-null  float64       
 4   rooms                 117259 non-null  float64       
 5   floor                 96041 non-null   float64       
 6   floorCount            115690 non-null  float64       
 7   buildYear             97426 non-null   float64       
 8   latitude              117259 non-null  float64       
 9   longitude             117259 non-null  float64       
 10  centreDistance        117259 non-null  float64       
 11  poiCount              117259 non-null  float64       
 12  schoolDistance        117133 non-null  float64       
 13  clini

Unnamed: 0,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,poiCount,schoolDistance,clinicDistance,postOfficeDistance,kindergartenDistance,restaurantDistance,collegeDistance,pharmacyDistance,price,date
count,117259.0,117259.0,96041.0,115690.0,97426.0,117259.0,117259.0,117259.0,117259.0,117133.0,116763.0,117088.0,117121.0,116936.0,113878.0,117070.0,117259.0,117259
mean,59.367188,2.701848,3.304412,5.235699,1985.299509,52.045362,19.470735,4.318678,20.690037,0.415219,0.973939,0.520115,0.373511,0.351381,1.441477,0.363755,756938.7,2023-10-29 00:58:51.628275968
min,25.0,1.0,1.0,1.0,1850.0,49.978999,14.447127,0.02,0.0,0.002,0.001,0.001,0.001,0.001,0.006,0.001,150000.0,2023-08-01 00:00:00
25%,44.65,2.0,2.0,3.0,1965.0,51.111693,18.518341,1.96,7.0,0.175,0.355,0.239,0.157,0.115,0.58,0.143,499000.0,2023-09-01 00:00:00
50%,55.4,3.0,3.0,4.0,1994.0,52.19592,19.895674,3.92,13.0,0.29,0.675,0.393,0.265,0.231,1.119,0.241,679000.0,2023-11-01 00:00:00
75%,69.7,3.0,4.0,6.0,2015.0,52.441367,20.99135,6.12,24.0,0.469,1.237,0.625,0.419,0.412,2.054,0.408,899000.0,2024-01-01 00:00:00
max,150.0,6.0,29.0,29.0,2024.0,54.60646,23.207128,16.94,212.0,4.946,4.998,4.97,4.961,4.985,5.0,4.992,3250000.0,2024-02-01 00:00:00
std,21.661587,0.923175,2.503613,3.266448,34.244332,1.345636,1.788524,2.860922,24.401407,0.475026,0.897756,0.509243,0.458528,0.47809,1.103144,0.472445,390449.1,


In [47]:
# Write the current frame to full_dataset.csv without the index column.
# NOTE(review): execution counts show this ran (In [47]) before the cleaning
# cell (In [50]), so the file saved in that session held the uncleaned data;
# a top-to-bottom run writes the cleaned frame instead. Confirm which is intended.
df.to_csv('full_dataset.csv', index=False)

In [51]:
# List the columns that remain in df at this point of the session.
df.columns

Index(['city', 'type', 'squareMeters', 'rooms', 'floor', 'floorCount',
       'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount',
       'schoolDistance', 'clinicDistance', 'postOfficeDistance',
       'kindergartenDistance', 'restaurantDistance', 'collegeDistance',
       'pharmacyDistance', 'buildingMaterial', 'condition', 'hasParkingSpace',
       'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom', 'price',
       'date'],
      dtype='object')