In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings


%matplotlib inline
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [2]:
def drop_nan_columns(dataframe: pd.DataFrame,
                     columns: list[str],
                     name: str) -> pd.DataFrame:
    """
    Drop columns with multiple NAN values
    
    :dataframe: 
        – original pd.DataFrame(s)
    :columns: 
        – list of columns to cl
    :return: 
        – return modified pd.DataFrame
    
    """

    for column in columns:
        dataframe.drop(column, axis=1, inplace=True)
    dataframe.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/dropped_vals/dropped_values_{name}.csv')

In [3]:
# Drop useless / columns with multiple NAN values
drop_nan_columns(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/raw/real_estate_train.csv'), 
                 columns=['Unnamed: 0', 'bedrooms_count', 'balcony', 'year_built'], 
                 name='train')

drop_nan_columns(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/raw/real_estate_test.csv'), 
                 columns=['Unnamed: 0', 'bedrooms_count', 'balcony', 'year_built'], 
                 name='test')



In [4]:
df = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/dropped_vals/dropped_values_train.csv')
df.head(20)
print(df.isna().sum())

Unnamed: 0                 0
floor_num                  0
total_area                 0
kitchen_area            1782
rooms_count              832
living_area             1908
is_apartments           1201
total_floor                0
district                   0
hood                       0
metro_name                 0
metro_transport_type       0
metro_time                 0
price                      0
dtype: int64


In [5]:
def filling_by_0(dataframe: pd.DataFrame,
                 columns: list[str],
                 name: str) -> None:
    """
    Filling columns by 0
    
    :dataframes: 
        – original pd.DataFrame
        
    :columns: 
        – list of columns to fill
        
    :return: 
        – return modified pd.DataFrame
    
    """
    for column in columns:
        dataframe[column] = dataframe[column].fillna(0)
    dataframe.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/filled_by_0/filled_by_0_{name}.csv')

In [6]:
# Studio flats have no separate rooms. So, filling column by 0. 
# It is not bad idea to fill in the missing values 0 for the baseline
filling_by_0(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/dropped_vals/dropped_values_train.csv'), 
             columns=['rooms_count', 'kitchen_area','living_area'], 
             name='train')

filling_by_0(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/dropped_vals/dropped_values_test.csv'), 
             columns=['rooms_count', 'kitchen_area','living_area'], 
             name='test')

In [7]:
df = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/filled_by_0/filled_by_0_train.csv')
df.isna().sum()

Unnamed: 0.1               0
Unnamed: 0                 0
floor_num                  0
total_area                 0
kitchen_area               0
rooms_count                0
living_area                0
is_apartments           1201
total_floor                0
district                   0
hood                       0
metro_name                 0
metro_transport_type       0
metro_time                 0
price                      0
dtype: int64

In [8]:
def to_binary(dataframe: pd.DataFrame, 
              columns: list[str],
              name: str) -> pd.DataFrame:
    """
    Fill the missing values with the most frequent value and transform them to binary format
    
    :dataframes: 
        – original pd.DataFrame(s)
        
    :columns: 
        – list of columns to fill
        
    :return:
        – return modified pd.DataFrame
    """
    binaries = {'t': 1, 'f': 0}

    for column in columns:
        dataframe[column] = dataframe[column].fillna(dataframe[column].mode()[0])
        dataframe[column] = dataframe[column].replace(binaries).astype(int)
    dataframe.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/binary_form/binary_format_{name}.csv')

In [9]:
to_binary(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/filled_by_0/filled_by_0_train.csv'), 
          columns=['is_apartments'],
          name='train')

to_binary(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/filled_by_0/filled_by_0_test.csv'), 
          columns=['is_apartments'],
          name='test')

In [10]:
df = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/binary_form/binary_format_train.csv')
df.isna().sum()

Unnamed: 0.2            0
Unnamed: 0.1            0
Unnamed: 0              0
floor_num               0
total_area              0
kitchen_area            0
rooms_count             0
living_area             0
is_apartments           0
total_floor             0
district                0
hood                    0
metro_name              0
metro_transport_type    0
metro_time              0
price                   0
dtype: int64

In [11]:
def split_district(dataframe: pd.DataFrame, 
                   name: str,
                   pattern: str,
                   column='district') -> None:
    """
    Split district column
    """
    dataframe[['district', 'district_naming']] = df[column].str.split(pattern, expand=True)
    dataframe.drop('district_naming', axis=1, inplace=True)
    dataframe.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/splitted_district/splitted_district_{name}.csv')

In [12]:
split_district(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/binary_form/binary_format_train.csv'), 
    name='train',
    pattern=' ',
)

split_district(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/binary_form/binary_format_train.csv'), 
    name='test',
    pattern=' ',
)

In [13]:
df = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/splitted_district/splitted_district_train.csv')
df['district'].value_counts()

district
ЦАО      1565
ЗАО      1041
НАО       706
САО       530
ЮАО       498
СЗАО      491
СВАО      353
ЮВАО      334
ЮЗАО      332
ВАО       215
ЗелАО      32
ТАО        29
Name: count, dtype: int64

In [14]:
def clean_outliers(dataframe: pd.DataFrame,
                   name: str) -> None:
    """
    Delete rows in which the value of any of the features exceeds and writes it to .csv file
    
    :dataframes:
        - original pd.DataFrame

    :return:
        - modified pd.DataFrame
    """
    dataframe = dataframe[dataframe['total_area'] < dataframe['total_area'].quantile(.8)]
    dataframe = dataframe[dataframe['total_area'] > 15]
    dataframe.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/cleaned_outliers/cleaned_outliers_{name}.csv')

In [15]:
clean_outliers(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/splitted_district/splitted_district_train.csv'),
    'train')

clean_outliers(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/splitted_district/splitted_district_test.csv'),
    'test')

In [16]:
df = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/cleaned_outliers/cleaned_outliers_train.csv')
print(df.shape)

(4886, 18)


In [17]:
def encoding(df: pd.DataFrame,
             name: str) -> pd.DataFrame:
    """
    Function perfroms one-hot-encoding for categorical features and writes it to .csv file

    :input:
        – original pd.DataFrame

    :name:
        - naming of output .csv file

    :return:
        - modified pd.DataFrame 
    """
    
    binaries = {True: 1, False: 0}
    df = pd.get_dummies(data=df, drop_first=True)
    df.replace(binaries, inplace=True)
    df.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/encoding/encoding_{name}.csv')

In [18]:
encoding(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/cleaned_outliers/cleaned_outliers_train.csv'),
    'train'
)

encoding(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/cleaned_outliers/cleaned_outliers_test.csv'),
    'test'
)

In [19]:
def predictors_n_target(dataframe: pd.DataFrame,
                        name: str) -> (pd.DataFrame | pd.Series):
    """
    Splitting data into predictors and target variables
    """
    X = dataframe.drop('price', axis=1)
    y = dataframe['price']
    X.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/{name}_X.csv')
    y.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/Y/{name}_y.csv')

In [20]:
predictors_n_target(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/encoding/encoding_train.csv'),
    'train'
)

predictors_n_target(
    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/encoding/encoding_test.csv'),
    'test'
)

In [21]:
def transform(dataframe: pd.DataFrame,
              name: str) -> pd.DataFrame:
    """
    Function performs Yeo-Jonhson transformation
    """
    pt = PowerTransformer()
    X_scaled = pt.fit_transform(dataframe)
    X_scaled = pd.DataFrame(X_scaled, columns=[i for i in dataframe.columns if i != 'price'])
    X_scaled.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/{name}_X_scaled.csv')

In [22]:
transform(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/train_X.csv'), 'train')
transform(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/test_X.csv'), 'test')


In [23]:
def validation_sampling(X: pd.DataFrame, 
                        y: pd.Series,
                        test_size: float,
                        seed: int) -> pd.DataFrame:
    """
    Splitting data into holdout sample
    
    :X: 
        - predictor variables presented as pd.DataFrame
        
    :y: 
        - target variable as pd.Series
        
    :test_size: 
        - size of sample
        
    :seed: 
        - randomness variable
    """
    X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=test_size, random_state=seed)
    X_train.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/X_train.csv')
    X_test.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/X_test.csv')
    y_train.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/Y/y_train.csv')
    y_test.to_csv(f'/Users/klimbelousov/Documents/Projects/real_estate/data/interim/Y/y_test.csv')
    

In [26]:
validation_sampling(pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/train_X_scaled.csv'), 
                    pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/Y/train_y.csv'), 
                    test_size=0.2, 
                    seed=52)

In [4]:
def data_to_processed() -> None:
    X_train = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/X_train.csv')
    X_test = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/X/X_test.csv')
    y_train = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/Y/y_train.csv')
    y_test = pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/interim/Y/y_test.csv')
    dataframes = [X_train, X_test, y_train, y_test]
    for df in dataframes:
        for column in df.columns:
            if 'Unnamed' in column:
                df.drop(column, axis=1, inplace=True)
    X_train.to_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/processed/X_train.csv', index=False)
    X_test.to_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/processed/X_test.csv', index=False)
    y_train.to_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/processed/y_train.csv', index=False)
    y_test.to_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/processed/y_test.csv', index=False)



In [6]:
data_to_processed()

In [11]:
rf = RandomForestRegressor(n_estimators=350, min_samples_leaf=3, min_samples_split=3, n_jobs=-1)

rf.fit(
    X=pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/processed/X_train.csv'),
    y=pd.read_csv('/Users/klimbelousov/Documents/Projects/real_estate/data/processed/y_train.csv')
)

In [87]:
linear_lasso(X_train, y_train, X_valid, alpha=0.05, seed=13)

(array([1.48624889e+07, 1.24184410e+06, 4.42156814e+08, ...,
        4.29782956e+06, 2.25502496e+08, 5.69681318e+07]),
 0.7577030298373951)

In [None]:
linear_lasso()