# 0. Imports & methods

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

from lightgbm import LGBMRegressor

import seaborn as sns

from sklearn.metrics import mean_squared_error as mse, r2_score as r2

#from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor, Pool

#import shap

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to cheaper dtypes and return it.

    * object / string columns become ``category``;
    * signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness that holds their min and max (bounds are
      inclusive, so a column whose maximum is exactly 127 still fits int8);
    * float columns wider than 32 bits are cast to float32 when their range
      fits, otherwise to float64;
    * every other dtype (bool, datetime, category, nullable extension
      dtypes, ...) is left untouched, because min/max comparisons against
      numeric limits are meaningless or would raise for them.

    Memory usage before and after is printed. The frame is modified in place
    and the same object is returned.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns can be downcast safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN here; every comparison is then
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            # float16 is deliberately not used as a target; columns that are
            # already 32 bits or narrower are never widened.
            limits = np.finfo(np.float32)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
def plot_feature_importances(importances, X):
    """Show a bar chart of feature importances, most important first.

    ``importances`` is indexed with a NumPy index array and must expose
    ``.sum()``; each bar is the importance divided by the total.
    ``X`` supplies the number of features (``X.shape[1]``) and the tick
    labels (``X.columns``).
    """
    order = np.argsort(importances)[::-1]
    n_features = X.shape[1]
    positions = range(n_features)

    plt.figure(figsize=(20, 6))
    plt.title("Feature importances", fontsize=16)

    shares = importances[order] / importances.sum()
    plt.bar(positions, shares, color="darkblue", align="center")
    plt.xticks(positions, X.columns[order], rotation=90, fontsize=14)
    plt.xlim([-1, n_features])

    plt.tight_layout()
    # plt.savefig('fe.jpg')
    plt.show()

In [None]:
def show_plt(xdata, ydata, xlab, ylab, fntsize, fgsize, rot, ttl):
    """Draw and show a seaborn bar plot of ``ydata`` against ``xdata``.

    ``xlab`` / ``ylab`` are the axis labels (font size ``fntsize``),
    ``fgsize`` is the figure size, ``rot`` the x tick rotation and
    ``ttl`` the plot title.
    """
    plt.figure(figsize=fgsize)
    sns.barplot(x=xdata, y=ydata)

    label_style = {'fontsize': fntsize}
    plt.xlabel(xlab, **label_style)
    plt.ylabel(ylab, **label_style)
    plt.title(ttl)
    plt.xticks(rotation=rot)

    plt.show()

In [None]:
# Source: https://www.kaggle.com/vikassingh1996/don-t-underestimate-the-power-of-a-logistic-reg

def description(df):
    """Print the shape of ``df`` and return a per-column summary frame.

    The result has one row per column of ``df`` with the columns ``Name``,
    ``dtypes``, ``Missing`` (NaN count), ``PercMissing`` (NaN count divided
    by the number of rows), ``Uniques`` and the values found in the first
    three rows (``First Value``, ``Second Value``, ``Third Value``).

    Frames with fewer than three rows are supported: the value columns for
    rows that do not exist are filled with NaN, and ``PercMissing`` is NaN
    for an empty frame instead of a 0/0 division.
    """
    print(f"Dataset Shape: {df.shape}")

    n_rows = len(df)
    missing = df.isnull().sum().values

    summary = pd.DataFrame({'Name': list(df.columns), 'dtypes': df.dtypes.values})
    summary['Missing'] = missing
    summary['PercMissing'] = missing / n_rows if n_rows else np.nan
    summary['Uniques'] = df.nunique().values

    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        # Only read rows that actually exist.
        summary[label] = df.iloc[position].values if position < n_rows else np.nan

    return summary
print('**Variable Description of  train Data:**')

In [None]:
# Source: https://www.kaggle.com/vikassingh1996/don-t-underestimate-the-power-of-a-logistic-reg

def replace_nan(data):
    """Fill the NaNs of every column of ``data`` with that column's mode.

    The frame is modified in place and nothing is returned. When a column
    has several modes the first one returned by ``Series.mode`` is used.
    A column that consists only of NaN has no mode and is left unchanged
    (previously this raised on the empty mode lookup).
    """
    for column in data.columns:
        if not data[column].isna().any():
            continue

        modes = data[column].mode()
        if modes.empty:
            # All values are missing: there is nothing to fill with.
            continue

        data[column] = data[column].fillna(modes.iloc[0])


# 1. Read data

In [None]:
# Load the training set from the hard-coded Kaggle input path.
df = pd.read_csv('/kaggle/input/realestatepriceprediction/train.csv')

In [None]:
# Downcast numeric columns and convert object columns to category.
df = reduce_mem_usage(df)

In [None]:
df.dtypes

In [None]:
# Split the training frame into features and target.
X = df.drop('Price', axis=1)
y = df['Price']

# NOTE: the test set is not passed through reduce_mem_usage, so its dtypes
# can differ from those of X.
X_test = pd.read_csv('/kaggle/input/realestatepriceprediction/test.csv')

# Keep the test Ids for the submission file before Id becomes the index.
preds_test = pd.DataFrame()
preds_test['Id'] = X_test['Id'].copy()

X.set_index('Id', inplace=True)
X_test.set_index('Id', inplace=True)

In [None]:
X.head(10)

In [None]:
description(X)

In [None]:
# Fill every NaN with the mode of its column, separately for train and test.
# NOTE(review): this runs before the Healthcare_1 mean fill and before
# FeatureImputer.transform, so those later steps find no NaNs left to
# handle (e.g. the LifeSquare_nan flag) - confirm this is intended.
replace_nan(X)
replace_nan(X_test)

In [None]:
X.info()

In [None]:
X.describe()

In [None]:
# Distribution of the target.
# NOTE(review): distplot is deprecated in recent seaborn releases - check
# the installed version before re-running.
sns.distplot(y, color='r')

In [None]:
# Distribution of Healthcare_1.
sns.distplot(X['Healthcare_1'], color='r')

In [None]:
X['Healthcare_1'].fillna(X['Healthcare_1'].mean(), inplace=True)

In [None]:
X_test['Healthcare_1'].fillna(X_test['Healthcare_1'].mean(), inplace=True)

In [None]:
X['DistrictId'] = X['DistrictId'].astype(str)
X_test['DistrictId'] = X_test['DistrictId'].astype(str)

## 1.1. EDA

In [None]:
# Flag rows whose room count is 0 or at least 6, then replace 0 rooms by 1.
# NOTE(review): this changes X itself before FeatureImputer.fit/transform
# run, so the imputer no longer sees the zero-room rows of the training
# set (the test set is not modified here) - confirm this is intended.
X['Rooms_outlier'] = 0  # add the flag column
X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
X.head()        
X.loc[X['Rooms'] == 0, 'Rooms'] = 1
#X.loc[X['Rooms'] >= 6, 'Rooms'] = X['Rooms'].median # !!! the median is computed on train !!!
#print(X['Rooms'].median)

In [None]:
r_qt = (X['Rooms'].value_counts()).reset_index()
r_qt.columns=['Rooms', 'qty']
show_plt(r_qt['Rooms'], r_qt['qty'], 'square', 'qty', 12, (16,4), 90, 'Rooms qty')

In [None]:
k_sq = (X['KitchenSquare'].value_counts()).reset_index()
k_sq.columns=['square', 'qty']
show_plt(k_sq['square'], k_sq['qty'], 'square', 'qty', 12, (16,4), 90, 'kitchen square qty')

In [None]:
hfl = (X['HouseFloor'].value_counts().sort_values()).reset_index()
hfl.columns=['housefloor', 'qty']
show_plt(hfl['housefloor'], hfl['qty'], 'square', 'qty', 12, (16,4), 90, 'house floor qty')

In [None]:
X[X['HouseFloor'] < X['Floor']]

In [None]:
d_id = (X['DistrictId'].value_counts()).reset_index()
d_id.columns=['DistrictId', 'qty']
d_id['DistrictId'] = d_id['DistrictId'].apply(np.int)
show_plt(d_id['DistrictId'], d_id['qty'], 'square', 'qty', 12, (16,4), 90, 'DistrictId')

#d_id.info()
#d_id.sort_values('DistrictId').head(50)

In [None]:
h_y = (X['HouseYear'].value_counts()).reset_index()
h_y.columns=['year', 'qty']
show_plt(h_y['year'], h_y['qty'], 'square', 'qty', 12, (16,4), 90, 'HouseYear')

In [None]:
X['Floor'].sort_values().unique()

In [None]:
X[X['HouseYear'] > 2020].head()

In [None]:
X[X['Square'] > 300]

In [None]:
X[X['Square'] < X['LifeSquare']]

In [None]:
X[X['Square'] > X['Rooms'] * 100]

# 2. Data cleaning

In [None]:
class FeatureImputer:
    """Fill missing values and handle outliers.

    ``fit`` stores the medians of the numeric training columns; ``transform``
    uses them to repair Rooms and HouseFloor and applies fixed rules to
    KitchenSquare, Floor, HouseYear, LifeSquare and Square. Outlier flag
    columns (``Rooms_outlier``, ``HouseFloor_outlier``, ``HouseYear_outlier``,
    ``LifeSquare_nan``) are added along the way.
    """

    def __init__(self):
        # Series of per-column medians, set by fit().
        self.medians = None

    def fit(self, X):
        """Remember the medians of the numeric columns of ``X``.

        ``numeric_only=True`` keeps string / categorical columns (for
        example DistrictId after its cast to str) from breaking the median.
        Returns ``self`` so calls can be chained.
        """
        self.medians = X.median(numeric_only=True)
        return self

    def transform(self, X):
        """Clean ``X`` in place and return it.

        Requires a previous ``fit``. The frame passed in is modified.
        """
        # The builtin int replaces np.int, which was only an alias of it and
        # has been removed from NumPy.
        for column in ('HouseFloor', 'Rooms', 'Floor'):
            X[column] = X[column].astype(int)

        # Rooms: flag 0 and >= 6, then repair them.
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1

        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        # !!! the median was computed on the training set !!!
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']

        # KitchenSquare: floor at 3, scale down values above 1000.
        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3
        huge_kitchen = X['KitchenSquare'] > 1000
        X.loc[huge_kitchen, 'KitchenSquare'] = X.loc[huge_kitchen, 'KitchenSquare'] / 10  # the median could be used instead

        # HouseFloor, Floor: flag first, using the raw values.
        X['HouseFloor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']
        # Only still matches when the training median itself is 0.
        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = 1
        above_top = X['Floor'] > X['HouseFloor']
        X.loc[above_top, 'Floor'] = X.loc[above_top, 'HouseFloor']

        # HouseYear: years in the future are clipped to the current year.
        # NOTE: the result therefore depends on the date the code is run.
        current_year = datetime.datetime.now().year

        X['HouseYear_outlier'] = 0
        in_future = X['HouseYear'] > current_year
        X.loc[in_future, 'HouseYear_outlier'] = 1
        X.loc[in_future, 'HouseYear'] = current_year

        # Healthcare_1
     #   if 'Healthcare_1' in X.columns:
     #       X.drop('Healthcare_1', axis=1, inplace=True)

        # LifeSquare: flag missing values, then derive them from the other areas.
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1

        condition = (X['LifeSquare'].isna()) &\
                      (~X['Square'].isna()) & \
                      (~X['KitchenSquare'].isna())
        X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3

        # Living area cannot exceed the total area.
        condition_2 = (X['Square'] < X['LifeSquare'])
        X.loc[condition_2, 'LifeSquare'] = X.loc[condition_2, 'Square'] - X.loc[condition_2, 'KitchenSquare'] - 3

        # Total area above 300 is replaced by 30 per room.
        condition_3 = (X['Square'] > 300)
        X.loc[condition_3, 'Square'] = X.loc[condition_3, 'Rooms'] * 30
        X.loc[condition_3, 'LifeSquare'] = X.loc[condition_3, 'Square'] - X.loc[condition_3, 'KitchenSquare']

        return X
        

In [None]:
# Learn the medians on the training features only and apply the same
# cleaning rules to both the training and the test set.
imputer = FeatureImputer()

imputer.fit(X) # compute the medians

X = imputer.transform(X)
X_test = imputer.transform(X_test)

# 3. Feature engineering

In [None]:
class FeatureGenetator():
    """Generate new features.

    ``fit`` learns, from the training data, the size of every district with
    more than 50 rows, the median price per (popular district, rooms) pair
    and the median price per square category. ``transform`` adds the
    derived columns ``DistrictId_count``, ``new_district``, ``year_cat``,
    ``square_cat``, ``MedPriceByDistrict``, ``MedPriceBySquare`` and
    ``BldId`` and maps the binary A/B columns to 0/1.
    """

    # Left-closed bin edges: values below the first edge fall into category
    # 1, values in [edge[i-1], edge[i]) into category i + 1, values at or
    # above the last edge into the last category.
    _SQUARE_EDGES = (20, 50, 100, 150, 200)
    _YEAR_EDGES = (1941, 1946, 1977, 1978, 2001, 2014, 2019)

    def __init__(self):
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.med_price_by_district = None
    #    self.med_price_by_floor_year = None
        self.med_price_by_square = None

    def fit(self, X, y=None):
        """Learn district sizes and, when ``y`` is given, the target encodings.

        ``X`` is not modified. Returns ``self`` so calls can be chained.
        """
        X = X.copy()

        # DistrictID
        district = X['DistrictId'].value_counts()
        # Without this filter the leaderboard score collapses (>100th place,
        # R2 ~ 0.65) although the local validation score goes up.
        district = district[district > 50]

        self.DistrictId_counts = dict(district)

        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}

        # Target encoding
        if y is not None:
            df = X.copy()
            df['Price'] = y.values

            ## District
            # Blank out only the district label of unpopular districts.
            # Assigning NaN to the whole row (as the code used to do) also
            # wiped Square, so those rows silently dropped out of the
            # square-category medians computed below.
            df['DistrictId_popular'] = df['DistrictId'].copy()
            is_popular = df['DistrictId_popular'].isin(district.index)
            df.loc[~is_popular, 'DistrictId_popular'] = np.nan

            self.med_price_by_district = (
                df.groupby(['DistrictId_popular', 'Rooms'], as_index=False)
                  .agg({'Price': 'median'})
                  .rename(columns={'Price': 'MedPriceByDistrict',
                                   'DistrictId_popular': 'DistrictId'})
            )

            ## Square category
            df = self.square_to_cat(df)

            self.med_price_by_square = (
                df.groupby(['square_cat'], as_index=False)
                  .agg({'Price': 'median'})
                  .rename(columns={'Price': 'MedPriceBySquare'})
            )

        return self

    def transform(self, X):
        """Add the generated features to ``X`` and return the result.

        Columns are added to the frame that is passed in; the target
        encodings are attached with left merges, so the returned frame is a
        new object whose index has been reset by ``merge`` (row order is
        kept).
        """
        # DistrictId
        X['DistrictId_count'] = X['DistrictId'].map(self.DistrictId_counts)  # self.DistrictId_counts = {'id': value}

        X['new_district'] = 0
        X.loc[X['DistrictId_count'].isna(), 'new_district'] = 1

        # Assigned back instead of fillna(inplace=True) on a selected column,
        # which is chained assignment and may not update the frame.
        X['DistrictId_count'] = X['DistrictId_count'].fillna(5)

        # Binary features
        for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            X[column] = X[column].map(self.binary_to_numbers)  # {'A': 0, 'B': 1}

        # More categorical features
        X = self.year_to_cat(X)   # + year_cat column
        X = self.square_to_cat(X)  # + square_cat column

        # Target encoding
        if self.med_price_by_district is not None:
            X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')
        if self.med_price_by_square is not None:
            X = X.merge(self.med_price_by_square, on=['square_cat'], how='left')

        # Add a building id: rows sharing all of these attributes get the
        # same group number.
        # NOTE(review): the numbering is derived from the frame being
        # transformed, so the same building can get different ids in the
        # training and the test set - confirm this is acceptable.
        X["BldId"] = X.groupby(["DistrictId", "HouseFloor", "HouseYear", "Ecology_1",
                        "Ecology_2", "Ecology_3", "Social_1", "Social_2", "Social_3",
                         "Helthcare_2", "Shops_1", "Shops_2"]).ngroup()

        return X

    @staticmethod
    def _bin_left_closed(values, edges):
        """Return 1-based category numbers (float, NaN stays NaN) for ``values``."""
        numeric = values.to_numpy(dtype='float64')
        bins = np.searchsorted(np.asarray(edges, dtype='float64'), numeric, side='right') + 1.0
        return np.where(np.isnan(numeric), np.nan, bins)

    @staticmethod
    def square_to_cat(X):
        """Add ``square_cat`` (1..6) from Square and return ``X``.

        Categories: <20, [20, 50), [50, 100), [100, 150), [150, 200), >=200.
        The bins are contiguous; the earlier rules left gaps (for example
        19 <= Square < 20 or 49 < Square < 50) that ended up as NaN.
        """
        X['square_cat'] = FeatureGenetator._bin_left_closed(
            X['Square'], FeatureGenetator._SQUARE_EDGES)
        return X

    @staticmethod
    def year_to_cat(X):
        """Add ``year_cat`` (1..8) from HouseYear and return ``X``.

        Categories: <1941, 1941-1945, 1946-1976, 1977, 1978-2000,
        2001-2013, 2014-2018, >=2019.
        """
        X['year_cat'] = FeatureGenetator._bin_left_closed(
            X['HouseYear'], FeatureGenetator._YEAR_EDGES)
        return X

In [None]:
# Learn district counts and target encodings from the training set (with
# the target), then add the generated features to both sets.
features = FeatureGenetator()

features.fit(X, y)

X = features.transform(X)
X_test = features.transform(X_test)

In [None]:
# Share of missing values per feature (a fraction of the row count).
nan_df = (X.isna().sum() / X.shape[0]).reset_index()
nan_df.columns=['feature', 'nan_percent']

plt.figure(figsize=(16,4))
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally, and keywords work on older releases as well.
sns.barplot(x=nan_df['feature'], y=nan_df['nan_percent'])
plt.title('Percent Missing')
plt.ylabel('Missing', fontsize=12)
plt.xlabel('Features', fontsize=12)
plt.xticks(rotation=90)
plt.show()

In [None]:
X.info()
#X['Ecology_2'].head()

In [None]:
X_test.info()

In [None]:
# Columns that reduce_mem_usage turned into category in the original frame.
cat_columns = df.select_dtypes(['category']).columns

cat_columns
# Replace the categories by their integer codes.
# NOTE: this is applied to X only, not to X_test.
X[cat_columns] = X[cat_columns].apply(lambda c: c.cat.codes)

In [None]:
# Features kept for modelling; commented-out names are deliberately excluded.
remain_features = [#'Rooms', 
                   'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'Ecology_1', 'Social_1', 'Shops_1',
                  # 'Rooms_outlier',
                  # 'HouseFloor_outlier', #'HouseYear_outlier', 
                  # 'LifeSquare_nan',
                   'HouseFloor', 'HouseYear','BldId',# 'Ecology_2',
                   'DistrictId_count', #'new_district', 'Ecology_3',
                  # 'Shops_2',
                   'MedPriceByDistrict',
                 #  'MedPriceByFloorYear'
                   'MedPriceBySquare'
                  ]

# Restrict both sets to the same columns, in the same order.
X = X[remain_features]
X_test = X_test[remain_features]

In [None]:
# Split the remaining training columns into non-object and object dtypes
# and print both groups.
X_num_features = X.dtypes[X.dtypes != 'object'].index
X_cat_features = X.dtypes[X.dtypes == 'object'].index

print(f'qty of num features: {len(X_num_features)}')
print(X[X_num_features].columns)
print()
print(f'qty of cat features: {len(X_cat_features)}')
print(X[X_cat_features].columns)

In [None]:
# Same overview for the test set.
X_test_num_features = X_test.dtypes[X_test.dtypes != 'object'].index
X_test_cat_features = X_test.dtypes[X_test.dtypes == 'object'].index

print(f'qty of num features: {len(X_test_num_features)}')
print(X_test[X_test_num_features].columns)
print()
print(f'qty of cat features: {len(X_test_cat_features)}')
print(X_test[X_test_cat_features].columns)

In [None]:
# Histograms of all selected features (drawn from a copy of X).
df_num_features = X.copy()
df_num_features.hist(figsize=(16,16), bins=20, grid=False);

In [None]:
X.info()

In [None]:
# Price against total area. `fix` holds the figure object and is not used.
fix, ax = plt.subplots()
ax.scatter(x=X['Square'], y=y)
plt.ylabel('Price', fontsize=13)
plt.xlabel('Square', fontsize=13)
plt.show()

In [None]:
# Feature scaling is disabled.
#sc = StandardScaler()
#all_X_sc = sc.fit_transform(X)
#X = pd.DataFrame(all_X_sc)
X.head()

In [None]:
X.info()

# 4. Model & prediction

In [None]:
%%time

# Shallow CatBoost baseline: fit once on the full training set, then report
# the mean R2 over a shuffled 5-fold split.
model = CatBoostRegressor(depth=2, 
                          learning_rate=0.1, 
                          n_estimators=300,
                          loss_function='RMSE',
                          custom_metric='R2',
                          random_seed=42,
                          silent=True
                         )

model.fit(X, y, plot=True, verbose=False)

# NOTE(review): MedPriceByDistrict and MedPriceBySquare were fitted on the
# whole training set with the target, so every validation fold has already
# contributed to its own features; the score is probably optimistic.
cv_score = cross_val_score(
    model,
    X,
    y,
    scoring='r2',
    cv=KFold(
            n_splits=5,
            shuffle=True,
            random_state=42
    )
)

print(f'R2: {round(cv_score.mean(), 3)}')

In [None]:
%%time

# LightGBM alternative, scored with the same shuffled 5-fold split as the
# CatBoost baseline (same KFold settings and random_state).
lgbm_model = LGBMRegressor(max_depth=7,
                      #       min_samples_leaf=10,
                             n_estimators=300,
                             random_state=42,
                     num_leaves=30)

cv_score = cross_val_score(
    lgbm_model,
    X,
    y,
    scoring='r2',
    cv=KFold(
            n_splits=5,
            shuffle=True,
            random_state=42
    )
)

print(f'R2: {round(cv_score.mean(), 3)}')

In [None]:
%%time

# Final model: a deeper CatBoost regressor with more iterations.
# NOTE: no random seed is passed here, unlike the baseline models.
final_model = CatBoostRegressor(
    silent=True,
    learning_rate=0.1,
    iterations=1150,
    eval_metric='R2',
    depth=8
)

# Fit on the full training set; this fitted model produces the submission.
final_model.fit(X, y, plot=True)

# Cross-validated R2 with the same 5-fold split as the other models.
cv_score = cross_val_score(
    final_model,
    X,
    y,
    scoring='r2',
    cv=KFold(
            n_splits=5,
            shuffle=True,
            random_state=42
    )
)

print(f'R2: {round(cv_score.mean(), 3)}')

# Predict the test set and write the submission file (Id, Price).
y_pred_test = final_model.predict(X_test)

preds_test['Price'] = y_pred_test
preds_test.to_csv('predictions.csv', index=False)

preds_test.head()

In [None]:
# Normalised feature importances of the final model.
plot_feature_importances(importances = final_model.feature_importances_, X=X)