In [1]:
import numpy as np
import pandas as pd

### Reading data

In [2]:
data = pd.read_csv('input/train.csv')
data.shape

(10000, 20)

### Splitting data to train and valid

In [3]:
from sklearn.model_selection import train_test_split
# try to use stratify
train, valid = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
train.shape, valid.shape

((8000, 20), (2000, 20))

### Preparing data

In [5]:
# train.corr()
# lowest correlation: HouseYear
# high correlation: Rooms, Square, Floor, Social_1, Social_2, Helthcare_2, Shops_1, large_district, rich_district, big_flats_district, mean_dr_price, mean_d_square, mean_d_price, mean_sqm_price

In [6]:
mean_square = train['Square'].mean()
mean_price = train['Price'].mean()
mean_life_square = train['LifeSquare'].median()

In [7]:
def clean_up(df):
    # Healthcare_1 = too much data lost
    # LifeSquare - lots of data lost and contains lots of errors
    # KitchenSquare - lots of errors
    # HouseYear - errors, low correlation
    df = df.drop(['Healthcare_1', 'KitchenSquare', 'HouseYear'], axis=1)
    return df

In [8]:
def fix_floor(df):
    df.loc[df['Floor'] > df['HouseFloor'], 'Floor'] = df['HouseFloor']
    return df

In [9]:
def fix_square(df, square=mean_square):
    df.loc[df['Square'] > 250, 'Square'] = square
    return df

In [10]:
def fix_rooms(df):
    df.loc[df['Rooms'] > 6, 'Rooms'] = 6
    return df

In [11]:
def fix_lifesquare(df):
    df.loc[(df['LifeSquare'] < 10) & (df['Square'] > 10), 'LifeSquare'] = df.loc[(df['LifeSquare'] < 10) & (df['Square'] > 10), 'Square']*0.72
    df.loc[df['LifeSquare'].isnull(), 'LifeSquare'] = df.loc[df['LifeSquare'].isnull(), 'Square']*0.72
    return df

In [12]:
def make_dummies(df):
    for d in ['Shops_2', 'Ecology_2', 'Ecology_3']:
        df[d] = (df[d] == 'A').astype(int)
    return df

In [13]:
mean_sp_dr = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Square','Price']].mean().rename(columns={'Price':'mean_dr_price', 'Square': 'mean_dr_square'})
mean_sp_d = train.groupby(['DistrictId'], as_index=False)[['Square', 'Price']].mean().rename(columns={'Price':'mean_d_price', 'Square': 'mean_d_square'})

In [14]:
large_district = pd.DataFrame((train['DistrictId'].value_counts() > 100).astype(int)).reset_index().rename(columns={'DistrictId': 'large_district', 'index': 'DistrictId'})

In [15]:
rich_district = pd.DataFrame((train['Price'] > mean_price).astype(int)).reset_index().rename(columns={'Price': 'rich_district', 'index': 'DistrictId'})

In [16]:
big_flats_district = pd.DataFrame((train['Square'] > mean_square).astype(int)).reset_index().rename(columns={'Square': 'big_flats_district', 'index': 'DistrictId'})

In [17]:
def add_distirct_attributes(df, large_districts=large_district, rich_districts=rich_district, big_flats=big_flats_district):
    df = pd.merge(df, large_districts, on=['DistrictId'], how='left')
    df['large_district'] = df['large_district'].fillna(0)
    
    df = pd.merge(df, rich_districts, on=['DistrictId'], how='left')
    df['rich_district'] = df['rich_district'].fillna(0)
    
    df = pd.merge(df, big_flats, on=['DistrictId'], how='left')
    df['big_flats_district'] = df['big_flats_district'].fillna(0)
    return df

In [18]:
def add_mean_prices(df, mean_for_dr=mean_sp_dr, mean_for_d=mean_sp_d, price=mean_price, square=mean_square):
    df = pd.merge(df, mean_for_dr, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, mean_for_d, on=['DistrictId'], how='left')
    
    df['mean_dr_price'] = df['mean_dr_price'].fillna(price)
    df['mean_d_price'] = df['mean_d_price'].fillna(df['mean_dr_price'])
    
    df['mean_dr_square'] = df['mean_dr_square'].fillna(price)
    df['mean_d_square'] = df['mean_d_square'].fillna(df['mean_dr_square'])
    
    df['mean_sqm_price'] = df['mean_d_price'] / df['mean_d_square']
    return df

In [19]:
def prepare_data(df):
    df = clean_up(df)
    df = fix_rooms(df)
    df = fix_square(df)
    df = fix_floor(df)
    df = fix_lifesquare(df)
    df = make_dummies(df)
    df = add_distirct_attributes(df)
    df = add_mean_prices(df)
    return df

In [20]:
train = prepare_data(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 0 to 7999
Data columns (total 25 columns):
Id                    8000 non-null int64
DistrictId            8000 non-null int64
Rooms                 8000 non-null float64
Square                8000 non-null float64
LifeSquare            8000 non-null float64
Floor                 8000 non-null float64
HouseFloor            8000 non-null float64
Ecology_1             8000 non-null float64
Ecology_2             8000 non-null int64
Ecology_3             8000 non-null int64
Social_1              8000 non-null int64
Social_2              8000 non-null int64
Social_3              8000 non-null int64
Helthcare_2           8000 non-null int64
Shops_1               8000 non-null int64
Shops_2               8000 non-null int64
Price                 8000 non-null float64
large_district        8000 non-null int64
rich_district         8000 non-null float64
big_flats_district    8000 non-null float64
mean_dr_square        8000 non-null

In [21]:
valid = prepare_data(valid)
valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 25 columns):
Id                    2000 non-null int64
DistrictId            2000 non-null int64
Rooms                 2000 non-null float64
Square                2000 non-null float64
LifeSquare            2000 non-null float64
Floor                 2000 non-null float64
HouseFloor            2000 non-null float64
Ecology_1             2000 non-null float64
Ecology_2             2000 non-null int64
Ecology_3             2000 non-null int64
Social_1              2000 non-null int64
Social_2              2000 non-null int64
Social_3              2000 non-null int64
Helthcare_2           2000 non-null int64
Shops_1               2000 non-null int64
Shops_2               2000 non-null int64
Price                 2000 non-null float64
large_district        2000 non-null float64
rich_district         2000 non-null float64
big_flats_district    2000 non-null float64
mean_dr_square        2000 non-nu

### Preprocessing

In [22]:
# feats = ['Rooms', 'Square', 'LifeSquare', 'Floor', 'Social_1', 'Social_2', 'Helthcare_2', 'large_district', 'rich_district', 'big_flats_district', 'mean_d_price', 'mean_dr_price', 'mean_dr_square', 'mean_sqm_price']
feats = ['Rooms', 'Square', 'LifeSquare', 'Floor', 'HouseFloor', 'Social_1', 'Social_2', 'Helthcare_2', 'Shops_1', 'large_district', 'rich_district', 'big_flats_district', 'mean_d_price', 'mean_dr_price', 'mean_dr_square', 'mean_sqm_price']

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [24]:
train_scaled = pd.DataFrame(scaler.fit_transform(train.loc[:, feats]), columns=feats)
valid_scaled = pd.DataFrame(scaler.transform(valid.loc[:, feats]), columns=feats)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [25]:
from sklearn.cluster import KMeans

In [26]:
kmeans = KMeans(n_clusters=3, random_state=42)
labels_train = kmeans.fit_predict(train_scaled)
labels_valid = kmeans.predict(valid_scaled)

labels_train.shape, labels_valid.shape

((8000,), (2000,))

### Model

In [27]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

In [28]:
def get_prediction(model, df, target=None, evaluate='Yes'):
    pred = model.predict(df)
    if evaluate == 'Yes' and target is not None:
        print('R2: %s' % r2(target, pred))
        print('MSE: %s' % mse(target, pred))
    return pred

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [30]:

parameters = [{'n_estimators': [150, 200, 250, 300, 350, 400], 
               'max_features': np.arange(5, 13),
               'max_depth': np.arange(7, 10)}]

clf = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                  param_grid=parameters,
                  scoring='r2',
                  cv=5)


def get_best_params(clf, df, target, labels, cluster_count=2):
    best = {}
    for i in range(0, cluster_count):
        clf.fit(df[labels==i], target[labels==i])
        print('Best params for %d:' % i, clf.best_params_)
        best[i] = clf.best_params_
        
    return best
                     

In [31]:
def train_best(best, df, target, labels, cluster_count=2):
    result = {}
    for i in best:
        b = RandomForestRegressor(random_state=42, **best[i])
        b.fit(df[labels==i], target[labels==i])
        result[i] = b
    return result

In [32]:
# train_scaled[labels_train==2].info()

In [33]:
# best_params = get_best_params(clf, df=train_scaled, target=train['Price'], labels=labels_train, cluster_count=3)
best_params = {
    0: {'max_depth': 9, 'max_features': 7, 'n_estimators': 400},
    1: {'max_depth': 7, 'max_features': 6, 'n_estimators': 350},
    2: {'max_depth': 9, 'max_features': 6, 'n_estimators': 150}
}

In [34]:
best_models = train_best(best_params, df=train_scaled, target=train['Price'], labels=labels_train, cluster_count=3)

In [35]:
train_pred = {}
for i in best_models:
    train_pred[i] = get_prediction(best_models[i], train_scaled[labels_train==i], train['Price'][labels_train==i])

R2: 0.8964200459905027
MSE: 267570120.17949018
R2: 0.6976043376025217
MSE: 3218469376.307418
R2: 0.7332466151466923
MSE: 691171823.3908111


In [36]:
train_all = np.hstack([train['Price'][labels_train==i] for i in best_models])
train_pred_all = np.hstack(train_pred.values())

print('R2 for all Train: %s' % r2(train_all, train_pred_all))

R2 for all Train: 0.8625470416383223


In [37]:
valid_pred = {}
for i in best_models:
    valid_pred[i] = get_prediction(best_models[i], valid_scaled[labels_valid==i], valid['Price'][labels_valid==i])

R2: 0.7612444792814325
MSE: 712582230.034672
R2: 0.33417529831156056
MSE: 7594597988.284365
R2: 0.46192823050051435
MSE: 1583655190.9477704


In [38]:
valid_all = np.hstack([valid['Price'][labels_valid==i] for i in best_models])
valid_pred_all = np.hstack(valid_pred.values())

print('R2 for all Valid: %s' % r2(valid_all, valid_pred_all))

R2 for all Valid: 0.6869213452217804


### Test

In [39]:
test = pd.read_csv('input/test.csv')
test.shape

(5000, 19)

In [40]:
test = prepare_data(test)
test_scaled = pd.DataFrame(scaler.transform(test.loc[:, feats]), columns=feats)
labels_test = kmeans.predict(test_scaled)

  


In [41]:
for i in best_models:
    test.loc[labels_test==i, 'Price'] = get_prediction(best_models[i], test_scaled[labels_test==i], target=None, evaluate='No')

In [42]:
test['Price'].describe()

count      5000.000000
mean     214122.818134
std       76701.297964
min       68112.650300
25%      164658.199929
50%      195968.207157
75%      247418.179068
max      560870.822370
Name: Price, dtype: float64

In [43]:
test.loc[:, ['Id', 'Price']].to_csv('MLevanov_predictions.csv', index=False)

In [44]:
test_check = pd.read_csv('MLevanov_predictions.csv')
test_check.shape

(5000, 2)

In [45]:
test_check.columns

Index(['Id', 'Price'], dtype='object')