In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [62]:
df = pd.read_csv('Houses_train.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,5546,130000.0,newly repaired,Center,4,Sayat Nova Ave,3,Yerevan,96.0,http://www.myrealty.am/en/item/28244/3-senyaka...,1,stone,3,3.2
1,2979,65000.0,good,Arabkir,5,Hr.Kochar St,3,Yerevan,78.0,http://www.myrealty.am/en/item/18029/3-senyaka...,1,stone,2,2.8
2,2698,129000.0,good,Center,10,M.Khorenatsi St,3,Yerevan,90.0,http://www.myrealty.am/en/item/37797/3-senyaka...,1,panel,3,2.8
3,4548,52000.0,newly repaired,Center,14,Argishti St,2,Yerevan,53.0,http://www.myrealty.am/en/item/36153/2-senyaka...,1,monolit,5,3.0
4,2982,65000.0,newly repaired,Center,12,Mashtots Ave,2,Yerevan,47.0,http://www.myrealty.am/en/item/17566/2-senyaka...,1,panel,3,2.8


In [63]:
#Removing unnecessary features
cols=[]
for col in df.columns:
    if df[col].nunique()==1 or df[col].nunique()==len(df):
            cols.append(col)
df=df.drop(cols, axis=1)
df.head()

Unnamed: 0,price,condition,district,max_floor,street,num_rooms,area,num_bathrooms,building_type,floor,ceiling_height
0,130000.0,newly repaired,Center,4,Sayat Nova Ave,3,96.0,1,stone,3,3.2
1,65000.0,good,Arabkir,5,Hr.Kochar St,3,78.0,1,stone,2,2.8
2,129000.0,good,Center,10,M.Khorenatsi St,3,90.0,1,panel,3,2.8
3,52000.0,newly repaired,Center,14,Argishti St,2,53.0,1,monolit,5,3.0
4,65000.0,newly repaired,Center,12,Mashtots Ave,2,47.0,1,panel,3,2.8


In [64]:
for col in df.columns:
    print("\n")
    print(pd.DataFrame(df[col].value_counts()))



          price
65000.0     140
60000.0     130
45000.0     129
70000.0     124
85000.0     122
...         ...
33700.0       1
65500.0       1
35600.0       1
173000.0      1
96900.0       1

[333 rows x 1 columns]


                condition
newly repaired       2867
good                 1713
zero condition        420


                  district
Center                2086
Arabkir               1320
Avan                   252
Davtashen              235
Malatia-Sebastia       235
Nor Norq               233
Qanaqer-Zeytun         201
Achapnyak              192
Shengavit              163
Erebuni                 75
Norq Marash              5
Vahagni district         2
Nubarashen               1


    max_floor
5        1356
9        1009
4         501
14        448
10        325
16        261
12        193
6         165
11        152
7         121
13        101
15         80
3          61
8          58
17         45
18         41
19         17
21         16
23         14
22         14


In [65]:
df.describe()

Unnamed: 0,price,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,85660.0866,8.6976,2.6908,81.5334,1.1662,5.1666,2.89476
std,51328.921854,4.148349,0.822758,24.715806,0.40867,3.395578,0.144861
min,18500.0,1.0,1.0,27.0,1.0,0.0,2.6
25%,50000.0,5.0,2.0,65.0,1.0,3.0,2.8
50%,70000.0,9.0,3.0,80.0,1.0,4.0,2.8
75%,105000.0,11.0,3.0,97.0,1.0,7.0,3.0
max,550000.0,23.0,6.0,149.0,4.0,22.0,3.8


In [66]:
df.head()

Unnamed: 0,price,condition,district,max_floor,street,num_rooms,area,num_bathrooms,building_type,floor,ceiling_height
0,130000.0,newly repaired,Center,4,Sayat Nova Ave,3,96.0,1,stone,3,3.2
1,65000.0,good,Arabkir,5,Hr.Kochar St,3,78.0,1,stone,2,2.8
2,129000.0,good,Center,10,M.Khorenatsi St,3,90.0,1,panel,3,2.8
3,52000.0,newly repaired,Center,14,Argishti St,2,53.0,1,monolit,5,3.0
4,65000.0,newly repaired,Center,12,Mashtots Ave,2,47.0,1,panel,3,2.8


In [67]:
X=df[df.columns.difference(['price'])]
y=df['price']

In [68]:
def find_anomalies(data):
    #define a list to accumlate anomalies and indexes of anomalies
    anomalies = []
    indexes = []
    
    # Set upper and lower limit to 3 standard deviation
    data_std = np.std(data)
    data_mean = np.mean(data)
    anomaly_cut_off = data_std * 3

    lower_limit  = data_mean - anomaly_cut_off 
    upper_limit = data_mean + anomaly_cut_off
    
    # Generate outliers
    for outlier in data:
        if outlier > upper_limit or outlier < lower_limit:
            anomalies.append(outlier)
    
    # Generate indexes of outliers
    for i in anomalies:
        indexes=data[data==i].index
    return indexes
    #return '%s' %data.name, indexes

In [69]:
# Get indexes of outliers for all columns in one list
indexes=[]
for col in X.columns:
    if type(X[col][0])!=str:
        idx=find_anomalies(X[col])
        indexes.extend(idx)
indexes=list(set(indexes))
print(indexes)

[7, 1031, 3599, 3091, 532, 23, 3096, 1562, 27, 1574, 1075, 2611, 574, 2113, 3649, 1605, 2631, 1100, 598, 3159, 2139, 1640, 1648, 4722, 3193, 1150, 129, 1667, 4747, 2190, 3726, 660, 1686, 1687, 4760, 3752, 3754, 3244, 3249, 2738, 3250, 4787, 3776, 2759, 4809, 4810, 2763, 1740, 3795, 4312, 3289, 4316, 3804, 742, 236, 3309, 2801, 4338, 3832, 1272, 3324, 3840, 4352, 3332, 2318, 4880, 1304, 1309, 3360, 3896, 3902, 3390, 4416, 2372, 845, 3919, 1874, 345, 350, 4450, 3434, 2927, 4464, 4465, 4470, 2422, 3446, 4477, 2942, 385, 899, 1928, 397, 4504, 412, 4528, 1969, 4019, 2996, 4027, 2494, 2495, 3519, 1991, 4040, 458, 1995, 4051, 4054, 1497, 3550, 4070, 2552, 4092]


In [70]:
# Drop outliers and reindex
df.drop(axis=0, index=indexes, inplace=True)
df.index=range(len(df))

In [71]:
y=df['price']
X=pd.get_dummies(df[df.columns.difference(['price'])])

In [72]:
#define model
model = Ridge()
#define model evaluation method
cv=KFold(n_splits=10,shuffle=True,random_state=1)
#define grid
alphas={'alpha':[0.001,0.01,0.03,0.05,0.07,0.09,0.1,0.5,0.7,0.9,0.99,1,1.1,1.3,1.4,1.5,2,3,5,7,9,15,20,50]}
grid=GridSearchCV(model,alphas, scoring="neg_root_mean_squared_error", cv=cv)
result = grid.fit(X, y)
print('RMSE: %.3f' % result.best_score_)
print('Config: %s' % result.best_params_)

RMSE: -23476.446
Config: {'alpha': 1.4}


In [73]:
predictions_train = cross_val_predict(Ridge(alpha=1.4),X, y, cv=10)
print('R-squared for train data: %.2f'
      % r2_score(y, predictions_train))
print('Root mean squared error for train data: %.2f'
      % np.sqrt(mean_squared_error(y, predictions_train)))

R-squared for train data: 0.78
Root mean squared error for train data: 23437.74


### DATA PREPROCESSING FOR TEST DATA

In [None]:
df_test=pd.read_csv('')

In [149]:
#for removing the values from test data that are missing in train data(=df)
def remove_missing_values(test_data, train_data):   
    indexes=[]
    n_columns=0
    for col in test_data.columns:
        if type(train_data[col][0])==str:
            n_columns=n_columns+1
            for i in test_data[col].unique():
                if i in train_data[col].unique():
                    indexes.extend(test_data[col][test_data[col] == i].index)
    idx=[]
    for ix in indexes:
        if indexes.count(ix)==n_columns:
            idx.append(ix)
    idx=list(set(idx))
    test_data=test_data.loc[idx]
    print(test_data.head())

In [None]:
remove_missing_values(df_test, df)

In [None]:
X_test=df_test.iloc[:,1:]
X_test=pd.get_dummies(X_test)
y_test=df_test.iloc[:,0:1]

In [None]:
predictions = cross_val_predict(Ridge(alpha=1.5),X_test, y_test, cv=10)
print('R-squared for train data: %.2f'
      % r2_score(y, predictions))
print('Root mean squared error for train data: %.2f'
      % np.sqrt(mean_squared_error(y, predictions)))