In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('houses_train_new.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [4]:
df['condition'].unique().shape #1

(3,)

In [5]:
df['district'].unique().shape #3

(13,)

In [6]:
df['street'].unique().shape #4

(350,)

In [7]:
df['region'].unique().shape

(1,)

In [8]:
df['url'].unique().shape

(5001,)

In [9]:
df['building_type'].unique().shape #2

(4,)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5001 non-null   int64  
 1   price           5001 non-null   float64
 2   condition       5001 non-null   object 
 3   district        5001 non-null   object 
 4   max_floor       5001 non-null   int64  
 5   street          5001 non-null   object 
 6   num_rooms       5001 non-null   int64  
 7   region          5001 non-null   object 
 8   area            5001 non-null   float64
 9   url             5001 non-null   object 
 10  num_bathrooms   5001 non-null   int64  
 11  building_type   5001 non-null   object 
 12  floor           5001 non-null   int64  
 13  ceiling_height  5001 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 547.1+ KB


In [11]:
X = df.drop(['price', 'Unnamed: 0',], axis = 1)
y = df.loc[:, df.columns == "price"]
X

Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4996,newly repaired,Arabkir,5,Griboedov St,3,Yerevan,97.0,http://www.myrealty.am/en/item/36852/3-senyaka...,1,stone,4,2.8
4997,newly repaired,Arabkir,4,Orbeli Yeghbayrner St,3,Yerevan,71.0,http://www.myrealty.am/en/item/13933/Apartment...,1,stone,4,2.8
4998,zero condition,Center,5,Mashtots Ave,1,Yerevan,40.0,http://www.myrealty.am/en/item/31190/1-senyaka...,1,stone,2,3.0
4999,newly repaired,Center,14,Argishti St,4,Yerevan,118.0,http://www.myrealty.am/en/item/25905/4-senyaka...,2,monolit,14,3.0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1234 )

In [13]:
X_train.head()

Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
894,good,Arabkir,5,V.Papazyan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/34245/3-senyaka...,1,stone,4,2.8
2322,good,Center,4,Leo St,3,Yerevan,97.0,http://www.myrealty.am/en/item/21554/3-senyaka...,2,stone,3,3.2
4592,newly repaired,Arabkir,12,Komitas Ave,2,Yerevan,80.0,http://www.myrealty.am/en/item/36588/2-senyaka...,1,panel,2,2.8
3711,newly repaired,Center,5,Tumanyan St,2,Yerevan,65.0,http://www.myrealty.am/en/item/34959/2-senyaka...,1,stone,2,3.0
4130,newly repaired,Qanaqer-Zeytun,9,Lepsus St,3,Yerevan,78.0,http://www.myrealty.am/en/item/19048/3-senyaka...,1,panel,6,3.0


In [14]:
X_train_num = X_train.select_dtypes(exclude=['object'])
X_train_num['ceiling_height'] = X_train['ceiling_height']
X_test_num = X_test.select_dtypes(exclude=['object'])
X_test_num['ceiling_height'] = X_test['ceiling_height']
X_train_num.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['ceiling_height'] = X_train['ceiling_height']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['ceiling_height'] = X_test['ceiling_height']


Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
894,5,3,80.0,1,4,2.8
2322,4,3,97.0,2,3,3.2
4592,12,2,80.0,1,2,2.8
3711,5,2,65.0,1,2,3.0
4130,9,3,78.0,1,6,3.0


In [15]:
X_train_num.corr()

Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
max_floor,1.0,0.037436,0.158072,0.103681,0.631435,-0.057251
num_rooms,0.037436,1.0,0.748989,0.270376,0.053953,0.062985
area,0.158072,0.748989,1.0,0.415477,0.106946,0.190076
num_bathrooms,0.103681,0.270376,0.415477,1.0,0.05659,0.207212
floor,0.631435,0.053953,0.106946,0.05659,1.0,-0.045996
ceiling_height,-0.057251,0.062985,0.190076,0.207212,-0.045996,1.0


In [16]:
X_train_num['condition'] = X_train['condition']
X_train_num['building_type'] = X_train['building_type']
X_train_num['district'] = X_train['district']
X_train_num['ceiling_height'] = X_train['ceiling_height']

# X.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['condition'] = X_train['condition']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['building_type'] = X_train['building_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['district'] = X_train['district']
A value is trying to be set on a copy of a slice from a

In [17]:
X_train_ohe = pd.get_dummies(X_train_num, prefix=['condition', 'building_type', 'district', 'ceiling_height'], columns=['condition', 'building_type', 'district', 'ceiling_height'])
X_train_ohe.head()

Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,condition_good,condition_newly repaired,condition_zero condition,building_type_monolit,building_type_other,...,district_Nor Norq,district_Norq Marash,district_Qanaqer-Zeytun,district_Shengavit,district_Vahagni district,ceiling_height_2.6,ceiling_height_2.8,ceiling_height_3.0,ceiling_height_3.2,ceiling_height_3.8
894,5,3,80.0,1,4,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2322,4,3,97.0,2,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4592,12,2,80.0,1,2,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3711,5,2,65.0,1,2,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4130,9,3,78.0,1,6,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [18]:
X_test_num['condition'] = X_test['condition']
X_test_num['building_type'] = X_test['building_type']
X_test_num['district'] = X_test['district']
X_test_num['ceiling_height'] = X_test['ceiling_height']

# X.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['condition'] = X_test['condition']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['building_type'] = X_test['building_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['district'] = X_test['district']
A value is trying to be set on a copy of a slice from a DataF

In [19]:
X_test_ohe = pd.get_dummies(X_test_num, prefix=['condition', 'building_type', 'district', 'ceiling_height'], columns=['condition', 'building_type', 'district', 'ceiling_height'])
X_test_ohe.head()

Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,condition_good,condition_newly repaired,condition_zero condition,building_type_monolit,building_type_other,...,district_Nor Norq,district_Norq Marash,district_Nubarashen,district_Qanaqer-Zeytun,district_Shengavit,ceiling_height_2.6,ceiling_height_2.8,ceiling_height_3.0,ceiling_height_3.2,ceiling_height_3.8
2706,10,1,41.0,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2436,9,5,133.0,3,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1201,5,3,90.0,2,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1486,10,4,104.0,1,2,0,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
4286,5,1,42.0,1,3,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [20]:
miss_col_in_test = list(set(X_train_ohe.columns)- set(X_test_ohe.columns))
print(len(miss_col_in_test))
miss_col_in_train = list(set(X_test_ohe.columns)- set(X_train_ohe.columns))
print(len(miss_col_in_train))
print('miss_col_in_train', miss_col_in_train)

1
1
miss_col_in_train ['district_Nubarashen']


In [21]:
X_test_ohe[miss_col_in_test] = 0
X_test_ohe = X_test_ohe.drop(miss_col_in_train, axis = 1)
X_test_ohe.head()
X_test_ohe =X_test_ohe.reindex(columns=X_train_ohe.columns)

In [22]:
deg = 3
poly = PolynomialFeatures(degree = deg, include_bias=False)
X_train_ohe_Poly = poly.fit_transform(X_train_ohe)
X_test_ohe_Poly = poly.fit_transform(X_test_ohe)


In [23]:
target_feature_names_train = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_train_num.columns,p) for p in poly.powers_]]
print('n of columns of target_feature_names_train', len(target_feature_names_train))
X_train_ohe_Poly = pd.DataFrame(X_train_ohe_Poly, columns = target_feature_names_train)

n of columns of target_feature_names_train 4959


In [24]:
target_feature_names_test = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_test_num.columns,p) for p in poly.powers_]]
print('n of columns of target_feature_names_test', len(target_feature_names_test))
X_test_ohe_Poly = pd.DataFrame(X_test_ohe_Poly, columns = target_feature_names_test)

n of columns of target_feature_names_test 4959


In [25]:
X_train_ohe_Poly.head()

Unnamed: 0,max_floor^1,num_rooms^1,area^1,num_bathrooms^1,floor^1,ceiling_height^1,condition^1,building_type^1,district^1,Unnamed: 10,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,5.0,3.0,80.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,3.0,97.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,12.0,2.0,80.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,2.0,65.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.0,3.0,78.0,1.0,6.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
linear = LinearRegression()
linear.fit(X_train_ohe_Poly, y_train)

In [27]:
y_train_pred = linear.predict(X_train_ohe_Poly)

In [28]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 22362.73460447465


In [29]:
y_test_pred = linear.predict(X_test_ohe_Poly)

In [30]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 17516356968.000774


In [31]:
elNet = ElasticNet( alpha=0.001, l1_ratio=0.01)

In [32]:

elNet.fit(X_train_ohe_Poly, y_train)

  model = cd_fast.enet_coordinate_descent(


In [33]:
y_train_pred = elNet.predict(X_train_ohe_Poly)

In [34]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 23000.097720741283


In [35]:
y_test_pred = elNet.predict(X_test_ohe_Poly)

In [36]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 31364.467108531095


In [37]:
grid = dict()
grid['alpha'] = [1e-5,1e-4,1e-3,1e-2,1e-1,1]
grid['l1_ratio'] = [1e-6,1e-5,1e-4,1e-3,1e-2,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
print(grid['l1_ratio'])

[1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


In [None]:
elastic=ElasticNet()

Regressor=GridSearchCV(elastic,grid,scoring='neg_mean_squared_error',cv=5)
Regressor.fit(X_train_ohe_Poly,y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [None]:
y_train_pred = Regressor.predict(X_train_ohe_Poly)

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [None]:
y_test_pred = Regressor.predict(X_test_ohe_Poly)

In [None]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [None]:
print('best parameter: ', Regressor.best_params_)

In [None]:
elNet = ElasticNet( alpha=0.001, l1_ratio=0.001)

In [None]:

elNet.fit(X_train_ohe_Poly, y_train)

In [None]:
y_train_pred = elNet.predict(X_train_ohe_Poly)


In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [None]:
y_test_pred = elNet.predict(X_test_ohe_Poly)

In [None]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [None]:
model = ...

In [None]:
# def final_predict(final_test_df):
#     1. preprocessing of final_test_df (scaling, one hot encoding ...)
#     2. make sure that columns and their order in train and test are the same
#     3. return predictions

In [None]:
# df = pd.read_csv('houses_test.csv')
# final_predict(df)