In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('furniture.csv', names=['item_id','name','category','old_price','sellable_online'
                            ,'link','other_colors','short_description','designer','depth'
                            ,'height','width','price'],skiprows=1, header=None)


In [3]:
categs = ['category','sellable_online','other_colors','depth','height','width','price']
furniture_data = df[categs]

In [4]:
furniture_data.isnull().sum()

category              0
sellable_online       0
other_colors          0
depth              1463
height              988
width               589
price                 0
dtype: int64

In [4]:
col_names = furniture_data.columns
for c in col_names:
    furniture_data = furniture_data.replace("?", np.NaN)
furniture_data = furniture_data.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [6]:
furniture_data.isnull().sum()

category           0
sellable_online    0
other_colors       0
depth              0
height             0
width              0
price              0
dtype: int64

In [5]:
cat_col=['category','sellable_online','other_colors']
labelEncoder = preprocessing.LabelEncoder()
mapping_dict ={}

In [6]:
for col in cat_col:
    furniture_data[col] = labelEncoder.fit_transform(furniture_data[col])
    le_name_mapping = dict(zip(labelEncoder.classes_,labelEncoder.transform(labelEncoder.classes_)))
    mapping_dict[col]= le_name_mapping
print(mapping_dict, "\n" )

{'category': {'Bar furniture': 0, 'Beds': 1, 'Bookcases & shelving units': 2, 'Cabinets & cupboards': 3, 'Café furniture': 4, 'Chairs': 5, 'Chests of drawers & drawer units': 6, "Children's furniture": 7, 'Nursery furniture': 8, 'Outdoor furniture': 9, 'Room dividers': 10, 'Sideboards, buffets & console tables': 11, 'Sofas & armchairs': 12, 'TV & media furniture': 13, 'Tables & desks': 14, 'Trolleys': 15, 'Wardrobes': 16}, 'sellable_online': {False: 0, True: 1}, 'other_colors': {'No': 0, 'Yes': 1}} 



In [7]:
X = furniture_data.drop('price', axis=1)  # features
y = furniture_data['price'] # labels 

In [11]:
X

Unnamed: 0,category,sellable_online,other_colors,depth,height,width
0,0,1,0,40.0,99.0,51.0
1,0,0,0,40.0,105.0,80.0
2,0,0,0,40.0,74.0,60.0
3,0,1,1,50.0,100.0,60.0
4,0,1,0,60.0,43.0,74.0
...,...,...,...,...,...,...
3689,16,1,0,50.0,74.0,91.0
3690,16,1,0,50.0,74.0,135.0
3691,16,1,0,50.0,74.0,175.0
3692,16,1,0,50.0,74.0,178.0


In [9]:
X.dtypes

category             int32
sellable_online      int64
other_colors         int32
depth              float64
height             float64
width              float64
dtype: object

In [10]:
X.category.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 13, 16])

In [12]:
X.sellable_online.unique()

array([1, 0], dtype=int64)

In [13]:
X.other_colors.unique()

array([0, 1])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [11]:
# #### Linear Regression

from sklearn.linear_model import LinearRegression

LRregressor = LinearRegression()
LRregressor.fit(X_train,y_train)

# Calculate the training and test score
print("LRregressor / Train score is :", LRregressor.score(X_train,y_train))
print("LRregressor / Test score is :", LRregressor.score(X_test,y_test))

# Calculate the prediction of the model
ypred_LR = LRregressor.predict(X_test)
print("Prediction of Linear Regression:", ypred_LR)

LRregressor / Train score is : 0.5117190896148673
LRregressor / Test score is : 0.4441165534782626
Prediction of Linear Regression: [1667.39822709 2301.90735864  546.06119476 ...  763.45692838  831.94283947
  371.45651531]


In [12]:
# #### Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

DTregressor = DecisionTreeRegressor(max_depth=10, random_state=1)
DTregressor.fit(X_train,y_train)

# Calculate the training and test score
print("Decision Tree Regressor / Train score is :", DTregressor.score(X_train,y_train))
print("Decision Tree Regressor / Test score is :", DTregressor.score(X_test,y_test))


# Calculate the prediction of the model
ypred_DTr = DTregressor.predict(X_test)
print("Prediction of Decision Tree:", ypred_DTr)

Decision Tree Regressor / Train score is : 0.8681156843708964
Decision Tree Regressor / Test score is : 0.6272993227536424
Prediction of Decision Tree: [1155.7125     2874.125       338.9        ... 1522.49206349  665.
   53.07027027]


In [13]:
# #### Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

RFregressor = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=1)
RFregressor.fit(X_train,y_train)

# Calculate the training and test score
print("Random Forest Regressor / Train score is :", RFregressor.score(X_train,y_train))
print("Random Forest Regressor / Test score is :", RFregressor.score(X_test,y_test))
#print("Random Forest Regressor / N° of features  are : ", RFregressor.n_features_)
print("Important features are : " , RFregressor.feature_importances_)


# Calculate the prediction of the model
ypred_RFr = RFregressor.predict(X_test)
print("Prediction of Random Forest :", ypred_RFr)

Random Forest Regressor / Train score is : 0.5890069382678234
Random Forest Regressor / Test score is : 0.4817184118460528
Important features are :  [3.02612097e-02 0.00000000e+00 2.27563922e-04 7.97193786e-02
 2.10046518e-02 8.68787196e-01]
Prediction of Random Forest : [1354.48782927 2076.0810553   635.71835673 ...  639.84974755  225.92078429
  635.71835673]


In [14]:
from sklearn.svm import SVR

SVMregressor = SVR(kernel='linear')
SVMregressor.fit(X_train, y_train)

# Calculate the training and test score
print("SVM regressor / Train score is :", SVMregressor.score(X_train,y_train))
print("SVM regressor / Test score is :", SVMregressor.score(X_test,y_test))

SVM regressor / Train score is : 0.4639902599583918
SVM regressor / Test score is : 0.3973281417306833


In [15]:
# Calculate the prediction of the model
ypred_SVMr = SVMregressor.predict(X_test)
print("Prediction of SVM :", ypred_SVMr)

Prediction of SVM : [1415.87356967 2051.36220977  580.88569529 ...  441.91700973  804.69055207
  194.73957799]


In [16]:
### Saving the best model ###

# Saving model to disk
pickle.dump(DTregressor, open('model.pkl','wb'))

In [17]:
from bs4 import BeautifulSoup
import requests
url = "https://www.um6p.ma/fr"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
print(soup.title)

ConnectTimeout: HTTPSConnectionPool(host='www.um6p.ma', port=443): Max retries exceeded with url: /fr (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000272988B0350>, 'Connection to www.um6p.ma timed out. (connect timeout=None)'))