In [7]:
import pandas as pd
import numpy as np

diamonds = pd.read_csv('diamonds.csv')
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [11]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [13]:
from sklearn.model_selection import train_test_split

diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42, test_size=0.15)
print(diamonds_train.head())

       carat        cut color clarity  depth  table  price     x     y     z
13713   0.30      Ideal     E     VS2   62.3   56.0    603  4.27  4.30  2.67
3481    0.81      Ideal     G     VS2   61.5   55.0   3397  6.00  6.06  3.71
343     0.71  Very Good     E     VS2   64.0   57.0   2804  5.66  5.68  3.63
22822   1.55  Very Good     E     SI1   62.4   58.0  10851  7.36  7.42  4.61
51658   0.30      Ideal     G     VS2   61.2   55.0    545  4.35  4.38  2.67


In [14]:
diamonds_features = diamonds.drop('price', axis=1)
num_attribs = diamonds_features.drop(['cut', 'color', 'clarity'], axis=1).columns
cat_attribs = diamonds_features.drop(num_attribs, axis=1).columns

print(num_attribs)
print(cat_attribs)

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Index(['cut', 'color', 'clarity'], dtype='object')


In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

num_pipeline = Pipeline([
 ('selector', DataFrameSelector(num_attribs)),
 ('std_scaler', StandardScaler()),
 ])

cat_pipeline = Pipeline([
 ('selector', DataFrameSelector(cat_attribs)),
 ('1hot_encoder', OneHotEncoder()),
 ])

full_pipeline = FeatureUnion(transformer_list=[
 ("num_pipeline", num_pipeline),
 ("cat_pipeline", cat_pipeline),
 ]) 

In [17]:
diamonds_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45849 entries, 13713 to 15795
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    45849 non-null  float64
 1   cut      45849 non-null  object 
 2   color    45849 non-null  object 
 3   clarity  45849 non-null  object 
 4   depth    45849 non-null  float64
 5   table    45849 non-null  float64
 6   price    45849 non-null  int64  
 7   x        45849 non-null  float64
 8   y        45849 non-null  float64
 9   z        45849 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.8+ MB


In [18]:
X_train = full_pipeline.fit_transform(diamonds_train)
y_train = diamonds_train['price']

X_train

<45849x26 sparse matrix of type '<class 'numpy.float64'>'
	with 412641 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [20]:
from sklearn.metrics import mean_absolute_error

y_predicted = forest_reg.predict(X_train)
mean_absolute_error(y_predicted, y_train)

101.06117850227471

In [21]:
X_test = full_pipeline.transform(diamonds_test)
y_test = diamonds_test['price'].values

y_predicted = forest_reg.predict(X_test)
mean_absolute_error(y_predicted, y_test)

265.42200216927296

In [1]:
print('predicted: ', list(y_predicted[-5:]))
print('actual: ', y_test[-5:])

NameError: ignored

In [None]:
from sklearn.svm import LinearSVC

linearsvc_reg = LinearSVC(penalty='l2', C=1.0, dual=False)
linearsvc_reg.fit(X_train, y_train)