# Importing the Modules

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style='whitegrid')

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Importing the data

**Data from https://www.kaggle.com/c/diamonds-price**

In [2]:
diamond_data = pd.read_csv('../data/train.csv')

In [3]:
diamond_data.shape

(40455, 11)

In [4]:
diamond_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.53,Very Good,G,SI1,63.4,54.0,5.09,5.13,3.24,7.057
1,1,0.41,Ideal,D,SI1,63.0,56.0,4.8,4.75,3.01,6.824
2,2,0.32,Ideal,I,VS2,61.6,56.0,4.37,4.39,2.7,6.107
3,3,0.31,Ideal,H,VVS2,61.2,56.0,4.34,4.37,2.66,6.39
4,4,1.35,Premium,J,VS2,60.5,56.0,7.19,7.12,4.33,8.741


In [5]:
diamond_data = diamond_data.drop(["id"],axis=1)
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.53,Very Good,G,SI1,63.4,54.0,5.09,5.13,3.24,7.057
1,0.41,Ideal,D,SI1,63.0,56.0,4.8,4.75,3.01,6.824
2,0.32,Ideal,I,VS2,61.6,56.0,4.37,4.39,2.7,6.107
3,0.31,Ideal,H,VVS2,61.2,56.0,4.34,4.37,2.66,6.39
4,1.35,Premium,J,VS2,60.5,56.0,7.19,7.12,4.33,8.741


In [6]:
diamond_data[['x','y','z']] = diamond_data[['x','y','z']].replace(0,np.NaN)
diamond_data.isnull().sum()

carat       0
cut         0
color       0
clarity     0
depth       0
table       0
x           7
y           6
z          15
price       0
dtype: int64

In [7]:
diamond_data.dropna(inplace=True)
diamond_data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [8]:
diamond_data['volume'] = diamond_data['x']*diamond_data['y']*diamond_data['z']
diamond_data.drop(['x', 'y', 'z'], axis=1, inplace=True)
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,volume
0,0.53,Very Good,G,SI1,63.4,54.0,7.057,84.601908
1,0.41,Ideal,D,SI1,63.0,56.0,6.824,68.628
2,0.32,Ideal,I,VS2,61.6,56.0,6.107,51.79761
3,0.31,Ideal,H,VVS2,61.2,56.0,6.39,50.449028
4,1.35,Premium,J,VS2,60.5,56.0,8.741,221.664824


In [9]:
def outliers(var):
    a = []
    q1 = diamond_data[var].quantile(.25)
    q2 = diamond_data[var].quantile(.5)
    q3 = diamond_data[var].quantile(.75)
    iqr = q3-q1
    ulim = float(q3+(1.5*iqr))
    llim = float(q1-(1.5*iqr))

    for i in diamond_data[var]:
        if i > ulim:
            i = np.NaN
        elif i < llim:
            i = np.NaN
        else:
            i=i
        a.append(i)
    return a

for col in diamond_data.select_dtypes(exclude='object').columns:
    diamond_data[col] = outliers(col)

In [10]:
for i in diamond_data.select_dtypes(exclude='object').columns:
    diamond_data[i] = diamond_data[i].fillna(diamond_data[i].mean())

In [11]:
diamond_data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
volume     0
dtype: int64

In [12]:
numerical_cols = diamond_data.select_dtypes(include=np.number).columns.to_list()
categorical_cols = diamond_data.select_dtypes(exclude=np.number).columns.to_list()

In [13]:
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder() 

diamond_data['cut'] = label_encoder.fit_transform(diamond_data['cut'])
diamond_data['color'] = label_encoder.fit_transform(diamond_data['color'])
diamond_data['clarity'] = label_encoder.fit_transform(diamond_data['clarity'])
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,volume
0,0.53,4,3,2,63.4,54.0,7.057,84.601908
1,0.41,2,0,2,63.0,56.0,6.824,68.628
2,0.32,2,5,5,61.6,56.0,6.107,51.79761
3,0.31,2,4,7,61.2,56.0,6.39,50.449028
4,1.35,3,6,5,60.5,56.0,8.741,221.664824


In [14]:
X = diamond_data.drop(['price'],1)
y = diamond_data['price']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=123)

In [16]:
regr = LinearRegression()
model = regr.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_pred)

0.27978995502949117

In [19]:
rr  = RandomForestRegressor()
rr.fit(X_train,y_train)
y_pred = rr.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.08675268836137623

In [20]:
n_estimators = [int(x) for x in np.linspace(10,200,10)]
max_depth = [int(x) for x in np.linspace(10,100,10)]
min_samples_split = [2,3,4,5,10]
min_samples_leaf = [1,2,4,10,15,20]
random_grid = {'n_estimators':n_estimators,'max_depth':max_depth,
               'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}

random_grid

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'min_samples_split': [2, 3, 4, 5, 10],
 'min_samples_leaf': [1, 2, 4, 10, 15, 20]}

In [32]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               cv = 3)

rf_random.fit(X_train,y_train)
y_pred = rf_random.predict(X_test)

KeyboardInterrupt: 

In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
rf = RandomForestRegressor(n_estimators=73,
                         min_samples_split=3,
                         min_samples_leaf=4,
                         max_depth=40)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [None]:
mean_absolute_error(y_test, y_pred)

In [29]:
model = KNeighborsRegressor()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.12966636993076164

In [None]:
#aplicar mi modelo al dato mejorado

In [30]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,volume
0,0.53,4,3,2,63.4,54.0,84.601908
1,0.41,2,0,2,63.0,56.0,68.628
2,0.32,2,5,5,61.6,56.0,51.79761
3,0.31,2,4,7,61.2,56.0,50.449028
4,1.35,3,6,5,60.5,56.0,221.664824


In [31]:
#1: random forrest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 0, max_depth = 100, n_estimators = 100)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.083621129781771

Remove outliers from my dataset

In [53]:
df = pd.read_csv("../data/train.csv")

In [54]:
df.shape

(40455, 11)

In [55]:
df.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [56]:
df = df.loc[(df[['x','y','z']]!=0).all(axis=1)]
df.shape

(40440, 11)

In [57]:
df = df.drop(["id"],axis=1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.53,Very Good,G,SI1,63.4,54.0,5.09,5.13,3.24,7.057
1,0.41,Ideal,D,SI1,63.0,56.0,4.8,4.75,3.01,6.824
2,0.32,Ideal,I,VS2,61.6,56.0,4.37,4.39,2.7,6.107
3,0.31,Ideal,H,VVS2,61.2,56.0,4.34,4.37,2.66,6.39
4,1.35,Premium,J,VS2,60.5,56.0,7.19,7.12,4.33,8.741


In [58]:
df['volume'] = df['x']*df['y']*df['z']

In [65]:
numerical_cols = df.select_dtypes(include=np.number).columns.to_list()

In [66]:
numerical_cols.remove("x")
numerical_cols.remove("y")
numerical_cols.remove("z")
numerical_cols

['carat', 'depth', 'table', 'price', 'volume']

In [67]:
def outliers(var):
    a = []
    q1 = diamond_data[var].quantile(.25)
    q2 = diamond_data[var].quantile(.5)
    q3 = diamond_data[var].quantile(.75)
    iqr = q3-q1
    ulim = float(q3+(1.5*iqr))
    llim = float(q1-(1.5*iqr))

    for i in diamond_data[var]:
        if i > ulim:
            i = np.NaN
        elif i < llim:
            i = np.NaN
        else:
            i=i
        a.append(i)
    return a

for col in numerical_cols:
    df[col] = outliers(col)

In [69]:
for i in numerical_cols:
    df[i] = df[i].fillna(diamond_data[i].mean())

In [46]:
numerical_cols = df.select_dtypes(include=np.number).columns.to_list()
categorical_cols = df.select_dtypes(exclude=np.number).columns.to_list()

In [70]:
encoded = pd.get_dummies(df)
encoded.head(5)

Unnamed: 0,carat,depth,table,x,y,z,price,volume,cut_Fair,cut_Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,84.601908,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,68.628,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,51.79761,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,50.449028,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,221.664824,0,0,...,0,1,0,0,0,0,0,1,0,0


In [82]:
encoded.drop(["volume"],axis=1, inplace=True)
X = encoded.drop(['price'],1)
y = encoded['price']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [84]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 0, max_depth = 100, n_estimators = 100)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [85]:
mean_absolute_error(y_test, y_pred)

0.06527765991385119

In [78]:
rr  = RandomForestRegressor()
rr.fit(X_train,y_train)
y_pred = rr.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.069008431558413

In [80]:
n_estimators = [int(x) for x in np.linspace(10,200,10)]
max_depth = [int(x) for x in np.linspace(10,100,10)]
min_samples_split = [2,3,4,5,10]
min_samples_leaf = [1,2,4,10,15,20]
random_grid = {'n_estimators':n_estimators,'max_depth':max_depth,
               'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}


rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               cv = 3)

rf_random.fit(X_train,y_train)
y_pred = rf_random.predict(X_test)

mean_absolute_error(y_test, y_pred)

KeyboardInterrupt: 

In [81]:
rf = RandomForestRegressor(n_estimators=73,
                         min_samples_split=3,
                         min_samples_leaf=4,
                         max_depth=40)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.06739524804550505