In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,explained_variance_score
from sklearn.model_selection import cross_val_score
train_df = pd.read_csv('datos limpios')

In [2]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_2,color_2,clarity_2
0,0,0,1.12,61.6,59.0,6.67,6.63,4.1,5363,3,4,2
1,1,1,1.14,60.0,54.0,6.74,6.97,4.11,5593,4,1,3
2,2,2,0.9,60.3,63.0,6.12,6.22,3.72,3534,4,0,3
3,3,3,0.71,61.9,54.0,5.74,5.76,3.56,3212,2,1,4
4,4,4,0.34,60.0,62.0,4.51,4.55,2.72,447,4,2,3


In [3]:
# Ground Truth
y = train_df["price"]

# Features
X = train_df[["carat","x","y","z","color_2","table", "depth"]]

## Datos Standarizados & normalizados

Una vez definido el Ground truth se realiza una estandarización de los datos y una normalización,para que todas las variables tengan el mismo peso.

In [4]:
pipeline = [
    StandardScaler(),
    Normalizer()
]

transformer = make_pipeline(*pipeline)

X_data = transformer.fit_transform(X)

X_data = pd.DataFrame(X_data, columns=X.columns)
X_data

Unnamed: 0,carat,x,y,z,color_2,table,depth
0,0.359129,0.442777,0.411213,0.418538,0.438222,0.368500,-0.053614
1,0.256414,0.319837,0.381466,0.286362,-0.334001,-0.552338,-0.433363
2,0.067963,0.109637,0.133204,0.080834,-0.483440,0.789611,-0.319192
3,-0.101660,0.004283,0.011762,0.016433,-0.513749,-0.849589,0.059155
4,-0.300966,-0.339151,-0.320741,-0.359439,-0.109030,0.637452,-0.379596
...,...,...,...,...,...,...,...
40450,0.096258,0.159276,0.138065,0.228144,-0.684708,0.110483,0.642824
40451,-0.318846,-0.352279,-0.341516,-0.336985,-0.515631,-0.523989,0.060042
40452,0.078667,0.198715,0.208252,0.093564,0.087565,-0.403911,-0.855213
40453,0.360373,0.332430,0.340493,0.404231,0.343652,-0.271017,0.537343


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_data,y,test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32364, 7) (8091, 7) (32364,) (8091,)


In [6]:
model = RandomForestRegressor(n_estimators=500,max_features="auto",max_depth=35,n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Se usan las siguientes métricas para la evaluación del módelo de regresión;

*R2_score-->(-infinito-1) Cuanto más se parezca a 1,mejor se ajustará nuestro modelo.
*MSE--> error promedio de nuestras predicciones,cuanto mayor sea,peor es nuestro modelo.
*MAE--> error promedio de las diferencias absolutas entre valores objetivos/predicción,cuanto menor sea mejor.

In [7]:
print("R2_score",r2_score(y_test, y_pred))
print("Mean squared error",mean_squared_error(y_test, y_pred)**.5)
print("Mean absolute error",mean_absolute_error(y_test, y_pred))
print("explained variance score",explained_variance_score(y_test, y_pred))

R2_score 0.9088984879613355
Mean squared error 1188.8619936713453
Mean absolute error 642.933737466111
explained variance score 0.9089725400060279


In [8]:
scores = cross_val_score(model,X,y, cv=10)
print(np.mean(scores))

0.9125679094373919


## Datos sin Standarizar & Normalizar

In [9]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_2,color_2,clarity_2
0,0,0,1.12,61.6,59.0,6.67,6.63,4.1,5363,3,4,2
1,1,1,1.14,60.0,54.0,6.74,6.97,4.11,5593,4,1,3
2,2,2,0.9,60.3,63.0,6.12,6.22,3.72,3534,4,0,3
3,3,3,0.71,61.9,54.0,5.74,5.76,3.56,3212,2,1,4
4,4,4,0.34,60.0,62.0,4.51,4.55,2.72,447,4,2,3


In [10]:
# Ground Truth
y = train_df["price"]

# Features
X = train_df[["carat","x","y","z","color_2","table", "depth"]]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_data,y,test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32364, 7) (8091, 7) (32364,) (8091,)


In [12]:
model = RandomForestRegressor(n_estimators=500,max_features="auto",max_depth=35,n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [13]:
print("R2_score",r2_score(y_test, y_pred))
print("Mean squared error",mean_squared_error(y_test, y_pred)**.5)
print("Mean absolute error",mean_absolute_error(y_test, y_pred))
print("explained variance score",explained_variance_score(y_test, y_pred))

R2_score 0.9133331824312855
Mean squared error 1173.59933133906
Mean absolute error 643.1676563951927
explained variance score 0.9133780519944383


In [14]:
scores = cross_val_score(model,X,y, cv=10)
print(np.mean(scores))

0.9126656484714235


En ambos casos,parece que no influye demasiado la standarización y la normalización,seguramente porque los datos estan lo suficientemente "compensados".En este caso,para unos mismos hyperparametros se podría observar un MSE en el caso de los standarizados,aunque la diferencia no es demasiado grande.En cuando al entrenamiento con el cross al 80% es practicamente igual.

## Probando con las variables que más peso tienen

In [15]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_2,color_2,clarity_2
0,0,0,1.12,61.6,59.0,6.67,6.63,4.1,5363,3,4,2
1,1,1,1.14,60.0,54.0,6.74,6.97,4.11,5593,4,1,3
2,2,2,0.9,60.3,63.0,6.12,6.22,3.72,3534,4,0,3
3,3,3,0.71,61.9,54.0,5.74,5.76,3.56,3212,2,1,4
4,4,4,0.34,60.0,62.0,4.51,4.55,2.72,447,4,2,3


In [16]:
# Ground Truth
y = train_df["price"]

# Features
X = train_df[["carat","x","y","z"]]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_data,y,test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32364, 7) (8091, 7) (32364,) (8091,)


In [18]:
model = RandomForestRegressor(n_estimators=500,max_features="auto",max_depth=35,n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [19]:
print("R2_score",r2_score(y_test, y_pred))
print("Mean squared error",mean_squared_error(y_test, y_pred)**.5)
print("Mean absolute error",mean_absolute_error(y_test, y_pred))
print("explained variance score",explained_variance_score(y_test, y_pred))

R2_score 0.9043853430070079
Mean squared error 1217.3872521689634
Mean absolute error 655.8931954374523
explained variance score 0.9043853717746473


In [20]:
scores = cross_val_score(model,X,y, cv=10)
print(np.mean(scores))

0.8685986289594133


En este caso se observa que aunque estas variables no tengan casi peso,según la matriz de correlación si que influyen bastante en nuestro modelo.Es necesario el uso de estas.

## Conclusión

Usaremos los datos sin standarizar y nomalizar,ya que parece que no influye demasiado en este caso,que los datos son bastante homogeneos.Y todas las variables.