In [121]:
# Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#train
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict, GridSearchCV

#Model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor


#Standart
from sklearn.preprocessing import StandardScaler, normalize, RobustScaler, OneHotEncoder

# from sklearn.preprocessing import PolynomialFeatures


#Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report, roc_auc_score, plot_roc_curve, roc_curve, \
    average_precision_score, precision_recall_curve,plot_precision_recall_curve

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# from sklearn.model_selection import learning_curve

# from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# from sklearn.inspection import permutation_importance
#import warnings
#warnings.simplefilter(action="ignore")

# First Baseline

## Regression lineaire avec outliers

In [122]:
dataset1 = pd.read_csv("DATA/df_encod.csv", sep=',')
data1 = dataset1.drop('Unnamed: 0', axis =1)

In [123]:
# linear regression with all features 
X = data1.drop(['median_house_value'], axis=1)
y = data1['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

model = LinearRegression()

model.fit(X_train, y_train)

score = model.score(X_train, y_train)

y_pred = model.predict(X)

#coef and intercept results
print('Coef:',model.coef_)
print('Intercept:',model.intercept_)
print('Score:',score)

# Predict a Response and print it:
y_pred = model1.predict(X)
print('Predicted response:', y_pred, sep='\n')

Coef: [-2.49531616e+04 -2.42971168e+04  9.78595973e+02 -5.83986271e+00
  8.36263062e+01 -4.46316087e+01  8.24190951e+01  3.86708619e+04
  8.49062113e+03 -3.10943897e+04  2.27373675e-13  7.84156241e+03
  1.47622062e+04]
Intercept: -2092051.329070356
Score: 0.651007169691395
Predicted response:
[ 35430.6490017  317386.949328    60012.92021908 ... 152597.98004801
 178550.81039786 150093.57439964]


# Iteration 2

## Regression lineaire sans outliers

In [124]:
dataset2 = pd.read_csv("DATA/df_encod_sans_outliers.csv", sep=',')
data2 = dataset2.drop('Unnamed: 0', axis =1)

In [125]:
# define X, y
y2 = dataset2['median_house_value']
x2 = data2.drop('median_house_value', axis =1)

In [126]:
# Create an instance of a linear regression model and fit it to the data with the fit() function:
model2 = LinearRegression().fit(x2, y2) 

# The following section will get results by interpreting the created instance: 

# Obtain the coefficient of determination by calling the model with the score() function, then print the coefficient:
r_sq2 = model2.score(x2, y2)
print('coefficient of determination:', r_sq2)

# Print the Intercept:
print('intercept:', model2.intercept_)

# Print the Slope:
print('slope:', model2.coef_) 

# Predict a Response and print it:
y_pred2 = model2.predict(x2)
print('Predicted response:', y_pred2, sep='\n')

coefficient of determination: 0.618593316227131
intercept: -1920466.9825578732
slope: [-2.28541279e+04 -2.10075867e+04  6.91829243e+02 -4.52124019e+00
  5.33700973e+01 -2.76142716e+01  6.25926669e+01  3.47952557e+04
 -1.68186921e+04 -5.73281905e+04  1.13147930e+05 -2.36915525e+04
 -1.53094948e+04]
Predicted response:
[ 48691.86412554 300396.75988997  66669.49002385 ... 151254.00565739
 172019.07684258 150106.19644668]


# Iteration 3

## Random forest avec Outliers

In [127]:
X = data1.drop('median_house_value', axis =1)
y = dataset1['median_house_value']

In [128]:
# standardizing the features (scaling)
X = RobustScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

rf = RandomForestRegressor(n_estimators=10, random_state=20)

rf.fit(X_train, y_train)

pred = rf.predict(X_test)

# R2 
print("R2={}".format(rf.score(X_test,y_test)))

# MAE
errors = abs(pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# MAPE
mape = 100 * (errors / y_test)
print('Mean Absolute Percentage Error :', round(np.mean(mape), 2), '%.')

R2=0.7672253388020424
Mean Absolute Error: 38447.44
Mean Absolute Percentage Error : 21.75 %.


Le random forest n'est pas sensible aux relation d'ordre 

# Pipeline

In [129]:
data1 

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,location_<1H OCEAN,location_INLAND,location_ISLAND,location_NEAR BAY,location_NEAR OCEAN
0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,0,1,0,0,0
1,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,1,0,0,0,0
2,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,0,1,0,0,0
3,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,1,0,0,0,0
4,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,0,1,0,0,0
16508,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,0,0,0,1,0
16509,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,0,1,0,0,0
16510,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,1,0,0,0,0


In [130]:
numeric_features =  ['housing_median_age','total_rooms','total_bedrooms','population', 'households', 'median_income', 'median_house_value']

In [131]:
# categ_feature =  ['location_<1H OCEAN','location_INLAND','location_ISLAND', 'location_NEAR BAY', 'location_NEAR OCEAN']

In [132]:
numeric_pipeline = make_pipeline((StandardScaler()))

In [133]:
# categ_pipeline =  make_pipeline((OneHotEncoder()))

In [134]:
preprocessor = make_column_transformer((numeric_pipeline, numeric_features), (categ_pipeline, categ_feature ))

In [135]:
model_pipeline1 = make_pipeline(StandardScaler(), LinearRegression())

In [136]:
model_pipeline1.fit(X_train, y_train)

In [137]:
prediction = model_pipeline1.predict(X_test)

In [138]:

# R2 
print("R2={}".format( model_pipeline1.score(X_test,y_test)))

# MAE
errors = abs(prediction - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# MAPE
mape = 100 * (errors / y_test)
print('Mean Absolute Percentage Error :', round(np.mean(mape), 2), '%.')

R2=0.6497087025455838
Mean Absolute Error: 50113.53
Mean Absolute Percentage Error : 28.86 %.
