# Model training script

***

### Loading dataset and cleaning it

In [8]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('transactions.csv')

all_columns = df.columns.tolist()
print("Columns in the dataset:", all_columns)


Columns in the dataset: ['id_transaction', 'date_transaction', 'prix', 'departement', 'id_ville', 'ville', 'code_postal', 'adresse', 'type_batiment', 'vefa', 'n_pieces', 'surface_habitable', 'id_parcelle_cadastre', 'latitude', 'longitude', 'surface_dependances', 'surface_locaux_industriels', 'surface_terrains_agricoles', 'surface_terrains_sols', 'surface_terrains_nature']


In [9]:

# Creating a new feature 'prix_m2'
df['prix_m2'] = df['prix'] / df['surface_habitable']

# Filter for Paris data in 2022
paris_df = df[(df.departement == 75) & (df.date_transaction.str.startswith('2022-'))]

# Define features and target variable
X = paris_df[['date_transaction','code_postal', 
               'type_batiment', 'n_pieces', 'surface_habitable', 
              'latitude', 'longitude',]]
y = paris_df['prix_m2']

# Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Create transformers for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply transformations to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a pipeline that first transforms the data and then fits the model
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Preprocess the features
X_processed = model_pipeline.fit_transform(X)
# Transform the 'type_batiment' column into boolean columns
X_processed = pd.get_dummies(X_processed, columns=['type_batiment'], drop_first=True)
paris_df.h



Unnamed: 0,id_transaction,date_transaction,prix,departement,id_ville,ville,code_postal,adresse,type_batiment,vefa,...,surface_habitable,id_parcelle_cadastre,latitude,longitude,surface_dependances,surface_locaux_industriels,surface_terrains_agricoles,surface_terrains_sols,surface_terrains_nature,prix_m2
6043985,10040024,2022-01-03,329910.0,75,112,PARIS 12,75012,260 AV DAUMESNIL,Appartement,False,...,30,75112000AX0008,48.836248,2.403751,{},{},{},{},{},10997.0
6043986,10035062,2022-01-03,302400.0,75,102,PARIS 02,75002,52 RUE GRENETA,Appartement,False,...,26,75102000AM0139,48.865604,2.348148,{},{},{},{},{},11630.769231
6043987,10036657,2022-01-03,665000.0,75,117,PARIS 17,75017,26 BD DES BATIGNOLLES,Appartement,False,...,59,75117000CM0036,48.883309,2.324689,{0},{},{},{},{},11271.186441
6043988,10035700,2022-01-03,965000.0,75,105,PARIS 05,75005,31 RUE CENSIER,Appartement,False,...,63,75105000AU0078,48.839969,2.352576,"{0,0,0}",{},{},{},{},15317.460317
6043989,10029380,2022-01-03,200000.0,75,110,PARIS 10,75010,37 RUE DE PARADIS,Appartement,False,...,19,75110000AS0042,48.875213,2.350388,{},{},{},{},{},10526.315789


In [10]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.8, random_state=42)

In [11]:
# import pandas as pd
# from matplotlib import pyplot as plt

# import warnings
# warnings.filterwarnings("ignore")

# df = pd.read_csv('transactions.csv')

# if 'Unnamed: 0' in df.columns:
#     df = df.drop('Unnamed: 0', axis=1)
    
# df['prix_m2'] = df['prix'] / (df['surface_habitable'])

# paris_df = df[(df.departement == 75) & (df.n_pieces == 4) & (df.date_transaction.str.startswith('2022-'))]
# surface_cols = [c for c in paris_df.columns if 'surface_' in c and c != 'surface_habitable']
# for c in surface_cols:
#     paris_df[c + '_sum'] = paris_df[c].apply(lambda x: sum(eval(x)) if 'NULL' not in x else 0)
# paris_df = paris_df[paris_df[[c + '_sum' for c in surface_cols]].sum(axis=1) == 0]

### Data split

In [12]:
# X = paris_df[[ 'longitude', 'latitude']].values
# y = paris_df['prix_m2'].values

# from sklearn.model_selection import train_test_split
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

### Import models + training

In [13]:


params_grid = {
    'DTR': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': (1, 101, 10), 
            'min_samples_split': (2, 21, 2) 
        }
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': (1, 51, 5)  
        }
    },
    'LR': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False],
            'positive': [True, False]
        }
    },
    'RFR': {
        'model': RandomForestRegressor(),
        'params': {
            'max_depth': (10, 110, 10), 
            'min_samples_leaf': (1, 12, 2), 
            'n_estimators': [100, 200, 300, 400, 500] 
        }
    }
}

for model_name, model_config in tqdm(params_grid.items()):
    gs = GridSearchCV(estimator=model_config['model'], param_grid=model_config['params'])
        # gs = GridSearchCV(estimator=model_config['model'], param_grid=model_config['params'], n_jobs=-1)
    gs.fit(X_train, y_train)
    best_model = gs.best_estimator_
    best_params = gs.best_params_
    train_rmse = np.sqrt(mean_squared_error(y_train, best_model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, best_model.predict(X_test)))
    score = best_model.score(X_test, y_test)
    
    print(f"Model: {model_name}")
    print(f"Optimal params: {best_params}")
    print(f"Train RMSE: {train_rmse}")
    print(f"Test RMSE: {test_rmse}")
    print(f"Model Score: {score}")
    print() 

 25%|██▌       | 1/4 [00:26<01:20, 26.91s/it]

Model: DTR
Optimal params: {'max_depth': 1, 'min_samples_split': 2}
Train RMSE: 15845.205360874448
Test RMSE: 41351.066386107515
Model Score: -0.004491713358389715



 50%|█████     | 2/4 [01:11<01:14, 37.50s/it]

Model: KNN
Optimal params: {'n_neighbors': 51}
Train RMSE: 15688.614098338172
Test RMSE: 41290.19067122226
Model Score: -0.0015363293464745187



 75%|███████▌  | 3/4 [01:12<00:20, 20.63s/it]

Model: LR
Optimal params: {'fit_intercept': False, 'positive': False}
Train RMSE: 15650.064732560062
Test RMSE: 41352.71031991666
Model Score: -0.004571583158974102

