In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Google Drive share link of the training CSV.
url = "https://drive.google.com/file/d/1YxeVDZHfDhqWb0VOn-lfxnDKoLOayJeD/view?usp=drive_link" # > Data from the iteration 5
# The Drive file id is the second-to-last path segment of the share link;
# it is appended to the direct-download endpoint so read_csv can fetch the file.
file_id = url.split("/")[-2]
path = f"https://drive.google.com/uc?export=download&id={file_id}"
data = pd.read_csv(path)

# Features are every column except the row identifier and the target.
X = data.drop(columns=["Id", "Expensive"])
# Target column, copied so it is independent of `data`.
y = data["Expensive"].copy()

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Display the "Id" column of the training data to see which identifiers it covers.
data["Id"]

0          1
1          2
2          3
3          4
4          5
        ... 
1455    1456
1456    1457
1457    1458
1458    1459
1459    1460
Name: Id, Length: 1460, dtype: int64

Explore the data a bit!

In [None]:
# Summarise the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 254 to 1126
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1168 non-null   int64  
 1   LotFrontage    951 non-null    float64
 2   TotalBsmtSF    1168 non-null   int64  
 3   BedroomAbvGr   1168 non-null   int64  
 4   Fireplaces     1168 non-null   int64  
 5   PoolArea       1168 non-null   int64  
 6   GarageCars     1168 non-null   int64  
 7   WoodDeckSF     1168 non-null   int64  
 8   ScreenPorch    1168 non-null   int64  
 9   MSZoning       1168 non-null   object 
 10  Condition1     1168 non-null   object 
 11  Heating        1168 non-null   object 
 12  Street         1168 non-null   object 
 13  CentralAir     1168 non-null   object 
 14  Foundation     1168 non-null   object 
 15  ExterQual      1168 non-null   object 
 16  ExterCond      1168 non-null   object 
 17  BsmtQual       1140 non-null   object 
 18  BsmtCond   

That's a lot of columns! Will all of them be useful?

Don't forget to check the accompanying `.txt` file for additional info.

### The lazy model

In [None]:

# Numeric feature columns of the training split (independent copy).
X_num = X_train.select_dtypes(include="number").copy()

# Categorical feature columns: whatever is left once the numeric columns
# are removed (drop returns a new frame, so this is a copy as well).
X_cat = X_train.drop(columns=X_num.columns)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Numeric features: fill missing values with SimpleImputer's default strategy.
num_pipe = make_pipeline(SimpleImputer())

# Categorical features: missing values become the explicit category "NA",
# then every category is one-hot encoded. handle_unknown="ignore" keeps
# prediction from failing on categories that were not seen during fit.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each group of columns (selected by name) through its own pipeline.
preprocessor = make_column_transformer(
    (num_pipe, X_num.columns),
    (cat_pipe, X_cat.columns),
)

# Untuned baseline: preprocessing followed by a decision tree.
# A fixed random_state makes the fitted tree, and therefore the submission
# file produced from it, reproducible across kernel restarts.
lazy_pipe = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=42),
)

lazy_pipe.fit(X_train, y_train)

Since this is a *lazy* model, we won't tune or even test it!

### The test set

Here's even more new data, this time *without labels*! To see how well our model performs, we'll predict whether these houses are expensive or not and upload the results to the [competition site](https://housingcomp-data023.streamlit.app/).

In [None]:
# Google Drive share link of the unlabeled test CSV.
test_url = "https://drive.google.com/file/d/1MZnPvWoGQtBHij32Rti26C2T0KT1xGBc/view?usp=drive_link"
# The Drive file id is the second-to-last path segment of the share link.
test_file_id = test_url.split("/")[-2]
test_path = f"https://drive.google.com/uc?export=download&id={test_file_id}"
test = pd.read_csv(test_path)
test

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [None]:
# Last identifiers of the training data, for comparison with the test file's first ones.
data["Id"].tail()

1455    1456
1456    1457
1457    1458
1458    1459
1459    1460
Name: Id, dtype: int64

In [None]:
# First identifiers of the test data, for comparison with the training data's last ones.
test["Id"].head()

0    1461
1    1462
2    1463
3    1464
4    1465
Name: Id, dtype: int64

The upload will be a `.csv` file with two columns: "Id" and "Expensive" (the columns __*must*__ have these names and they __*must*__ be in this order).

The resulting file should start off something like this:
> Id,Expensive    
1461,0    
1462,1    
1463,0    
1464,1   
1465,1    
1466,1   

If you have different "Id"s, you've used the wrong file to test your model on.

In [None]:
# the dataframe given to the model must have the same columns as the dataframe it trained on
# "Id" is still needed for the submission, though, so don't drop permanently!!
# This cell adds an "Expensive" column to `test`; it is dropped here (when it
# exists) so that re-running the cell still hands the model only feature columns.
test_features = (
    test
    .drop(columns=["Id"])
    .drop(columns=["Expensive"], errors="ignore")
)

test["Expensive"] = lazy_pipe.predict(test_features)

test["Expensive"]

0       0
1       0
2       0
3       0
4       0
       ..
1454    0
1455    0
1456    1
1457    0
1458    0
Name: Expensive, Length: 1459, dtype: int64

In [None]:

# The upload must contain exactly "Id" then "Expensive", with no index column.
lazy_submission = test.loc[:, ["Id", "Expensive"]]
lazy_submission.to_csv('./lazy_model.csv', index=False)

### Competition

In [39]:
from sklearn import set_config

# Show estimators as diagrams and make transformers return pandas DataFrames.
set_config(display="diagram", transform_output="pandas")

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Google Drive share link of the training CSV.
url = "https://drive.google.com/file/d/1YxeVDZHfDhqWb0VOn-lfxnDKoLOayJeD/view?usp=drive_link" # > Data from the iteration 5
# The Drive file id is the second-to-last path segment of the share link.
file_id = url.split("/")[-2]
path = f"https://drive.google.com/uc?export=download&id={file_id}"
data = pd.read_csv(path)




In [41]:
# Count the missing values in every column of the training data.

data.isna().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 81, dtype: int64

### Splitting Data

In [130]:
# Feature matrix: every column except the row identifier and the target.
X = data.drop(columns=["Id", "Expensive"])

# Target vector, copied so it is independent of `data`.
y = data["Expensive"].copy()

# Keep 95% of the rows for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=123
)

In [131]:
# Numeric feature columns of the training split (independent copy).
X_num = X_train.select_dtypes(include="number").copy()

# Categorical feature columns: the remainder after removing the numeric
# columns (drop returns a new frame, so this is a copy as well).
X_cat = X_train.drop(columns=X_num.columns)

### Pipes

In [132]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline  # used below, so import it in this cell too

# Numeric features: fill missing values with SimpleImputer's default strategy.
num_pipe = make_pipeline(
    SimpleImputer()
)

# Ordered category scales, listed from lowest to highest. OrdinalEncoder
# encodes each value by its position in the list. "NA" comes first because
# the categorical imputer below fills missing values with "NA".
quality_scale = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
basement_finish_scale = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]

# One mapping column -> scale, so a column and its ranking cannot drift apart
# the way two parallel lists can.
ordinal_scales = {
    "LotShape": ["NA", "IR3", "IR2", "IR1", "Reg"],
    "Utilities": ["NA", "ELO", "NoSeWa", "NoSewr", "AllPub"],
    "ExterQual": quality_scale,
    "ExterCond": quality_scale,
    "BsmtQual": quality_scale,
    "BsmtCond": quality_scale,
    "BsmtExposure": ["NA", "No", "Mn", "Av", "Gd"],
    "BsmtFinType1": basement_finish_scale,
    "BsmtFinType2": basement_finish_scale,
    "HeatingQC": quality_scale,
    "KitchenQual": quality_scale,
    "FireplaceQu": quality_scale,
    "GarageQual": quality_scale,
    "GarageCond": quality_scale,
    "PoolQC": ["NA", "Fa", "TA", "Gd", "Ex"],
}
ordinal_cols_names = list(ordinal_scales)
ordinal_rankings = [list(scale) for scale in ordinal_scales.values()]

# get_indexer returns -1 for labels it cannot find, which would silently
# select the last categorical column; fail loudly instead.
missing_ordinal_cols = [col for col in ordinal_cols_names if col not in X_cat.columns]
if missing_ordinal_cols:
    raise KeyError(f"ordinal columns not found in X_cat: {missing_ordinal_cols}")

# The inner transformer addresses its columns by position within X_cat.
ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

non_ordinal_cols_names = [col for col in X_cat.columns if col not in ordinal_cols_names]
onehot_cols = X_cat.columns.get_indexer(non_ordinal_cols_names)

cat_preprocessor = make_column_transformer(
    (OrdinalEncoder(categories=ordinal_rankings), ordinal_cols),
    # sparse_output replaces the `sparse` keyword, which was deprecated in
    # scikit-learn 1.2 and later removed. Dense output is requested here.
    (OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"), onehot_cols),
)

# Categorical features: fill missing values with "NA", then encode.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"), # play with NA or N_A
    cat_preprocessor,
)

# final preprocessor: numeric and categorical columns (selected by name)
# each go through their own pipeline.
preprocessor = make_column_transformer(
    (num_pipe, X_num.columns),
    (cat_pipe, X_cat.columns),
)

In [45]:
# Display the assembled preprocessing transformer.
preprocessor

### Algo

In [153]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Seeded random forest so repeated fits give the same model.
forest = RandomForestClassifier(random_state=123)

# Full model: preprocessing -> scaling -> random forest.
rf_pipeline = make_pipeline(preprocessor, StandardScaler(), forest)

# Hyperparameter candidates; keys follow the step names make_pipeline /
# make_column_transformer generate automatically.
rf_param_grid = {
    # imputation strategy of the first pipeline inside the column transformer
    "columntransformer__pipeline-1__simpleimputer__strategy": ["mean", "median"],
    # number of trees in the forest
    "randomforestclassifier__n_estimators": [100, 200],
    # maximum depth of each tree
    "randomforestclassifier__max_depth": [15, 20, 25],
    # minimum number of samples required to split a node
    "randomforestclassifier__min_samples_split": [3, 4, 5],
    # minimum number of samples in a leaf node
    "randomforestclassifier__min_samples_leaf": [1, 2, 3],
}

# Exhaustive search over the grid with 10-fold cross-validation on all cores.
rf_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=10,
    verbose=1,
    n_jobs=-1,
)
rf_search.fit(X_train, y_train)

# Best mean cross-validated score found by the search.
rf_scores = {"random_forest": rf_search.best_score_}
rf_scores

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


{'random_forest': 0.9549904214559388}

In [109]:
# Display the random-forest pipeline.
rf_pipeline

In [141]:
# Parameter combination from rf_param_grid that the grid search selected as best.
best_param = rf_search.best_params_

best_param

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 3,
 'randomforestclassifier__n_estimators': 100}

### Import Competition Data

In [148]:
# import competition data
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
competition_data = pd.read_csv(path)
competition_data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


### Predicting on the competition data and making a submission CSV

In [149]:
# Separate the identifiers from the competition features: pop() returns the
# "Id" column and removes it from competition_data in place, leaving only the
# feature columns for prediction.
# NOTE(review): because the column is removed in place, this cell cannot be
# re-run as-is; a second run finds no "Id" column to pop. Reload
# competition_data first.
competition_ids = competition_data.pop('Id')
competition_ids

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [150]:
# Start the submission table from the identifiers, as a one-column frame named "Id".
my_submission = competition_ids.to_frame(name="Id")
my_submission

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [151]:
# Predict the label for every competition row with the fitted grid search and
# store it next to the matching "Id". competition_data must no longer contain
# "Id" at this point.
my_submission["Expensive"] = rf_search.predict(competition_data)

In [89]:
# Preview the first rows of the submission to check the column names and order.
my_submission.head(20)

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
5,1466,0
6,1467,0
7,1468,0
8,1469,0
9,1470,0


In [152]:
# Write the submission file; index=False keeps the row index out of the CSV so
# the file contains only the "Id" and "Expensive" columns.
my_submission.to_csv("rf_15.csv", index=False)