# Import libraries and files

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
from google.colab import files

# Open the Colab upload widget; the result maps each uploaded file name to its content.
uploaded = files.upload()

# Only the first uploaded file is used.
filename = next(iter(uploaded))

# Load that CSV as the raw training DataFrame.
house_data_df = pd.read_csv(filename)

Saving housing_iteration_3_classification.csv to housing_iteration_3_classification (1).csv


In [None]:
# Work on an independent (deep) copy so the raw upload in house_data_df stays untouched.
house_df = house_data_df.copy(deep=True)

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
 10  MSZoning      1460 non-null   object 
 11  Condition1    1460 non-null   object 
 12  Heating       1460 non-null   object 
 13  Street        1460 non-null   object 
 14  CentralAir    1460 non-null   object 
 15  Foundation    1460 non-null   object 
dtypes: float64(1), int64(9), object(6)
memory usage: 182.6+ KB


In [None]:
# Trim leading/trailing whitespace from every column label.
house_df = house_df.rename(columns=lambda column_name: column_name.strip())

In [None]:
# Let long cell values (e.g. parameter dicts) display without being truncated.
pd.options.display.max_colwidth = 1000

# Creating the pipeline model

## Initialize y and X

In [None]:
# Separate the target from the features. pop() removes the column from house_df
# in place, so the guard keeps this cell safe to re-run: on a second run the
# column is already gone and the previously extracted y is left as it is.
if "Expensive" in house_df.columns:
    y = house_df.pop("Expensive")

In [None]:
# Show the class balance of the target instead of dumping every label.
y.value_counts()

In [None]:
# Feature matrix. This binds X to the same object as house_df (no copy is made),
# so any later in-place change to house_df is visible through X as well.
X = house_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Baseline model: preprocessing + decision tree, tuned with a randomized search.
# One seed is shared by the split, the tree and the search so that a fresh
# "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 31416

# 1. Features (X) and target (y) were prepared in the cells above.

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Column groups used to route features to the matching preprocessing branch.
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing numeric values with the column mean
    ('scaler', RobustScaler())  # Center on the median and scale by the IQR (robust to outliers)
])

categoric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Missing category -> "N_A"
    ('onehot', OneHotEncoder(handle_unknown="ignore"))  # Unseen categories become all-zero rows
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

# 3. Full pipeline: preprocessing followed by the decision tree
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=RANDOM_STATE))  # Seeded so tie-breaking is repeatable
])

# 4. Define the parameter grid for RandomizedSearchCV
param_grid = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# 5. Perform RandomizedSearchCV (samples n_iter=10 of the 27 combinations by default);
#    the seed fixes which combinations are drawn.
grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, random_state=RANDOM_STATE)
grid_search.fit(X_train, y_train)

# 6. Make predictions on the test set
y_pred = grid_search.predict(X_test)

# 7. Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# 8. Best parameters and their mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Test Accuracy: 0.9212328767123288
Best Parameters: {'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 4, 'classifier__max_depth': 5}
Best Score: 0.9135138109387035


# Nami model 1

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Model 1: same preprocessing as the baseline, with a random forest classifier.
# One seed is shared by the split, the forest and the search so that a fresh
# "Restart & Run All" reproduces the same scores and the same submission.
RANDOM_STATE = 31416

# 1. Features (X) and target (y) were prepared in the cells above.

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Copies of the categorical / numeric column subsets; only their column labels
# are used below, to route features to the matching preprocessing branch.
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# Define preprocessing pipelines for numeric and categorical features
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing numeric values with the column mean
    ('scaler', RobustScaler())  # Center on the median and scale by the IQR (robust to outliers)
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Impute missing values with a constant
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Unseen categories become all-zero rows
])

# Combine the numeric and categorical preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, X_num.columns),
        ("cat", categorical_pipeline, X_cat.columns)  # All non-numeric columns are one-hot encoded
    ]
)

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))  # Seeded random forest
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2'],  # Features considered at each split
    'classifier__bootstrap': [True, False]
}

# Perform RandomizedSearchCV for hyperparameter tuning; the seed fixes which
# parameter combinations are sampled.
grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, random_state=RANDOM_STATE)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_train)

# Evaluate the refit best model on the training set. This is an optimistic
# figure; compare it with the cross-validated "Best Score" printed below.
accuracy = accuracy_score(y_train, y_pred)
print("Training Accuracy:", accuracy)

# Get the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Training Accuracy: 0.9674657534246576
Best Parameters: {'classifier__n_estimators': 300, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'sqrt', 'classifier__max_depth': None, 'classifier__bootstrap': True}
Best Score: 0.9289424452514581


In [None]:
# Score the tuned search object on the held-out split.
y_pred = grid_search.predict(X_test)

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

# Winning hyperparameters and their mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
# Open the Colab upload widget for the competition test set.
uploaded = files.upload()

# Only the first uploaded file is used.
filename = next(iter(uploaded))

# Load that CSV as the raw competition DataFrame.
house_test_df = pd.read_csv(filename)

Saving housing_classification_competition_test_set.csv to housing_classification_competition_test_set (1).csv


In [None]:
# Work on an independent (deep) copy so the raw upload in house_test_df stays untouched.
house_test = house_test_df.copy(deep=True)

In [None]:
# Print column dtypes and non-null counts of the competition set.
house_test.info()

In [None]:
# Predict with the tuned search object; "Id" is an identifier, so it is dropped
# from the model input.
house_test["Expensive"] = grid_search.predict(house_test.drop(columns=["Id"]))

# Show a random handful of the predicted labels.
house_test["Expensive"].sample(10)

In [None]:

from google.colab import drive

# Mount Google Drive so the submission file can be written to it.
drive.mount('/content/drive')

# Write only the identifier and the predicted label, without the index column.
house_test.loc[:, ["Id", "Expensive"]].to_csv('/content/drive/My Drive/House_prices_model.csv', index=False)


Mounted at /content/drive


# Nami model 2

In [None]:
import numpy as np

# Model 2: random forest where four categorical features are ordinal-encoded
# with a hand-picked order and the remaining categorical features are one-hot
# encoded. One seed keeps the forest and the search repeatable.
RANDOM_STATE = 31416

# train_test_split returns a DataFrame when given one; rebuilding it here only
# guarantees that the column labels match X.
X_train_df = pd.DataFrame(X_train, columns=X.columns)

# Ensure X_train_df is a pandas DataFrame
print(type(X_train_df))

# Category orders are defined in this cell so it runs on a fresh kernel.
# Each must be a flat list: one list per ordinal column.
zoning_order = ["A", "I", "C", "RH", "RM", "RP", "RL", "FV"]
heating_order = ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall']
central_air_order = ['N', 'Y']
foundation_order = ['Slab', 'BrkTil', 'CBlock', 'Wood', 'Stone', 'PConc']

ordinal_columns = ['MSZoning', 'Heating', 'CentralAir', 'Foundation']
# Only categorical columns may go to the one-hot branch; the numeric columns
# are handled by the numeric pipeline.
onehot_columns = X_cat.columns.difference(ordinal_columns)

# Define preprocessing pipelines for numeric and categorical features
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing numeric values with the column mean
    ('scaler', RobustScaler())  # Center on the median and scale by the IQR (robust to outliers)
])

# NOTE(review): labels missing from the order lists (including the "N_A" fill
# value) are encoded as -1 instead of raising - verify the lists against the
# labels actually present in the data.
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Impute missing values with a constant
    ('ordinal_encoder', OrdinalEncoder(
        categories=[zoning_order, heating_order, central_air_order, foundation_order],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    ))
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Impute missing values with a constant
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# One flat ColumnTransformer: every branch selects its columns by name from the
# input DataFrame, so no branch has to pick named columns out of an imputer's
# array output.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, X_num.columns),
        ("ord", ordinal_pipeline, ordinal_columns),
        ("cat", categorical_pipeline, onehot_columns)
    ]
)

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))  # Seeded random forest
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 15],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2'],  # Features considered at each split
    'classifier__bootstrap': [True, False]
}

# Perform RandomizedSearchCV for hyperparameter tuning; the seed fixes which
# parameter combinations are sampled.
grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, random_state=RANDOM_STATE)
grid_search.fit(X_train_df, y_train)

y_pred = grid_search.predict(X_train_df)

# Evaluate the refit best model on the training set (an optimistic figure).
accuracy = accuracy_score(y_train, y_pred)
print("Training Accuracy:", accuracy)

# Get the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
# Print the dtype and non-null count of the target Series.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1460 entries, 0 to 1459
Series name: Expensive
Non-Null Count  Dtype
--------------  -----
1460 non-null   int64
dtypes: int64(1)
memory usage: 11.5 KB


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Model 2: random forest where four categorical features are ordinal-encoded
# with a hand-picked order and the remaining categorical features are one-hot
# encoded. Relies on X and y prepared in the earlier cells.
# One seed is shared by the split, the forest and the search.
RANDOM_STATE = 31416

# Extract the target variable y and features X from house_df
# y = house_df.pop("Expensive")
# X = house_df

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train = pd.DataFrame(X_train)

X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# Define the order of categories for MSZoning and other categorical features.
# Each must be a flat list: one list per ordinal column.
zoning_order = ["A", "I", "C", "RH", "RM", "RP", "RL", "FV"]
heating_order = ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall']
central_air_order = ['N', 'Y']
foundation_order = ['Slab', 'BrkTil', 'CBlock', 'Wood', 'Stone', 'PConc']

ordinal_columns = ['MSZoning', 'Heating', 'CentralAir', 'Foundation']
# Only categorical columns may go to the one-hot branch; the numeric columns
# are handled by the numeric pipeline.
onehot_columns = X_cat.columns.difference(ordinal_columns)

# Define preprocessing pipelines for numeric and categorical features
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing numeric values with the column mean
    ('scaler', RobustScaler())  # Center on the median and scale by the IQR (robust to outliers)
])

# NOTE(review): labels missing from the order lists (including the "N_A" fill
# value) are encoded as -1 instead of raising - verify the lists against the
# labels actually present in the data.
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Impute missing values with a constant
    ('ordinal_encoder', OrdinalEncoder(
        categories=[zoning_order, heating_order, central_air_order, foundation_order],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    ))
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Impute missing values with a constant
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2; dense output
    # is also required when transform_output="pandas" is configured.
    ('onehot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# One flat ColumnTransformer: every branch selects its columns by name from the
# input DataFrame, so no branch has to pick named columns out of an imputer's
# array output.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, X_num.columns),
        ("ord", ordinal_pipeline, ordinal_columns),
        ("cat", categorical_pipeline, onehot_columns)
    ]
)

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))  # Seeded random forest
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 15],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2'],  # Features considered at each split
    'classifier__bootstrap': [True, False]
}

# Perform RandomizedSearchCV for hyperparameter tuning; the seed fixes which
# parameter combinations are sampled.
grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, random_state=RANDOM_STATE)
grid_search.fit(X_train, y_train)

# Evaluate the refit best model on the training set (an optimistic figure).
y_pred_train = grid_search.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", accuracy_train)

# Get the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
# Global scikit-learn options: render estimators as diagrams and make
# transformers return pandas DataFrames.
set_config(display="diagram", transform_output="pandas")

In [None]:
# Sanity check: report the container types of the split and the column subsets.
print("X_train type:", type(X_train))
print("X_cat type:", type(X_cat))
print("X_num type:", type(X_num))
print("y_train type:", type(y_train))

X_train type: <class 'pandas.core.frame.DataFrame'>
X_cat type: <class 'pandas.core.frame.DataFrame'>
X_num type: <class 'pandas.core.frame.DataFrame'>
y_train type: <class 'pandas.core.series.Series'>


# Nami Model 3