In [2]:
import pandas as pd

In [3]:
# Load the raw bike-buyers table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="bike_buyers.csv")

In [4]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how="any")

In [5]:
# Encode the target: 1 for "Yes", 0 for anything else.
# An already-encoded 1 is accepted as well so the cell can be re-run safely; comparing
# against "Yes" alone would turn the whole column into 0 on a second execution.
df["Purchased Bike"] = df["Purchased Bike"].isin(["Yes", 1]).astype("int64")

In [6]:
# Encode home ownership: 1 for "Yes", 0 for anything else.
# An already-encoded 1 is accepted as well so the cell can be re-run safely; comparing
# against "Yes" alone would turn the whole column into 0 on a second execution.
df["Home Owner"] = df["Home Owner"].isin(["Yes", 1]).astype("int64")

In [7]:
# Ordinal codes for the Education labels (larger code = more schooling).
# NOTE(review): 'High School' and 'Some High School' share code 1, so the model cannot
# tell them apart -- confirm this is intended.
education_mapping = {
    'Bachelors': 3,
    'Partial College': 2,
    'High School': 1,
    'Graduate Degree': 4,
    'Some High School': 1,
    'No Education': 0
}

# Series.map looks each label up in the dict; labels missing from the dict come back as NaN.
education_codes = df['Education'].map(education_mapping)

# Labels that are not in the mapping still fall back to the sentinel -1, but they are
# reported here instead of silently entering the model as an extra "lowest" level.
unmapped_labels = df.loc[education_codes.isna(), 'Education'].unique()
if len(unmapped_labels) > 0:
    print(f"Education values not in education_mapping (encoded as -1): {list(unmapped_labels)}")

df['Education_Encoded'] = education_codes.fillna(-1).astype('int64')
# Display the original label next to its code for a visual check
print(df[['Education', 'Education_Encoded']])

           Education  Education_Encoded
0          Bachelors                  3
1    Partial College                  2
2    Partial College                  2
4          Bachelors                  3
5    Partial College                  2
..               ...                ...
994  Partial College                  2
995      High School                  1
996  Graduate Degree                  4
998        Bachelors                  3
999      High School                  1

[952 rows x 2 columns]


In [8]:
# Remove the raw text column now that Education_Encoded carries the same information.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="Education", errors="ignore")

In [9]:
# Feature matrix: the four model inputs, in the order the estimators will see them.
X = df.loc[:, ["Income", "Age", "Education_Encoded", "Home Owner"]]
# Target vector: the 0/1 purchase flag.
y = df.loc[:, "Purchased Bike"]

In [10]:
# Display the feature matrix to eyeball the four selected model inputs.
X

Unnamed: 0,Income,Age,Education_Encoded,Home Owner
0,40000.0,42.0,3,1
1,30000.0,43.0,2,1
2,80000.0,60.0,2,0
4,30000.0,36.0,3,0
5,10000.0,50.0,2,1
...,...,...,...,...
994,80000.0,46.0,2,1
995,60000.0,54.0,1,1
996,70000.0,35.0,4,1
998,100000.0,38.0,3,0


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split -- and every score computed from it below --
# is the same on each Restart & Run All; without it the numbers change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# (rows, columns) of the training split produced with test_size=0.2.
X_train.shape

(761, 4)

In [14]:
# Row count an exact 80% share of X would have, for comparison with X_train.shape.
0.8 * len(X)

761.6

In [15]:
# (rows, columns) of the full feature matrix before splitting.
X.shape

(952, 4)

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Hyper-parameter candidates tried for the decision tree: every combination of the
# values below is evaluated by the grid search.
param_grid = dict(
    splitter=["best", "random"],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [20]:
# Base estimator for the grid search.
# random_state is fixed because the grid includes splitter="random" (and ties between
# equally good splits are broken randomly), so an unseeded tree gives different
# best_params_ and scores on every run.
tree_model = DecisionTreeClassifier(random_state=42)

In [21]:
# Exhaustive search over param_grid for the decision tree, with GridSearchCV's default
# cross-validation and scoring.
grid_search_tree = GridSearchCV(estimator=tree_model, param_grid=param_grid)

In [22]:
# Run the decision-tree search on the training split only; the test split stays unseen.
grid_search_tree.fit(X=X_train, y=y_train)

In [23]:
# Parameter combination from param_grid that the search selected for the decision tree.
grid_search_tree.best_params_

{'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}

In [24]:
# Predict the held-out rows with the decision-tree search object.
pred_tree = grid_search_tree.predict(X=X_test)

In [25]:
# Share of held-out rows the decision tree classified correctly.
accuracy_score(y_true=y_test, y_pred=pred_tree)

0.6335078534031413

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Base estimator for the random-forest grid search.
# random_state is fixed because bootstrap sampling and per-split feature sub-sampling are
# random; unseeded, best_params_ and the reported accuracy differ between runs.
rfc_model = RandomForestClassifier(random_state=42)

In [28]:
# Hyper-parameter candidates tried for the random forest: features considered per split
# and number of trees.
param_grid_rfc = dict(
    max_features=[2, 3, 4],
    n_estimators=[32, 64, 128, 256],
)

In [29]:
# Exhaustive search over param_grid_rfc for the random forest, with GridSearchCV's
# default cross-validation and scoring.
grid_rfc = GridSearchCV(estimator=rfc_model, param_grid=param_grid_rfc)

In [30]:
# Run the random-forest search on the training split only; the test split stays unseen.
grid_rfc.fit(X=X_train, y=y_train)

In [31]:
# Parameter combination from param_grid_rfc that the search selected for the random forest.
grid_rfc.best_params_

{'max_features': 2, 'n_estimators': 128}

In [32]:
# Predict the held-out rows with the random-forest search object.
pred_rfc = grid_rfc.predict(X=X_test)

In [33]:
# Share of held-out rows the random forest classified correctly.
accuracy_score(y_true=y_test, y_pred=pred_rfc)

0.6230366492146597

In [34]:
# Show the decision-tree grid search object, which the next cell writes to disk.
grid_search_tree

In [35]:
import joblib

# Persist the whole decision-tree GridSearchCV object (not only its best estimator)
# to model.pkl in the working directory.
joblib.dump(value=grid_search_tree, filename="model.pkl")

['model.pkl']

In [36]:
# Feature names, in the column order of the matrix the models above were fitted on.
X.columns

Index(['Income', 'Age', 'Education_Encoded', 'Home Owner'], dtype='object')