-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
100 lines (74 loc) · 2.89 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import skops.io as sio
from prefect import flow, task
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
@task
def load_data(filename: str):
bank_df = pd.read_csv(filename, index_col="id", nrows=1000)
bank_df = bank_df.drop(["CustomerId", "Surname"], axis=1)
bank_df = bank_df.sample(frac=1)
return bank_df
@task
def preprocessing(bank_df: pd.DataFrame):
cat_col = [1, 2]
num_col = [0, 3, 4, 5, 6, 7, 8, 9]
# Filling missing categorical values
cat_impute = SimpleImputer(strategy="most_frequent")
bank_df.iloc[:, cat_col] = cat_impute.fit_transform(bank_df.iloc[:, cat_col])
# Filling missing numerical values
num_impute = SimpleImputer(strategy="median")
bank_df.iloc[:, num_col] = num_impute.fit_transform(bank_df.iloc[:, num_col])
# Encode categorical features as an integer array.
cat_encode = OrdinalEncoder()
bank_df.iloc[:, cat_col] = cat_encode.fit_transform(bank_df.iloc[:, cat_col])
# Scaling numerical values.
scaler = MinMaxScaler()
bank_df.iloc[:, num_col] = scaler.fit_transform(bank_df.iloc[:, num_col])
return bank_df
@task
def data_split(bank_df: pd.DataFrame):
# Splitting data into training and testing sets
X = bank_df.drop(["Exited"], axis=1)
y = bank_df.Exited
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=125
)
return X_train, X_test, y_train, y_test
# Identify numerical and categorical columns
@task
def train_model(X_train, X_test, y_train):
# Selecting the best features
KBest = SelectKBest(chi2, k="all")
X_train = KBest.fit_transform(X_train, y_train)
X_test = KBest.transform(X_test)
# Train the model
model = LogisticRegression(max_iter=1000, random_state=125)
model.fit(X_train, y_train)
return model
@task
def get_prediction(X_test, model: LogisticRegression):
return model.predict(X_test)
@task
def evaluate_model(y_test, prediction: pd.DataFrame):
accuracy = accuracy_score(y_test, prediction)
f1 = f1_score(y_test, prediction, average="macro")
print("Accuracy:", str(round(accuracy, 2) * 100) + "%", "F1:", round(f1, 2))
@task
def save_model(model: LogisticRegression):
sio.dump(model, "bank_model.skops")
@flow(log_prints=True)
def ml_workflow(filename: str = "train.csv"):
data = load_data(filename)
prep_data = preprocessing(data)
X_train, X_test, y_train, y_test = data_split(prep_data)
model = train_model(X_train, X_test, y_train)
predictions = get_prediction(X_test, model)
evaluate_model(y_test, predictions)
save_model(model)
if __name__ == "__main__":
ml_workflow()