In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

# Load the data
data = pd.read_csv('Churn_Modelling.csv')

# EstimatedSalary will be the output (target) feature

# Drop irrelevant columns
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Check for missing values; rows containing any missing value are dropped
if data.isnull().any().any():
    print("数据包含缺失值，请处理缺失值。")
    data = data.dropna()
else:
    print("数据不包含缺失值。")

## Encode categorical data
# Gender is label-encoded in place
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

# Geography is one-hot encoded into one column per category
onehot_encoder_geo = OneHotEncoder()
geo_encoded = onehot_encoder_geo.fit_transform(data[['Geography']]).toarray()
# Reuse data's index: pd.concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign (and introduce NaN rows) whenever
# dropna() above removed rows and left gaps in data's index.
geo_encoded_df = pd.DataFrame(
    geo_encoded,
    columns=onehot_encoder_geo.get_feature_names_out(['Geography']),
    index=data.index,
)

# Combine the one-hot encoded columns with the original data, then drop the
# raw Geography column they replace
data = pd.concat([data, geo_encoded_df], axis=1)
data = data.drop(columns=['Geography'])
# Separate the regression target from the predictor columns
target_column = 'EstimatedSalary'
y = data[target_column]  # Target
X = data.drop(columns=[target_column])  # Features

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: fit on the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Record the feature column names on the scaler
scaler.feature_names_in_ = X.columns

# Persist the fitted encoders and the scaler for later use
artifacts = {
    'label_encoder_gender.pkl': label_encoder_gender,
    'onehot_encoder_geo.pkl': onehot_encoder_geo,
    'scaler.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
    
# ANN Regression problem statement
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the model: two ReLU hidden layers followed by a single linear unit
n_features = X_train.shape[1]
model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(1),  # Output layer for regression (no activation)
])

# Compile with Adam, optimizing mean squared error and tracking MAE
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

# Print the layer-by-layer summary of the model
model.summary()


from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

# Setup tensorboard: each run logs into its own timestamped sub-directory
log_dir = "regressionlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
# Early stopping callback: stop after 10 epochs without val_loss improvement
# and roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model.
# Validation data is carved out of the training set (last 20%) rather than
# reusing the test set: early stopping selects weights on val_loss, so
# validating on X_test would make the test metrics below optimistically biased.
model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_split=0.2,
    callbacks=[tensorboard_callback, early_stopping],
)

# Evaluate the model on the held-out test set, which training never saw
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss}')
print(f'Test MAE: {test_mae}')

# Save the trained model
model.save('regression_model.h5')

# Tell the user how to launch tensorboard from a terminal
print("To view the tensorboard, run the following command in the terminal:")
print("tensorboard --logdir regressionlogs/fit")
# Load the TensorBoard notebook extension (IPython line magic; only valid
# inside an IPython/Jupyter session, not in a plain Python script)
%load_ext tensorboard

# Launch tensorboard session, reading the logs under regressionlogs/fit
%tensorboard --logdir regressionlogs/fit

数据不包含缺失值。
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                832       
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,945
Trainable params: 2,945
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epo

Reusing TensorBoard on port 6006 (pid 59514), started 0:09:32 ago. (Use '!kill 59514' to kill it.)