In [None]:
import pandas as pd
import featuretools as ft
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Synthetic customer table, written one customer per row so each record can be
# read (and sanity-checked) at a glance.
_COLUMNS = ['customer_id', 'customer_age', 'transactions', 'total_spent', 'average_spent']
_ROWS = [
    (1, 34, 5, 250, 50),
    (2, 25, 2, 100, 50),
    (3, 45, 8, 400, 50),
    (4, 50, 10, 500, 50),
]

# Pivot the rows into the column -> list-of-values mapping that pandas consumes.
data = {name: list(values) for name, values in zip(_COLUMNS, zip(*_ROWS))}
df = pd.DataFrame(data)

# Show the whole table; it only has four rows.
print("Dataset:\n", df)


In [None]:
# Create an empty EntitySet to hold the customer table.
es = ft.EntitySet(id="customer_data")

# Register the main dataframe, keyed by `customer_id`.
# Featuretools 1.0 replaced `entity_from_dataframe(entity_id=...)` with
# `add_dataframe(dataframe_name=...)`. Probe for the newer method so this cell
# runs on either side of that rename instead of raising AttributeError.
if hasattr(es, "add_dataframe"):
    es.add_dataframe(
        dataframe_name="customers",
        dataframe=df,
        index="customer_id"
    )
else:
    es = es.entity_from_dataframe(
        entity_id="customers",
        dataframe=df,
        index="customer_id"
    )

print("\nEntitySet Created:")
print(es)


In [None]:
# Perform Deep Feature Synthesis on the "customers" table.
#
# Featuretools 1.0 renamed the `target_entity` keyword of `ft.dfs` to
# `target_dataframe_name`. `EntitySet.add_dataframe` arrived in that same
# release, so its presence tells us which keyword this installation accepts.
target_kwarg = "target_dataframe_name" if hasattr(es, "add_dataframe") else "target_entity"

features, feature_defs = ft.dfs(
    entityset=es,
    # NOTE(review): with a single table in the EntitySet there are no child
    # rows to aggregate, so these aggregation primitives are not expected to
    # contribute any features.
    agg_primitives=["mean", "sum", "count"],  # Aggregation primitives
    # The binary arithmetic primitives are registered under the `*_numeric`
    # names; the bare "divide" / "add" / "multiply" are not recognized
    # primitive names and make `ft.dfs` fail.
    trans_primitives=["divide_numeric", "add_numeric", "multiply_numeric"],  # Transformation primitives
    max_depth=2,  # Depth of feature generation
    **{target_kwarg: "customers"}
)

# Display generated features
print("\nGenerated Features:\n", features)


In [None]:
# Binary label: 1 when a customer's total spend is above 300, otherwise 0.
# NOTE(review): the label is a threshold on `total_spent`, and that column (plus
# anything derived from it) stays among the predictors below, so the reported
# accuracy reflects label leakage rather than genuine predictive skill.
features['is_high_spender'] = features['total_spent'].gt(300).astype(int)

# Separate the label from the predictors, then hold out 20% of the rows.
y = features['is_high_spender']
X = features.drop(columns=['is_high_spender'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Random Forest with a fixed seed so reruns give the same model.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy with Generated Features: {accuracy:.4f}")


In [None]:
# Save features, keeping the row index in the file.
# The feature matrix returned by ft.dfs is expected to be indexed by
# `customer_id`; writing with index=False would drop that identifier and the
# reloaded rows could no longer be tied back to a customer.
features.to_csv("generated_features.csv")
print("\nFeatures saved successfully!")

# Reload features, restoring the first CSV column as the index so the
# round trip gives back the same rows-by-columns layout.
loaded_features = pd.read_csv("generated_features.csv", index_col=0)
assert loaded_features.shape == features.shape, "CSV round trip changed the feature matrix shape"
print("Features reloaded successfully!")


In [None]:
import matplotlib.pyplot as plt

# Importance scores from the fitted forest, paired positionally with the
# column labels of the predictor matrix.
importance = rf.feature_importances_
features_list = X.columns

# One horizontal bar per feature, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features_list, importance, color="skyblue")
ax.set_xlabel("Importance")
ax.set_title("Feature Importance")
plt.show()
