# Model Training

In this notebook, we will explore training machine learning models on the available data.

## Objective

The objective could be:
- Predicting customer churn
- Recommending products to customers

## Data Preparation

```python
import pandas as pd

# Load data
customers = pd.read_csv('../data/Customers.csv')
purchase_history = pd.read_csv('../data/Purchase_History.csv')


In [90]:
# Load necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [91]:
# Load data
customers = pd.read_csv('../data/customers.csv')
purchase_history = pd.read_csv('../data/purchase_history.csv')

# Merge data to create a unified dataset
merged_data = pd.merge(customers, purchase_history, on='customer_id', how='inner')

In [92]:
# Calculate total amount spent by each customer
total_amount_spent = merged_data.groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_spent.columns = ['customer_id', 'total_amount_spent']

# Calculate number of purchases made by each customer
num_purchases = merged_data.groupby('customer_id').size().reset_index(name='num_purchases')

# Calculate average purchase amount per customer
avg_purchase_amount = merged_data.groupby('customer_id')['total_amount'].mean().reset_index()
avg_purchase_amount.columns = ['customer_id', 'avg_purchase_amount']

In [101]:
# Merge features into a single DataFrame
customer_features = pd.merge(total_amount_spent, num_purchases, on='customer_id', how='inner')
customer_features = pd.merge(customer_features, avg_purchase_amount, on='customer_id', how='inner')

In [102]:
# Assuming 'churn' is defined based on no purchase in the last 3 months
last_purchase_date = pd.to_datetime(merged_data.groupby('customer_id')['purchase_date'].max()).reset_index()
print((pd.Timestamp('now') - last_purchase_date['purchase_date']))
print(pd.Timedelta(days=180))
last_purchase_date['churn'] = ((pd.Timestamp('now') - last_purchase_date['purchase_date'])) > pd.Timedelta(days=90)

# Merge with customer features
customer_features = pd.merge(customer_features, last_purchase_date[['customer_id', 'churn']], on='customer_id', how='inner')

0       45 days 01:42:49.133571
1       30 days 01:42:49.133571
2       64 days 01:42:49.133571
3      119 days 01:42:49.133571
4       84 days 01:42:49.133571
                 ...           
4321    61 days 01:42:49.133571
4322   172 days 01:42:49.133571
4323    45 days 01:42:49.133571
4324     4 days 01:42:49.133571
4325   165 days 01:42:49.133571
Name: purchase_date, Length: 4326, dtype: timedelta64[ns]
180 days 00:00:00


In [103]:
# Assuming churn label based on some business rule (e.g., no purchase in the last 6 months)
y = customer_features['churn']
print(y)

0       False
1       False
2       False
3        True
4       False
        ...  
4321    False
4322     True
4323    False
4324    False
4325     True
Name: churn, Length: 4326, dtype: bool


In [104]:
# Split data into training and test sets
X = customer_features[['total_amount_spent', 'num_purchases', 'avg_purchase_amount']]
y = customer_features['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
# Initialize the model
model = LogisticRegression(random_state=42)

In [106]:
print(y_train.value_counts())

churn
False    2534
True      926
Name: count, dtype: int64


In [107]:
# Train the model
model.fit(X_train, y_train)

In [108]:
# Predict on the test set
y_pred = model.predict(X_test)

In [109]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, y_pred))

Model accuracy: 0.7494226327944573
              precision    recall  f1-score   support

       False       0.75      1.00      0.86       649
        True       0.00      0.00      0.00       217

    accuracy                           0.75       866
   macro avg       0.37      0.50      0.43       866
weighted avg       0.56      0.75      0.64       866



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
