## Importing all the sources of data

In [4]:
import pandas as pd
import boto3
from io import StringIO

# Define S3 bucket and the folder where data source CSVs are stored
bucket = "my-churnshield-data"
folder = "Data Sources/"

# Define a dictionary mapping source names to their CSV filenames
files = {
    "billing": "billing_data.csv",
    "hubspot": "hubspot_data.csv",
    "salesforce": "salesforce_data.csv",
    "zendesk": "zendesk_data.csv"
}

# Create an S3 client
s3_client = boto3.client("s3")

# Dictionary to store DataFrames
dataframes = {}

# Loop over the files and import each CSV into a Pandas DataFrame
for source, filename in files.items():
    key = folder + filename
    print(f"Loading {source} data from s3://{bucket}/{key}...")
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    data = obj["Body"].read().decode("utf-8")
    # Read CSV; adjust header=None and names=[...] if your CSVs lack headers
    df = pd.read_csv(StringIO(data))
    dataframes[source] = df
    print(f"{source.capitalize()} Data Sample:")
    print(df.head())
    print("\n")


Loading billing data from s3://my-churnshield-data/Data Sources/billing_data.csv...
Billing Data Sample:
                                  Billing_ID     UserID SubscriptionPlan  \
0  BILL-27eba3ee-e298-4348-98ed-9fa8474ff727  USER-8359            Basic   
1  BILL-6682bc84-8806-42b5-ab80-df08b7f5b5ca  USER-3216       Enterprise   
2  BILL-c3d9c47b-032f-4c7e-8322-6549ee14b4c8  USER-5794       Enterprise   
3  BILL-44884268-8956-44a1-ab2a-37fc9277f0c9  USER-6822       Enterprise   
4  BILL-edecccfc-da79-423f-b17c-f8952bbd559f  USER-1518       Enterprise   

   Amount Currency             TransactionDate  PaymentMethod PaymentStatus  
0   30.39      USD  2024-12-28T20:41:39.615550  Wire Transfer       Pending  
1  145.33      USD  2024-11-28T00:04:24.702024         PayPal        Failed  
2  491.75      USD  2024-08-22T14:09:02.394898  Wire Transfer        Failed  
3  185.83      USD  2024-03-22T16:31:45.238658    Credit Card       Pending  
4  183.16      USD  2025-01-06T13:06:24.559926  

## Salesforce 

In [11]:
pip install Faker

Collecting Faker
  Using cached Faker-36.1.1-py3-none-any.whl.metadata (15 kB)
Using cached Faker-36.1.1-py3-none-any.whl (1.9 MB)
Installing collected packages: Faker
Successfully installed Faker-36.1.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import random
import uuid
from faker import Faker

fake = Faker()

num_rows = 100  # synthetic rows 

synthetic_data = []
for _ in range(num_rows):
    salesforce_id = "SF-" + str(uuid.uuid4())[:8]  # short random ID
    created_date = fake.date_time_between(start_date="-2y", end_date="now").isoformat()
    first_name = fake.first_name()
    last_name = fake.last_name()
    email = fake.email()
    phone = fake.phone_number()
    company = fake.company()
    lead_source = random.choice(["Trade Show", "Cold Call", "Referral", "Web", "Partner"])
    status = random.choice(["New", "Working", "Nurturing", "Converted", "Closed - Lost"])
    churn = random.choice([0, 1])  # random churn label

    synthetic_data.append([
        salesforce_id,
        created_date,
        first_name,
        last_name,
        email,
        phone,
        company,
        lead_source,
        status,
        churn
    ])

columns = [
    "Salesforce_ID", "CreatedDate", "FirstName", "LastName",
    "Email", "Phone", "Company", "LeadSource", "Status", "Churn"
]
df_synthetic = pd.DataFrame(synthetic_data, columns=columns)

# Save to CSV
df_synthetic.to_csv("salesforce_data_synthetic.csv", index=False)
print("Fully synthetic Salesforce-like data created: salesforce_data_synthetic.csv")


Fully synthetic Salesforce-like data created: salesforce_data_synthetic.csv


In [7]:
## TRAINING THE MODEL
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# 1. Load synthetic data from CSV
df = pd.read_csv("salesforce_data_synthetic.csv")
print("Data Sample:")
print(df.head())

# 2. Select features and target
# We'll use 'LeadSource' and 'Status' as features, and 'Churn' as the target.
features = df[["LeadSource", "Status"]]
target = df["Churn"]

# 3. One-hot encode the categorical features
features_encoded = pd.get_dummies(features)
print("\nEncoded Features Sample:")
print(features_encoded.head())
# Save the columns as a list
training_columns = list(features_encoded.columns)
print("Training columns:", training_columns)

# Store them in a file or Python variable for reuse
import json
with open("salesforce_columns.json", "w") as f:
    json.dump(training_columns, f)

# 4. Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# 5. Train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# 6. Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 7. Save the trained model to a file for later use
joblib.dump(clf, "salesforce_model.joblib")
print("Salesforce model saved to salesforce_model.joblib")


Data Sample:
  Salesforce_ID                 CreatedDate FirstName   LastName  \
0   SF-f135acaf  2024-12-31T16:39:51.357132   Melissa  Rodriguez   
1   SF-f0a2cd3c  2023-03-14T20:05:52.240183    Adrian      Chang   
2   SF-89c87ae8  2024-10-13T18:24:38.363084   Michael      Craig   
3   SF-2cb14743  2024-07-28T15:27:55.404905     Holly     Larson   
4   SF-1906782f  2023-03-03T19:16:57.754639      Lisa      Allen   

                         Email                Phone           Company  \
0     burgesscraig@example.org  (659)733-3378x70650       Maxwell Ltd   
1        seanmunoz@example.org        (985)734-3832       Wong-Becker   
2     douglaskaren@example.org    399-626-4173x8661   Fisher and Sons   
3        vincent27@example.com         751.484.6636      Hamilton LLC   
4  chambersmatthew@example.net         273.690.7188  Kaufman-Cardenas   

   LeadSource     Status  Churn  
0    Referral    Working      0  
1         Web    Working      1  
2         Web  Converted      1  
3  

In [1]:
import json
import boto3


#SageMaker endpoint name
ENDPOINT_NAME = "sagemaker-xgboost-2025-02-20-03-57-21-973"

# Create a SageMaker runtime client
sagemaker_runtime = boto3.client("sagemaker-runtime", region_name="us-east-2")

def lambda_handler(event, context):
    try:
        # Parse the request body (assumes application/json content type)
        body = json.loads(event.get("body", "{}"))
        features = body.get("features")  # e.g., a list of feature values

        if not features:
            return {
                "statusCode": 400,
                "body": json.dumps({"error": "Missing 'features' in request"})
            }

        # Convert feature list to a CSV-formatted string (as expected by the model)
        csv_payload = ",".join(map(str, features))

        # Invoke the SageMaker endpoint with the CSV payload
        response = sagemaker_runtime.invoke_endpoint(
            EndpointName=ENDPOINT_NAME,
            ContentType="text/csv",
            Body=csv_payload
        )

        # Read the prediction response from the model
        prediction = response["Body"].read().decode("utf-8")

        # Return the prediction as JSON
        return {
            "statusCode": 200,
            "body": json.dumps({
                "prediction": prediction,
                "features": features
            })
        }

    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"error": str(e)})
        }


# Create a SageMaker runtime client
sagemaker_runtime = boto3.client("sagemaker-runtime", region_name="us-east-2")

def lambda_handler(event, context):
    try:
        # Parse the request body (assumes application/json content type)
        body = json.loads(event.get("body", "{}"))
        features = body.get("features")  # e.g., a list of feature values

        if not features:
            return {
                "statusCode": 400,
                "body": json.dumps({"error": "Missing 'features' in request"})
            }

        # Convert feature list to a CSV-formatted string (as expected by the model)
        csv_payload = ",".join(map(str, features))

        # Invoke the SageMaker endpoint with the CSV payload
        response = sagemaker_runtime.invoke_endpoint(
            EndpointName=ENDPOINT_NAME,
            ContentType="text/csv",
            Body=csv_payload
        )

        # Read the prediction response from the model
        prediction = response["Body"].read().decode("utf-8")

        # Return the prediction as JSON
        return {
            "statusCode": 200,
            "body": json.dumps({
                "prediction": prediction,
                "features": features
            })
        }

    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"error": str(e)})
        }
        
print("Done")

Done


In [6]:
from IPython.display import FileLink

# For the raw .joblib file:
display(FileLink('salesforce_model.joblib'))

In [8]:
## Using model to predict Salesforce churn

import pandas as pd
import boto3
from io import StringIO
import joblib
import json

# -------------------------------
# Step 1: Download client's Salesforce data from S3
# -------------------------------
bucket = "my-churnshield-data"
key = "Data Sources/salesforce_data.csv"

s3_client = boto3.client("s3", region_name="us-east-2")
obj = s3_client.get_object(Bucket=bucket, Key=key)
data = obj["Body"].read().decode("utf-8")

# Load the CSV data into a DataFrame
df_clients = pd.read_csv(StringIO(data))
print("Client Salesforce Data Sample:")
print(df_clients.head())

# -------------------------------
# Step 2: Preprocess the data to extract features
# -------------------------------
# We will use only 'LeadSource' and 'Status' as the model features.
features = df_clients[["LeadSource", "Status"]]

# One-hot encode the categorical features
features_encoded = pd.get_dummies(features)
print("\nEncoded Client Features Sample:")
print(features_encoded.head())

# -------------------------------
# Step 3: Load the training columns from JSON
# -------------------------------
# This JSON file was created during training, containing the exact dummy columns.
with open("salesforce_columns.json", "r") as f:
    expected_columns = json.load(f)

print("\nColumns from training (salesforce_columns.json):")
print(expected_columns)

# Reindex the encoded DataFrame to have the expected columns, filling missing columns with 0
features_encoded = features_encoded.reindex(columns=expected_columns, fill_value=0)
print("\nReindexed Encoded Features (should have", len(expected_columns), "columns):")
print(features_encoded.head())

# -------------------------------
# Step 4: Load the trained Salesforce-specific model
# -------------------------------
model = joblib.load("salesforce_model.joblib")

# -------------------------------
# Step 5: Generate churn predictions
# -------------------------------
predictions = model.predict(features_encoded)
df_clients["Predicted_Churn"] = predictions

# Display a few predictions alongside key columns
print("\nPredictions on Client Data:")
print(df_clients[["Salesforce_ID", "LeadSource", "Status", "Predicted_Churn"]].head())

# -------------------------------
# Step 6: Save the DataFrame with predictions to a new CSV file
# -------------------------------
output_file = "salesforce_data_with_predictions.csv"
df_clients.to_csv(output_file, index=False)
print("\nSaved predictions to", output_file)


Client Salesforce Data Sample:
                             Salesforce_ID                 CreatedDate  \
0  SF-922e8ed5-0bf3-4b2f-8b01-46504b5695b9  2024-08-25T13:56:14.111596   
1  SF-d4eee776-5278-4335-90fd-a18a57565da6  2024-12-10T20:11:12.933288   
2  SF-eab2ec1f-a41d-4c16-a727-3a821d381ac0  2024-08-06T08:45:38.398177   
3  SF-a679d3c4-e9db-4573-a7b5-10f4b28150c8  2024-06-03T16:55:19.829209   
4  SF-4fcb66c4-eee6-4926-9d1d-f529bffa916d  2025-01-05T03:47:36.146496   

   FirstName LastName                         Email                  Phone  \
0   Patricia  Trevino      robertsjacob@example.org  +1-266-594-2273x09353   
1      Ricky    Brady         michael39@example.net          (249)214-2262   
2    Annette    Hines  ronaldrichardson@example.net  001-488-964-5881x1286   
3    Annette  Hoffman         william67@example.net   +1-396-457-5053x8910   
4  Cassandra   Cortez        shawnasims@example.org  001-481-634-0644x9584   

           Company  LeadSource         Status  
0     T

In [9]:
import json
import boto3
import pandas as pd
from io import StringIO

# Configuration
BUCKET = "my-churnshield-data"
KEY = "Predictions/salesforce_data_with_predictions.csv"  

def lambda_handler(event, context):
    try:
        # Create S3 client
        s3_client = boto3.client("s3", region_name="us-east-2")
        
        # Get the CSV file from S3
        obj = s3_client.get_object(Bucket=BUCKET, Key=KEY)
        data = obj["Body"].read().decode("utf-8")
        
        # Read CSV into a DataFrame
        df = pd.read_csv(StringIO(data))
        
        # Optionally, can convert the DataFrame to JSON
        # For example, converting the entire DataFrame to a list of dictionaries:
        predictions = df.to_dict(orient="records")
        
        # Return the predictions as a JSON response
        return {
            "statusCode": 200,
            "body": json.dumps({
                "predictions": predictions
            }),
            "headers": {
                "Content-Type": "application/json",
                "Access-Control-Allow-Origin": "*"  
            }
        }
    
    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"error": str(e)}),
            "headers": {
                "Content-Type": "application/json",
                "Access-Control-Allow-Origin": "*"
            }
        }


## Hubspot

In [12]:
## SYNTHETIC DATA
import pandas as pd
import random
import uuid
from faker import Faker

fake = Faker()

num_rows = 100
lifecycle_stages = [
    "Lead",
    "Opportunity",
    "Marketing Qualified Lead",
    "Sales Qualified Lead",
    "Customer",
    "Subscriber"
]

synthetic_data = []
for _ in range(num_rows):
    hubspot_id = "HS-" + str(uuid.uuid4())[:8]
    created_on = fake.date_time_between(start_date="-2y", end_date="now").isoformat()
    first_name = fake.first_name()
    last_name = fake.last_name()
    email = fake.email()
    lifecycle_stage = random.choice(lifecycle_stages)
    website = fake.url()
    
    lead_score = random.randint(0, 100)

    # lead_score looks indicative of churn here and we may be able to attribute it to churn. 
    # To create more "real life" data, let's go with that pattern here
    # If lead_score < 30 => more likely to churn
    if lead_score < 30:
        churn = 1
    else:
        # Maybe 20% chance of churn otherwise
        churn = 1 if random.random() < 0.2 else 0

    synthetic_data.append([
        hubspot_id,
        created_on,
        first_name,
        last_name,
        email,
        lifecycle_stage,
        website,
        lead_score,
        churn
    ])

columns = [
    "HubSpot_ID", "CreatedOn", "FirstName", "LastName",
    "Email", "LifecycleStage", "Website", "LeadScore", "Churn"
]

df_synthetic = pd.DataFrame(synthetic_data, columns=columns)
df_synthetic.to_csv("hubspot_data_synthetic.csv", index=False)
print("Synthetic Hubspot data with churn created: hubspot_data_synthetic.csv")


Synthetic Hubspot data with churn created: hubspot_data_synthetic_with_churn.csv


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import json

# 1. Load synthetic Hubspot data from CSV
df = pd.read_csv("hubspot_data_synthetic.csv")
print("Data Sample:")
print(df.head())

# 2. Select features and target
# We'll use 'LifecycleStage' (categorical) and 'LeadScore' (numeric) as features, and 'Churn' as the target.
features = df[["LifecycleStage", "LeadScore"]]
target = df["Churn"]

# 3. One-hot encode the 'LifecycleStage' column
# Keep 'LeadScore' as is, since it's numeric
lifecycle_encoded = pd.get_dummies(features["LifecycleStage"], prefix="LifecycleStage")

# Combine the encoded lifecycle columns with the numeric LeadScore
features_encoded = pd.concat([lifecycle_encoded, features["LeadScore"]], axis=1)

print("\nEncoded Features Sample:")
print(features_encoded.head())

# Save the columns as a list for inference-time reindexing
training_columns = list(features_encoded.columns)
print("Training columns:", training_columns)

# Store them in a JSON file for reuse during inference
with open("hubspot_columns.json", "w") as f:
    json.dump(training_columns, f)

# 4. Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# 5. Train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# 6. Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 7. Save the trained model to a file for later use
joblib.dump(clf, "hubspot_model.joblib")
print("\nHubspot model saved to hubspot_model.joblib")


Data Sample:
    HubSpot_ID                   CreatedOn FirstName  LastName  \
0  HS-c62ae2ee  2023-07-10T15:01:28.152665    Jeremy     Sharp   
1  HS-d3ce946e  2025-02-13T17:46:33.654093    Angela     Heath   
2  HS-b386c5a7  2024-09-30T05:47:46.026878     Wendy    Bowers   
3  HS-da142366  2024-12-17T21:01:46.441043     David  Thompson   
4  HS-b5fac00d  2024-07-21T20:44:24.460131    Kristi      Reed   

                         Email            LifecycleStage  \
0          maria28@example.com                  Customer   
1     robertgarcia@example.com               Opportunity   
2     cortezjustin@example.net  Marketing Qualified Lead   
3           wellis@example.com  Marketing Qualified Lead   
4  fergusondeborah@example.net                  Customer   

                          Website  LeadScore  Churn  
0              http://watson.org/         78      0  
1         https://www.murray.com/         35      0  
2  http://www.mcdaniel-gomez.org/         91      0  
3       https

In [16]:
from IPython.display import FileLink

# For the raw .joblib file:
display(FileLink('hubspot_model.joblib'))

In [17]:
import pandas as pd
import boto3
from io import StringIO
import joblib
import json

# -------------------------------
# Step 1: Download client's Hubspot data from S3
# -------------------------------
bucket = "my-churnshield-data"
key = "Data Sources/hubspot_data.csv"  

s3_client = boto3.client("s3", region_name="us-east-2")
obj = s3_client.get_object(Bucket=bucket, Key=key)
data = obj["Body"].read().decode("utf-8")

# Load the CSV data into a DataFrame
df_clients = pd.read_csv(StringIO(data))
print("Client Hubspot Data Sample:")
print(df_clients.head())

# -------------------------------
# Step 2: Preprocess the data to extract features
# -------------------------------
# For Hubspot, we'll use 'LifecycleStage' (categorical) and 'LeadScore' (numeric) as features.
features = df_clients[["LifecycleStage", "LeadScore"]]

# One-hot encode the 'LifecycleStage' column; keep 'LeadScore' numeric.
lifecycle_encoded = pd.get_dummies(features["LifecycleStage"], prefix="LifecycleStage")
features_encoded = pd.concat([lifecycle_encoded, features["LeadScore"]], axis=1)
print("\nEncoded Client Features Sample:")
print(features_encoded.head())

# -------------------------------
# Step 3: Load the training columns from JSON
# -------------------------------
# This JSON file was created during training, containing the exact dummy columns.
with open("hubspot_columns.json", "r") as f:
    expected_columns = json.load(f)

print("\nColumns from training (hubspot_columns.json):")
print(expected_columns)

# Reindex the encoded DataFrame to have the expected columns, filling missing columns with 0
features_encoded = features_encoded.reindex(columns=expected_columns, fill_value=0)
print("\nReindexed Encoded Features (should have", len(expected_columns), "columns):")
print(features_encoded.head())

# -------------------------------
# Step 4: Load the trained Hubspot-specific model
# -------------------------------
model = joblib.load("hubspot_model.joblib")

# -------------------------------
# Step 5: Generate churn predictions
# -------------------------------
predictions = model.predict(features_encoded)
df_clients["Predicted_Churn"] = predictions

# Display a few predictions alongside key columns
print("\nPredictions on Client Hubspot Data:")
print(df_clients[["HubSpot_ID", "LifecycleStage", "LeadScore", "Predicted_Churn"]].head())

# -------------------------------
# Step 6: Save the DataFrame with predictions to a new CSV file
# -------------------------------
output_file = "hubspot_data_with_predictions.csv"
df_clients.to_csv(output_file, index=False)
print("\nSaved predictions to", output_file)


Client Hubspot Data Sample:
                                HubSpot_ID                   CreatedOn  \
0  HS-1c5e17ac-28ad-4771-b385-62a710336f69  2024-08-27T21:03:33.090348   
1  HS-451b47c1-0ed0-4da4-93a4-f1b3a8ebf219  2025-01-03T04:35:43.780510   
2  HS-f2ed3f7c-a4ca-40f1-aba3-a2ca6ba1bf6e  2024-07-10T06:11:38.096262   
3  HS-54afff7f-a327-4fb5-a8b9-60b9c036d567  2025-01-04T15:03:45.793348   
4  HS-526b530f-5c09-407d-9c33-673fd4a516e2  2024-11-01T23:27:35.954706   

  FirstName  LastName                   Email            LifecycleStage  \
0     Molly  Mitchell     lrhodes@example.net                      Lead   
1  Kimberly   Krueger   matthew74@example.net               Opportunity   
2    Steven    Oliver    bonnie60@example.org  Marketing Qualified Lead   
3      Mark      Rush    randrews@example.net      Sales Qualified Lead   
4       Amy    Chaney  colinblake@example.org                      Lead   

                         Website  LeadScore  
0  https://edwards-robinson.bi

In [18]:
import json
import boto3
import pandas as pd
from io import StringIO

# Configuration for Hubspot predictions
BUCKET = "my-churnshield-data"
KEY = "Predictions/hubspot_data_with_predictions.csv"  

def lambda_handler(event, context):
    try:
        # Create an S3 client in the correct region
        s3_client = boto3.client("s3", region_name="us-east-2")
        
        # Retrieve the CSV file from S3
        obj = s3_client.get_object(Bucket=BUCKET, Key=KEY)
        data = obj["Body"].read().decode("utf-8")
        
        # Load the CSV data into a pandas DataFrame
        df = pd.read_csv(StringIO(data))
        
        # Convert the DataFrame into a list of dictionaries
        predictions = df.to_dict(orient="records")
        
        # Return the predictions as a JSON response with appropriate headers
        return {
            "statusCode": 200,
            "body": json.dumps({
                "predictions": predictions
            }),
            "headers": {
                "Content-Type": "application/json",
                "Access-Control-Allow-Origin": "*"  
            }
        }
    
    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"error": str(e)}),
            "headers": {
                "Content-Type": "application/json",
                "Access-Control-Allow-Origin": "*"
            }
        }


## Zendesk

In [22]:
import nltk

# Download the VADER lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /home/sagemaker-
[nltk_data]     user/nltk_data...


True

In [25]:
## CREATING SYNTHETIC DATA

import pandas as pd
import random
import uuid
from faker import Faker
from datetime import timedelta
import nltk
nltk.download('vader.lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from io import StringIO

fake = Faker()

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

num_rows = 100
statuses = ["Open", "Solved", "New", "Pending", "Closed"]
priorities = ["Low", "Normal", "High", "Urgent"]

def generate_random_sentence():
    """
    Generate a short random sentence using Faker for more variety.
    Can also inject specific negative/positive words.
    """
    # We'll just use a couple of random sentences from Faker
    sentence1 = fake.sentence(nb_words=6)
    sentence2 = fake.sentence(nb_words=6)
    return sentence1 + " " + sentence2

synthetic_data = []
for _ in range(num_rows):
    ticket_id = "ZD-" + str(uuid.uuid4())[:8]
    
    # Random submitted time in the last 2 years
    submitted_at = fake.date_time_between(start_date="-2y", end_date="now")
    # updated_at is some random time after submitted_at (up to 30 days)
    updated_at = submitted_at + timedelta(days=random.randint(0, 30))

    user_id = "USER-" + str(random.randint(1000, 9999))
    
    # Generate subject + description
    subject_text = generate_random_sentence()
    desc_text = generate_random_sentence()

    status = random.choice(statuses)
    priority = random.choice(priorities)
    agent_name = fake.name()

    # Combine subject and description for sentiment analysis
    combined_text = subject_text + " " + desc_text
    
    # Use VADER to get sentiment scores
    scores = sid.polarity_scores(combined_text)
    compound_score = scores['compound']
    
    # Decide churn based on compound score
    # Can adjust threshold to preference (e.g., -0.2, -0.1, etc.)
    if compound_score < -0:
        churn = 1
    else:
        churn = 0

    synthetic_data.append([
        ticket_id,
        submitted_at.isoformat(),
        updated_at.isoformat(),
        user_id,
        subject_text,
        desc_text,
        status,
        priority,
        agent_name,
        churn
    ])

columns = [
    "Ticket_ID", "SubmittedAt", "UpdatedAt", "UserID",
    "TicketSubject", "TicketDescription", "Status", "Priority", "Agent",
    "Churn"
]

df_synthetic = pd.DataFrame(synthetic_data, columns=columns)
df_synthetic.to_csv("zendesk_data_synthetic.csv", index=False)
print("Synthetic Zendesk data with sentiment-based churn created: zendesk_data_synthetic.csv")


Synthetic Zendesk data with sentiment-based churn created: zendesk_data_synthetic.csv


[nltk_data] Error loading vader.lexicon: Package 'vader.lexicon' not
[nltk_data]     found in index


In [26]:
## TRAINING MODEL

import pandas as pd
import joblib
import json
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load synthetic Zendesk data from CSV
df = pd.read_csv("zendesk_data_synthetic.csv")
print("Zendesk Data Sample:")
print(df.head())

# 2. Preprocess: Combine TicketSubject and TicketDescription for sentiment analysis
df["CombinedText"] = df["TicketSubject"] + " " + df["TicketDescription"]

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Compute the compound sentiment score for each ticket
df["CompoundScore"] = df["CombinedText"].apply(lambda text: sid.polarity_scores(text)["compound"])

# For modeling, we'll use the CompoundScore as the single feature.
features = df[["CompoundScore"]]
target = df["Churn"]

print("\nFeature Sample:")
print(features.head())

# 3. Save the training columns for later inference
# (In this case, it will simply be ["CompoundScore"])
training_columns = list(features.columns)
with open("zendesk_columns.json", "w") as f:
    json.dump(training_columns, f)
print("Training columns:", training_columns)

# 4. Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# 5. Train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# 6. Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 7. Save the trained model to a file for later use
joblib.dump(clf, "zendesk_model.joblib")
print("\nZendesk model saved to zendesk_model.joblib")


Zendesk Data Sample:
     Ticket_ID                 SubmittedAt                   UpdatedAt  \
0  ZD-9e8d9cc8  2024-10-27T20:38:32.780454  2024-11-25T20:38:32.780454   
1  ZD-9831a2e7  2024-12-11T05:53:12.712889  2024-12-27T05:53:12.712889   
2  ZD-002ee304  2024-12-09T23:32:20.494166  2025-01-03T23:32:20.494166   
3  ZD-b9194fdd  2024-08-10T10:38:10.863898  2024-08-17T10:38:10.863898   
4  ZD-17615042  2024-09-04T11:51:36.325238  2024-09-18T11:51:36.325238   

      UserID                                      TicketSubject  \
0  USER-2163  Affect change evidence and sing table. Electio...   
1  USER-3743  Section off everything turn artist tell. Study...   
2  USER-6612  Try school still one reality politics rest. Wo...   
3  USER-4100  Contain popular one open better lot someone. P...   
4  USER-1748  Dinner past long cup purpose hot. Nature assum...   

                                   TicketDescription   Status Priority  \
0  Artist drug huge room under. Itself guess of m...     

In [27]:
from IPython.display import FileLink

# For the raw .joblib file:
display(FileLink('zendesk_model.joblib'))

In [30]:
## Creating predictions for Zendesk

import pandas as pd
import boto3
from io import StringIO
import joblib
import json
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# -------------------------------
# Step 1: Download client's Zendesk data from S3
# -------------------------------
bucket = "my-churnshield-data"
key = "Data Sources/zendesk_data.csv" 

s3_client = boto3.client("s3", region_name="us-east-2")
obj = s3_client.get_object(Bucket=bucket, Key=key)
data = obj["Body"].read().decode("utf-8")

# Load the CSV data into a DataFrame
df_clients = pd.read_csv(StringIO(data))
print("Client Zendesk Data Sample:")
print(df_clients.head())

# -------------------------------
# Step 2: Preprocess data to extract features
# -------------------------------
# Combine TicketSubject and TicketDescription for sentiment analysis
df_clients["CombinedText"] = df_clients["TicketSubject"] + " " + df_clients["TicketDescription"]

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Compute the compound sentiment score for each ticket
df_clients["CompoundScore"] = df_clients["CombinedText"].apply(lambda text: sid.polarity_scores(text)["compound"])

# For modeling, we'll use "CompoundScore" as our feature
features_encoded = df_clients[["CompoundScore"]]
print("\nEncoded Zendesk Features Sample:")
print(features_encoded.head())

# -------------------------------
# Step 3: Load the training columns from JSON
# -------------------------------
# This file should contain the exact feature columns used during training (e.g., ["CompoundScore"])
with open("zendesk_columns.json", "r") as f:
    expected_columns = json.load(f)

print("\nColumns from training (zendesk_columns.json):")
print(expected_columns)

# Reindex the DataFrame to match the expected columns
features_encoded = features_encoded.reindex(columns=expected_columns, fill_value=0)
print("\nReindexed Encoded Features (should have", len(expected_columns), "columns):")
print(features_encoded.head())

# -------------------------------
# Step 4: Load the trained Zendesk-specific model
# -------------------------------
model = joblib.load("zendesk_model.joblib")

# -------------------------------
# Step 5: Generate churn predictions
# -------------------------------
predictions = model.predict(features_encoded)
df_clients["Predicted_Churn"] = predictions

# Display a few predictions alongside key columns
print("\nPredictions on Client Zendesk Data:")
print(df_clients[["Ticket_ID", "TicketSubject", "TicketDescription", "Predicted_Churn"]].head())

# -------------------------------
# Step 6: Save the DataFrame with predictions to a new CSV file
# -------------------------------
output_file = "zendesk_data_with_predictions.csv"
df_clients.to_csv(output_file, index=False)
print("\nSaved predictions to", output_file)


Client Zendesk Data Sample:
                                 Ticket_ID                 SubmittedAt  \
0  ZD-b59bc9dc-a911-4ed5-b286-47dcd465d527  2024-07-03T05:21:11.206803   
1  ZD-7bec3e64-dad6-45f2-8b95-f12d6fed915a  2024-12-08T22:54:35.805105   
2  ZD-cf900bc2-7663-43fc-b425-4e2f7611127c  2024-05-07T00:16:50.778418   
3  ZD-0d561782-be40-4f1d-afe8-f5a0dd95a32b  2025-01-12T16:50:56.347338   
4  ZD-1d6c76a9-4761-4ec0-b020-36b127a36181  2024-09-03T14:44:16.091109   

                    UpdatedAt     UserID  \
0  2024-07-03T05:21:11.206803  USER-7191   
1  2025-01-05T22:54:35.805105  USER-4471   
2  2024-05-13T00:16:50.778418  USER-7103   
3  2025-01-26T16:50:56.347338  USER-6479   
4  2024-09-06T14:44:16.091109  USER-9602   

                           TicketSubject  \
0  Trouble box center ok accept science.   
1      Always attorney individual scene.   
2            How safe so claim left own.   
3            Imagine face evidence road.   
4  Direction leave technology important.  

## Billing

In [31]:
## CREATING SYNTHETIC DATA

import pandas as pd
import random
import uuid
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()

num_rows = 100

# Possible subscription plans
subscription_plans = ["Basic", "Standard", "Premium", "Enterprise"]

# Payment methods and statuses
payment_methods = ["Credit Card", "Wire Transfer", "PayPal"]
payment_statuses = ["Completed", "Pending", "Failed"]

synthetic_data = []

for _ in range(num_rows):
    billing_id = "BILL-" + str(uuid.uuid4())[:8]  # short random ID
    user_id = "USER-" + str(random.randint(1000, 9999))

    # Random subscription plan
    subscription_plan = random.choice(subscription_plans)

    # Generate a random amount, e.g. from 10 to 500
    amount = round(random.uniform(10, 500), 2)

    currency = "USD"

    # Random transaction date in the last 2 years
    transaction_date = fake.date_time_between(start_date="-2y", end_date="now").isoformat()

    # Random payment method and status
    payment_method = random.choice(payment_methods)
    payment_status = random.choice(payment_statuses)

    # ----- Churn Logic -----
    # Example approach:
    #  - If payment_status == "Failed" => high chance churn=1
    #  - If payment_status == "Completed" => likely churn=0
    #  - If subscription_plan == "Basic" => slightly higher chance of churn
    #  - Otherwise random

    # Start with a base churn probability
    churn_prob = 0.2

    # Increase probability if subscription plan is Basic
    if subscription_plan == "Basic":
        churn_prob += 0.2

    # Payment status weighting
    if payment_status == "Failed":
        churn_prob += 0.5  # Much higher chance
    elif payment_status == "Completed":
        churn_prob -= 0.1  # Less chance

    # Cap churn_prob between 0 and 1
    churn_prob = min(max(churn_prob, 0), 1)

    # Decide churn based on final churn_prob
    churn = 1 if random.random() < churn_prob else 0

    synthetic_data.append([
        billing_id,
        user_id,
        subscription_plan,
        amount,
        currency,
        transaction_date,
        payment_method,
        payment_status,
        churn
    ])

columns = [
    "Billing_ID",
    "UserID",
    "SubscriptionPlan",
    "Amount",
    "Currency",
    "TransactionDate",
    "PaymentMethod",
    "PaymentStatus",
    "Churn"
]

df_synthetic = pd.DataFrame(synthetic_data, columns=columns)
df_synthetic.to_csv("billing_data_synthetic.csv", index=False)
print("Synthetic Billing data created: billing_data_synthetic.csv")


Synthetic Billing data created: billing_data_synthetic.csv


In [32]:
import pandas as pd
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load synthetic Billing data from CSV
df = pd.read_csv("billing_data_synthetic.csv")
print("Billing Data Sample:")
print(df.head())

# 2. Select features and target
# We'll use 'SubscriptionPlan' (categorical), 'PaymentStatus' (categorical), and 'Amount' (numeric).
features = df[["SubscriptionPlan", "PaymentStatus", "Amount"]]
target = df["Churn"]

# 3. One-hot encode the categorical columns
subscription_encoded = pd.get_dummies(features["SubscriptionPlan"], prefix="Plan")
payment_encoded = pd.get_dummies(features["PaymentStatus"], prefix="PayStatus")

# Combine these with the numeric 'Amount' column
features_encoded = pd.concat([subscription_encoded, payment_encoded, features["Amount"]], axis=1)

print("\nEncoded Features Sample:")
print(features_encoded.head())

# 4. Save the training columns for inference
training_columns = list(features_encoded.columns)
with open("billing_columns.json", "w") as f:
    json.dump(training_columns, f)
print("Training columns:", training_columns)

# 5. Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# 6. Train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# 7. Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 8. Save the trained model to a file for later use
joblib.dump(clf, "billing_model.joblib")
print("\nBilling model saved to billing_model.joblib")


Billing Data Sample:
      Billing_ID     UserID SubscriptionPlan  Amount Currency  \
0  BILL-c7053b74  USER-2130       Enterprise   71.16      USD   
1  BILL-f6fe7e56  USER-2625          Premium  324.95      USD   
2  BILL-dc5ebe15  USER-2433         Standard  277.07      USD   
3  BILL-11e27330  USER-5514          Premium  103.73      USD   
4  BILL-2c1d0e98  USER-2412          Premium  481.60      USD   

              TransactionDate  PaymentMethod PaymentStatus  Churn  
0  2024-07-15T17:09:28.759243    Credit Card       Pending      1  
1  2023-12-24T10:07:01.570484  Wire Transfer        Failed      0  
2  2024-09-24T16:10:16.351231    Credit Card     Completed      0  
3  2023-06-04T08:35:06.576073    Credit Card     Completed      0  
4  2025-02-19T06:57:55.954580  Wire Transfer       Pending      0  

Encoded Features Sample:
   Plan_Basic  Plan_Enterprise  Plan_Premium  Plan_Standard  \
0       False             True         False          False   
1       False            Fal

In [33]:
from IPython.display import FileLink

# For the raw .joblib file:
display(FileLink('billing_model.joblib'))

In [34]:
import pandas as pd
import boto3
from io import StringIO
import joblib
import json

# -------------------------------
# Step 1: Download client's Billing data from S3
# -------------------------------
bucket = "my-churnshield-data"
key = "Data Sources/billing_data.csv"  # Adjust if your file name/path is different

s3_client = boto3.client("s3", region_name="us-east-2")
obj = s3_client.get_object(Bucket=bucket, Key=key)
data = obj["Body"].read().decode("utf-8")

# Load the CSV data into a DataFrame
df_clients = pd.read_csv(StringIO(data))
print("Client Billing Data Sample:")
print(df_clients.head())

# -------------------------------
# Step 2: Preprocess data to extract features
# -------------------------------
# We have 'SubscriptionPlan' (categorical), 'PaymentStatus' (categorical), and 'Amount' (numeric).
subscription_encoded = pd.get_dummies(df_clients["SubscriptionPlan"], prefix="Plan")
payment_encoded = pd.get_dummies(df_clients["PaymentStatus"], prefix="PayStatus")

# Combine the encoded columns with the numeric 'Amount' column
features_encoded = pd.concat([subscription_encoded, payment_encoded, df_clients["Amount"]], axis=1)
print("\nEncoded Billing Features Sample:")
print(features_encoded.head())

# -------------------------------
# Step 3: Load the training columns from JSON
# -------------------------------
# This file should contain the exact feature columns used during training 
# (e.g., ["Plan_Basic", "Plan_Standard", ..., "PayStatus_Completed", ..., "Amount"])
with open("billing_columns.json", "r") as f:
    expected_columns = json.load(f)

print("\nColumns from training (billing_columns.json):")
print(expected_columns)

# Reindex the DataFrame to match the expected columns
features_encoded = features_encoded.reindex(columns=expected_columns, fill_value=0)
print("\nReindexed Encoded Features (should have", len(expected_columns), "columns):")
print(features_encoded.head())

# -------------------------------
# Step 4: Load the trained Billing-specific model
# -------------------------------
model = joblib.load("billing_model.joblib")

# -------------------------------
# Step 5: Generate churn predictions
# -------------------------------
predictions = model.predict(features_encoded)
df_clients["Predicted_Churn"] = predictions

# Display a few predictions alongside key columns
print("\nPredictions on Client Billing Data:")
print(df_clients[["Billing_ID", "SubscriptionPlan", "Amount", "PaymentStatus", "Predicted_Churn"]].head())

# -------------------------------
# Step 6: Save the DataFrame with predictions to a new CSV file
# -------------------------------
output_file = "billing_data_with_predictions.csv"
df_clients.to_csv(output_file, index=False)
print("\nSaved predictions to", output_file)


Client Billing Data Sample:
                                  Billing_ID     UserID SubscriptionPlan  \
0  BILL-27eba3ee-e298-4348-98ed-9fa8474ff727  USER-8359            Basic   
1  BILL-6682bc84-8806-42b5-ab80-df08b7f5b5ca  USER-3216       Enterprise   
2  BILL-c3d9c47b-032f-4c7e-8322-6549ee14b4c8  USER-5794       Enterprise   
3  BILL-44884268-8956-44a1-ab2a-37fc9277f0c9  USER-6822       Enterprise   
4  BILL-edecccfc-da79-423f-b17c-f8952bbd559f  USER-1518       Enterprise   

   Amount Currency             TransactionDate  PaymentMethod PaymentStatus  
0   30.39      USD  2024-12-28T20:41:39.615550  Wire Transfer       Pending  
1  145.33      USD  2024-11-28T00:04:24.702024         PayPal        Failed  
2  491.75      USD  2024-08-22T14:09:02.394898  Wire Transfer        Failed  
3  185.83      USD  2024-03-22T16:31:45.238658    Credit Card       Pending  
4  183.16      USD  2025-01-06T13:06:24.559926  Wire Transfer     Completed  

Encoded Billing Features Sample:
   Plan_Basic