In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Sample data as we don't have user data currently
data = dict(
    cluster=["A", "B", "A", "C", "B", "C", "A"],
    user_feedback=[5, 4, 2, 3, 1, 5, 4],
    cluster_false=[0, 0, 1, 0, 1, 0, 0],
)
df = pd.DataFrame(data)

# Turn the textual cluster names into integer codes so the model can consume
# them. The encoder is fitted on the "cluster" column and then applied to it,
# which for this sample yields "A" -> 0, "B" -> 1, "C" -> 2.
label_encoder = LabelEncoder()
label_encoder.fit(df["cluster"])
df["cluster_encoded"] = label_encoder.transform(df["cluster"])
# NOTE(review): these codes are passed to a linear model as if they were
# ordered quantities; one-hot encoding may suit nominal clusters better —
# confirm before training on real user data.

# X: feature matrix (encoded cluster label and user feedback score).
# y: target; cluster_false marks whether the cluster label is false.
feature_columns = ["cluster_encoded", "user_feedback"]
X = df[feature_columns]
y = df["cluster_false"]

# Split data into train-test sets.
# stratify=y keeps both classes of cluster_false represented in the train and
# the test split. Without it, a random split of this small, imbalanced sample
# can leave the test set with a single class, which makes the accuracy and the
# per-class report uninformative.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a pipeline for scaling and logistic regression.
# Keeping the scaler inside the pipeline means it is fitted together with the
# model on the training split and re-applied automatically at predict time.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("log_reg", LogisticRegression()),
])

# Train the entire pipeline (scaling + logistic regression) on the training data.
pipeline.fit(X_train, y_train)

# Generate class predictions for the held-out test rows.
y_pred = pipeline.predict(X_test)

# accuracy_score: overall fraction of correct predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))
# classification_report: per-class precision, recall and F1-score.
# zero_division=0 reports 0 (instead of warning) for a class that is never
# predicted, which is likely with so few training rows.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

# Predict probabilities for a sample new input: cluster "B" with a user
# feedback score of 3.
# The cluster is encoded with the already-fitted label_encoder rather than a
# hard-coded integer, so the input stays consistent with the training encoding
# if the set of clusters changes.
sample_input = pd.DataFrame({
    "cluster_encoded": label_encoder.transform(["B"]),
    "user_feedback": [3],
})

# predict_proba returns one row per input and one column per class, with the
# columns ordered as in pipeline.classes_.
probabilities = pipeline.predict_proba(sample_input)

# Look up the column of the positive class (cluster_false = 1) instead of
# assuming it sits at index 1.
positive_index = list(pipeline.classes_).index(1)

# The printed value is the likelihood of the cluster label being false, so the
# message says exactly that.
print(
    "Probability that the cluster label is false:",
    probabilities[0][positive_index],
)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Probability of satisfying answer: 0.3620892124337633
