In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime

# Read the training and testing CSV files in one pass.
# Both paths are environment-specific: update them to wherever the files live.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/ekVGEMhRQDM33DRh.csv',
        '/content/xhMbeI3dNtFgZXNm.csv',
    )
)

# Calculate call duration from CallStart and CallEnd
def calculate_call_duration(df):
    """Return a copy of *df* with the call start/end replaced by a duration.

    Args:
        df: DataFrame with ``CallStart`` and ``CallEnd`` columns holding
            ``HH:MM:SS`` time-of-day strings.

    Returns:
        A new DataFrame without ``CallStart``/``CallEnd`` and with a
        ``CallDuration`` column (seconds, as floats) appended as the last
        column. The input frame is not modified.

    Raises:
        KeyError: If ``CallStart`` or ``CallEnd`` is missing from *df*.
        ValueError: If a value cannot be parsed with ``%H:%M:%S``.
    """
    seconds_per_day = 24 * 60 * 60

    # Parse into local Series instead of assigning back into df, so the
    # caller's frame keeps its original string columns.
    started = pd.to_datetime(df['CallStart'], format='%H:%M:%S')
    ended = pd.to_datetime(df['CallEnd'], format='%H:%M:%S')
    duration = (ended - started).dt.total_seconds()

    # The timestamps carry no date, so a call that runs past midnight would
    # come out negative; wrap those onto the following day.
    duration = duration.mask(duration < 0, duration + seconds_per_day)

    return df.drop(columns=['CallStart', 'CallEnd']).assign(CallDuration=duration)

# Replace the raw call timestamps with a duration feature in both splits
train_data = train_data.pipe(calculate_call_duration)
test_data = test_data.pipe(calculate_call_duration)

# Report the per-column count of missing values for each split
train_missing = train_data.isna().sum()
test_missing = test_data.isna().sum()
print("Missing values in training data:\n", train_missing)
print("Missing values in testing data:\n", test_missing)

# Define features and target for training.
# 'Id' is a row identifier rather than a customer attribute; leaving it in
# would have it scaled and fed to the classifier as if it were a real
# numeric signal, so it is excluded together with the target.
y_train = train_data['CarInsurance']
non_feature_cols = ['CarInsurance', 'Id']
X_train = train_data.drop(columns=non_feature_cols, errors='ignore')

# Features for testing (the target column may be absent from the test file)
X_test = test_data.drop(columns=non_feature_cols, errors='ignore')

# Identify numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Fill missing values.
# The statistics are computed once, from the training split only, and the
# same values are applied to both splits so no test information leaks in.
fill_values = X_train[numeric_cols].mean().to_dict()
categorical_modes = X_train[categorical_cols].mode()
if not categorical_modes.empty:
    # mode() can return several rows on ties; the first row is used.
    # The emptiness guard avoids an IndexError when there are no categorical
    # columns (or none of them has any non-missing value).
    fill_values.update(categorical_modes.iloc[0].to_dict())

X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Categories unseen during fitting are ignored at
# prediction time instead of raising.
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full model: preprocessing followed by a liblinear logistic regression
classifier = LogisticRegression(solver='liblinear')
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Fit the preprocessing and the classifier on the training split
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Capture every prediction up front, so that the label filtering done for
# scoring below can never discard them.
predictions = pd.DataFrame({'Predicted_CarInsurance': y_pred})

# If the test data includes the actual target variable, score against it.
if 'CarInsurance' in test_data.columns:
    y_test = test_data['CarInsurance']

    # Score only the rows that have a known label. The mask is converted to a
    # plain boolean array so it selects y_pred rows by position rather than
    # by index label.
    valid_indices = y_test.notnull()
    y_test = y_test[valid_indices]
    y_pred = y_pred[valid_indices.to_numpy()]

    if y_test.empty:
        # The target column exists but holds no labels, so there is nothing
        # to score. Say so, and still output the predictions instead of
        # silently dropping them.
        print("No valid data to evaluate. y_test or y_pred is empty.")
        print(predictions)
    else:
        # Evaluate the model only on labelled rows
        print("Accuracy on test data:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
else:
    # If the target is not provided in the test data, just output the predictions
    print(predictions)


Missing values in training data:
 Id                     0
Age                    0
Job                   19
Marital                0
Education            169
Default                0
Balance                0
HHInsurance            0
CarLoan                0
Communication        902
LastContactDay         0
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             3042
CarInsurance           0
CallDuration           0
dtype: int64
Missing values in testing data:
 Id                     0
Age                    0
Job                    5
Marital                0
Education             47
Default                0
Balance                0
HHInsurance            0
CarLoan                0
Communication        221
LastContactDay         0
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome              757
CarInsurance        1000
CallDuration           0
dtype: int64
No vali