In [1]:
import sqlite3
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import ttest_ind, chi2_contingency
import numpy as np

# Open a connection to the SQLite database file that holds the source tables.
# The path is relative, so it resolves against the kernel's working directory.
DB_PATH = 'mydatabase.db'
conn = sqlite3.connect(DB_PATH)

In [2]:
# Load data from SQLite database into DataFrame
# Every column of main_table is selected, plus `poutcome` from p_outcome_table,
# for rows whose `id` appears in both tables (inner join).
query = """
SELECT m.*, p.poutcome
FROM main_table m
JOIN p_outcome_table p ON m.id = p.id
"""
# Close the connection even if the query raises (missing table, malformed SQL,
# ...); otherwise the handle would stay open for the lifetime of the kernel.
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [None]:
# Statistical Analysis: Hypothesis Testing
# T-test on 'balance' by 'poutcome'
# Compares the 'balance' values of rows labelled 'success' against rows
# labelled 'failure' in the 'poutcome' column.
outcome_groups = df[df['poutcome'].notna()].groupby('poutcome')['balance']

# Having "more than one group" is not enough: the two specific labels compared
# below must both exist, otherwise get_group() raises KeyError.
required_outcomes = ('success', 'failure')
missing_outcomes = [label for label in required_outcomes if label not in outcome_groups.groups]

if not missing_outcomes:
    # nan_policy='omit' keeps missing balances from turning the whole result into NaN.
    t_stat, p_val = ttest_ind(
        outcome_groups.get_group('success'),
        outcome_groups.get_group('failure'),
        nan_policy='omit',
    )
    print(f"T-test between success and failure balance: T-stat={t_stat}, P-value={p_val}")
else:
    # Report why the test was skipped instead of silently producing no output.
    print(f"T-test skipped: no rows with poutcome in {missing_outcomes}")

In [4]:
# Machine Learning Integration: Classification
# Preprocessing
# Name of the target column after one-hot encoding. Assumes the raw frame has a
# 'y' column whose non-baseline category is 'yes' -- TODO confirm against main_table.
TARGET_COLUMN = 'y_yes'

# Encode into a NEW frame rather than rebinding `df`: the raw frame (including
# the original 'poutcome' column) stays available, so this cell and the cells
# that read the raw data can be re-run in any order.
# 'id' is the join key used when loading the data, not a predictive feature, so
# it is removed before encoding (errors='ignore' tolerates a frame without it).
df_encoded = pd.get_dummies(df.drop(columns=['id'], errors='ignore'), drop_first=True)

# Fail with an explicit message if the expected target column was not produced.
if TARGET_COLUMN not in df_encoded.columns:
    raise KeyError(
        f"Expected target column '{TARGET_COLUMN}' after encoding; "
        f"available columns: {list(df_encoded.columns)}"
    )

X = df_encoded.drop(columns=[TARGET_COLUMN])
y = df_encoded[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Standard Scaling
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic Regression
# Train on the scaled training features and predict on the scaled test features.
model_lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
predictions_lr = model_lr.predict(X_test_scaled)

# Header line followed by the per-class report on the test split.
report_lr = classification_report(y_test, predictions_lr)
print("Logistic Regression Classification Report:", report_lr, sep="\n")

In [None]:
# Decision Tree
# This model is fitted on the unscaled feature splits (X_train / X_test).
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
predictions_dt = model_dt.predict(X_test)

# Header line followed by the per-class report on the test split.
report_dt = classification_report(y_test, predictions_dt)
print("Decision Tree Classification Report:", report_dt, sep="\n")

In [None]:
# Gradient Boosting
# Train on the scaled training features and predict on the scaled test features.
model_gb = GradientBoostingClassifier(random_state=42).fit(X_train_scaled, y_train)
predictions_gb = model_gb.predict(X_test_scaled)

# Header line followed by the per-class report on the test split.
report_gb = classification_report(y_test, predictions_gb)
print("Gradient Boosting Classification Report:", report_gb, sep="\n")

In [None]:
# Model Evaluation
# Score each model's test-set predictions against the same held-out labels.
accuracy_lr, accuracy_dt, accuracy_gb = (
    accuracy_score(y_test, model_predictions)
    for model_predictions in (predictions_lr, predictions_dt, predictions_gb)
)

# One combined summary with a line per model.
print(f"Accuracies:\nLogistic Regression: {accuracy_lr}\nDecision Tree: {accuracy_dt}\nGradient Boosting: {accuracy_gb}")

In [None]:
# Insights: Summarize Key Findings
# The summary is derived from the accuracies computed above, so the printed
# text never contains unfilled template placeholders.
accuracy_by_model = {
    "Logistic Regression": accuracy_lr,
    "Decision Tree": accuracy_dt,
    "Gradient Boosting": accuracy_gb,
}
best_model = max(accuracy_by_model, key=accuracy_by_model.get)
worst_model = min(accuracy_by_model, key=accuracy_by_model.get)

print("Key Insights:")
print(f"The Logistic Regression model performed with an accuracy of {accuracy_lr:.2f}.")
# Differences are reported relative to Logistic Regression; '+' means higher accuracy.
print(f"Decision trees had an accuracy of {accuracy_dt:.2f}, a difference of {accuracy_dt - accuracy_lr:+.2f} relative to Logistic Regression.")
print(f"Gradient Boosting showed an accuracy of {accuracy_gb:.2f}, a difference of {accuracy_gb - accuracy_lr:+.2f} relative to Logistic Regression.")
print(
    f"Highest test accuracy: {best_model} ({accuracy_by_model[best_model]:.2f}); "
    f"lowest: {worst_model} ({accuracy_by_model[worst_model]:.2f})."
)