In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Source: a Google Sheets document exported as CSV.
url = "https://docs.google.com/spreadsheets/d/1xLht2E9dicVRjXJpHwhLNXbfof_srub4SopmuPiXZWQ/export?format=csv"

# Raw column name -> human-friendly label used throughout the analysis.
# Columns that are not listed here keep their original names.
friendly_names = {
    "age": "Age",
    "workclass": "Work Class",
    "education": "Education",
    "marital-status": "Marital Status",
    "occupation": "Occupation",
    "relationship": "Relationship",
    "race": "Race",
    "sex": "Gender",
    "hours-per-week": "Hours per Week",
    "native-country": "Native Country",
    "income": "Income",
}

# Download the sheet and apply the friendly labels in one step.
df = pd.read_csv(url).rename(columns=friendly_names)

# Quick overview of the dataset: dimensions first, then one summary per
# heading (column dtypes, descriptive statistics for every column, and the
# number of missing values per column).
print("Data Shape:", df.shape)

overview = (
    ("\nData Types:\n", df.dtypes),
    ("\nData Description:\n", df.describe(include="all")),
    ("\nMissing Data:\n", df.isna().sum()),
)
for heading, summary in overview:
    print(heading, summary)

# Check for outliers
# A value is flagged when its absolute z-score (computed per column by
# StandardScaler) exceeds 3.
numeric_cols = ["Age", "Hours per Week"]
numeric_df = df[numeric_cols]
z_scores = np.abs(StandardScaler().fit_transform(numeric_df))

# np.where on the 2-D mask yields a (row positions, column positions) tuple.
outliers = np.where(z_scores > 3)

# Only the row positions are used for selection: passing the whole tuple to
# .iloc would be read as (row indexer, column indexer) and return a
# cross-product with duplicated rows and columns rather than the outlier rows.
# np.unique collapses rows flagged in more than one column and keeps them in
# their original order.
outlier_rows = np.unique(outliers[0])
print("\nOutliers:\n", numeric_df.iloc[outlier_rows])

# NOTE(review): none of the three plotting calls below passes `ax=` or opens a
# new figure, so when they run together they all target the current Axes —
# consider giving each plot its own figure.

# Explore the relationship between variables
# Correlation is only defined for numeric columns. The frame also contains
# text columns (e.g. "Gender" is compared against "Male" later on), and
# DataFrame.corr() raises on those with pandas >= 2.0, so restrict the input
# to numeric dtypes explicitly.
numeric_features = df.select_dtypes(include="number")
sns.heatmap(numeric_features.corr(), annot=True)

# Explore the income distribution by gender
sns.histplot(data=df, x="Gender", hue="Income", multiple="stack")

# Explore the hours per week distribution by education level
sns.boxplot(data=df, x="Education", y="Hours per Week")

# Prepare the data for modeling
# Encode the two binary columns as 0/1. Surrounding whitespace is stripped
# before comparing so that a label such as " >50K" is still recognised instead
# of being silently mapped to 0. Anything other than the positive label
# (including missing values) still maps to 0.
df["Gender"] = df["Gender"].astype(str).str.strip().eq("Male").astype(int)
df["Income"] = df["Income"].astype(str).str.strip().eq(">50K").astype(int)

X = df.drop("Income", axis=1)
y = df["Income"]

# One-hot encode the remaining categorical columns. dtype=float keeps the
# dummy columns numeric (instead of bool) so they can be scaled below.
X_encoded = pd.get_dummies(X, drop_first=True, dtype=float)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Standardise the features so the solver is not dominated by columns with a
# large numeric range. The scaler is fitted on the training split only, so no
# information from the test split leaks into training.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Train a logistic regression model
# max_iter is raised from the default of 100 to give the solver room to
# converge on the wide one-hot-encoded feature matrix.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluate the model performance
print("\nClassification Report:\n", classification_report(y_test, y_pred))
