In [None]:
import matplotlib.pyplot as plt
import numpy as np
import warnings
import pandas as pd
from sklearn.datasets import load_wine
warnings.filterwarnings('ignore')
%matplotlib inline

#### Домашнее задание № 1
Задача прогнозирования возможного дохода
1. Проверьте данные на пропуски
2. Обучите логистическую регрессию
3. Обучите метод опорных векторов
4. Сравните точность двух моделей
5. Напишите выводы и интерпретируйте

In [None]:
# Load the adult income CSV and preview its first five rows (head() defaults to 5).
df = pd.read_csv('adult.csv')
df.head()

In [None]:
### YOUR CODE
# ================ IMPORTS ================
import matplotlib.pyplot as plt
import numpy as np
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

warnings.filterwarnings('ignore')
%matplotlib inline

# ================ LOAD DATA ================
# Read the raw CSV and show its size plus a short preview.
print("=== Loading Dataset ===")
df = pd.read_csv('adult.csv')
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())

# ================ STEP 1: CHECK FOR MISSING VALUES ================
banner = "=" * 60
print("\n" + banner)
print("STEP 1: Check for missing values")
print(banner)

# Per-column count of real NaN entries
nan_counts = df.isnull().sum()
print("\nMissing values (NaN):")
print(nan_counts)

# This dataset marks unknown entries with a literal '?', which isnull() does
# not report, so count those separately in every object-dtype column.
print("\nChecking for '?' values:")
placeholder_counts = {
    column: (df[column] == '?').sum()
    for column in df.columns
    if df[column].dtype == 'object'
}
for column, missing_count in placeholder_counts.items():
    if missing_count > 0:
        print(f"{column}: {missing_count} missing values ({missing_count/len(df)*100:.2f}%)")

# ================ DATA PREPROCESSING ================
# Turns the raw frame `df` into an encoded feature matrix `X` and a 0/1 target
# array `y`, and records which columns were originally categorical / numerical.
print("\n" + "="*60)
print("Data Preprocessing")
print("="*60)

# Replace '?' with NaN (rebinding rather than inplace=True, so re-running the
# cell never depends on a frame that was mutated behind the reader's back)
df = df.replace('?', pd.NA)

# Check missing values after replacement
missing_summary = df.isna().sum()
print("\nMissing values after replacing '?':")
print(missing_summary[missing_summary > 0])

# Drop rows with missing values
initial_size = len(df)
df = df.dropna()
print(f"\nDropped {initial_size - len(df)} rows with missing values")
print(f"New dataset shape: {df.shape}")

# Separate features and target
X = df.drop('income', axis=1)
y = df['income']

# Identify column types BEFORE encoding, while text columns are still object
# dtype. 'number' selects every numeric dtype (int32, int64, float64, ...), so
# a numeric column is never silently left out of the scaling step.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include='number').columns

print(f"\nCategorical columns: {list(categorical_cols)}")
print(f"Numerical columns: {list(numerical_cols)}")

# Encode categorical variables
# NOTE(review): LabelEncoder maps each category to an integer code, which the
# linear models trained later read as an ordered quantity; one-hot encoding may
# suit them better - confirm before relying on coefficient magnitudes.
print("\nEncoding categorical variables...")
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # kept so codes can be mapped back to labels

# Encode target variable; the encoder is kept so the printed legend reflects
# the labels actually present in the data instead of a hard-coded assumption.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)
print("Target variable encoding:")
print(", ".join(f"{code} = {label}" for code, label in enumerate(target_encoder.classes_)))

# ================ TRAIN-TEST SPLIT ================
# Hold out 30% of the rows for testing; stratify=y is passed so the split is
# stratified on the target, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"\nTraining set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Scale numerical features
print("\nScaling numerical features...")
# Fit the scaler on the training rows only, then apply the same transform to
# both splits so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train[numerical_cols])

X_train_scaled = X_train.copy()
X_train_scaled[numerical_cols] = scaler.transform(X_train[numerical_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

# ================ STEP 2: LOGISTIC REGRESSION ================
print("\n" + "="*60)
print("STEP 2: Train Logistic Regression")
print("="*60)

# Fit the classifier on the scaled training split (fit() returns the estimator)
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Score it on the held-out test split
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)

print(f"\nLogistic Regression Accuracy: {lr_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=['<=50K', '>50K']))

# Confusion matrix heatmap: rows are labelled as true classes, columns as predictions
cm_lr = confusion_matrix(y_test, y_pred_lr)
fig_lr, ax_lr = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=['<=50K', '>50K'],
            yticklabels=['<=50K', '>50K'],
            ax=ax_lr)
ax_lr.set_title('Confusion Matrix - Logistic Regression')
ax_lr.set_ylabel('True Label')
ax_lr.set_xlabel('Predicted Label')
fig_lr.tight_layout()
plt.show()

# Rank features by the absolute value of their coefficient (first row of coef_)
if len(lr_model.coef_) > 0:
    feature_importance = (
        pd.DataFrame({
            'feature': X.columns,
            'importance': np.abs(lr_model.coef_[0]),
        })
        .sort_values('importance', ascending=False)
    )

    print("\nTop 10 most important features (Logistic Regression):")
    print(feature_importance.head(10))

# ================ STEP 3: SUPPORT VECTOR MACHINE ================
# Trains a linear-kernel SVM on the scaled training split, reports its test
# accuracy and classification report, and plots its confusion matrix.
print("\n" + "="*60)
print("STEP 3: Train Support Vector Machine")
print("="*60)

# Train SVM (using linear kernel for speed and interpretability).
# probability=True was dropped: predict_proba is never called in this notebook,
# and enabling it makes fit() run an additional internal 5-fold
# cross-validation. predict() does not depend on it, so results are unchanged.
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predictions and evaluation
y_pred_svm = svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print(f"\nSVM Accuracy: {svm_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm, target_names=['<=50K', '>50K']))

# Confusion Matrix for SVM
plt.figure(figsize=(8, 6))
cm_svm = confusion_matrix(y_test, y_pred_svm)
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Reds',
            xticklabels=['<=50K', '>50K'],
            yticklabels=['<=50K', '>50K'])
plt.title('Confusion Matrix - SVM')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# ================ STEP 4: COMPARE MODELS ================
print("\n" + "="*60)
print("STEP 4: Compare Model Accuracy")
print("="*60)

print(f"\nLogistic Regression Accuracy: {lr_accuracy:.4f}")
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print(f"Difference: {abs(lr_accuracy - svm_accuracy):.4f}")

# Bar chart of the two accuracies
models = ['Logistic Regression', 'SVM']
accuracies = [lr_accuracy, svm_accuracy]

fig_cmp, ax_cmp = plt.subplots(figsize=(10, 6))
bars = ax_cmp.bar(models, accuracies, color=['blue', 'red'], alpha=0.7)
# Zoom the y-axis to a band around the scores so a small gap stays visible
ax_cmp.set_ylim([min(accuracies) - 0.05, max(accuracies) + 0.05])
ax_cmp.set_ylabel('Accuracy')
ax_cmp.set_title('Model Comparison: Accuracy Scores')
ax_cmp.grid(axis='y', linestyle='--', alpha=0.7)

# Write each score just above its bar
for bar, acc in zip(bars, accuracies):
    ax_cmp.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                f'{acc:.4f}', ha='center', va='bottom', fontweight='bold')

fig_cmp.tight_layout()
plt.show()

# Per-class precision / recall / F1 of both models, side by side
print("\nDetailed Performance Comparison:")
lr_report = classification_report(y_test, y_pred_lr, output_dict=True, target_names=['<=50K', '>50K'])
svm_report = classification_report(y_test, y_pred_svm, output_dict=True, target_names=['<=50K', '>50K'])

comparison_data = [
    {
        'Class': class_name,
        'LR_Precision': lr_report[class_name]['precision'],
        'SVM_Precision': svm_report[class_name]['precision'],
        'LR_Recall': lr_report[class_name]['recall'],
        'SVM_Recall': svm_report[class_name]['recall'],
        'LR_F1': lr_report[class_name]['f1-score'],
        'SVM_F1': svm_report[class_name]['f1-score'],
    }
    for class_name in ['<=50K', '>50K']
]

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

# ================ STEP 5: CONCLUSIONS ================
# Prints the written conclusions. Static paragraphs are emitted line by line via
# print(*lines, sep="\n"); the two accuracy-dependent paragraphs pick their
# lines from lr_accuracy / svm_accuracy.
print("\n" + "="*60)
print("STEP 5: Conclusions and Interpretation")
print("="*60)

print(
    "\n1. DATA QUALITY AND PREPROCESSING:",
    "   - Original dataset had missing values represented by '?'",
    "   - Removed rows with missing values to ensure data quality",
    "   - Encoded categorical variables using Label Encoding",
    "   - Scaled numerical features for model consistency",
    sep="\n",
)

print("\n2. MODEL PERFORMANCE SUMMARY:")
lr_score_line = f"   - LR Accuracy: {lr_accuracy:.4f}"
svm_score_line = f"   - SVM Accuracy: {svm_accuracy:.4f}"
if lr_accuracy > svm_accuracy:
    summary_lines = (
        f"   ✓ Logistic Regression performed better (by {lr_accuracy - svm_accuracy:.4f})",
        lr_score_line,
        svm_score_line,
    )
elif svm_accuracy > lr_accuracy:
    summary_lines = (
        f"   ✓ SVM performed better (by {svm_accuracy - lr_accuracy:.4f})",
        svm_score_line,
        lr_score_line,
    )
else:
    summary_lines = ("   ✓ Both models performed equally well",)
print(*summary_lines, sep="\n")

print(
    "\n3. MODEL INTERPRETATION:",
    "   LOGISTIC REGRESSION:",
    "   - Provides probability estimates for predictions",
    "   - Feature coefficients show direction and strength of relationships",
    "   - Generally faster training and prediction times",
    "   - Assumes linear relationship between features and log-odds",
    sep="\n",
)

print(
    "\n   SUPPORT VECTOR MACHINE:",
    "   - Creates maximum margin separator",
    "   - Can handle non-linear boundaries with different kernels",
    "   - More robust to outliers",
    "   - Can be computationally intensive on large datasets",
    sep="\n",
)

print(
    "\n4. BUSINESS IMPLICATIONS:",
    "   - Both models can predict income >50K with reasonable accuracy",
    "   - False positives (incorrectly predicting high income):",
    "     * Wasted marketing resources",
    "   - False negatives (missing high-income individuals):",
    "     * Lost revenue opportunities",
    sep="\n",
)

print(
    "\n5. RECOMMENDATIONS FOR IMPROVEMENT:",
    "   ✓ Handle missing values using imputation instead of removal",
    "   ✓ Try one-hot encoding for categorical variables",
    "   ✓ Experiment with other models (Random Forest, Gradient Boosting)",
    "   ✓ Perform hyperparameter tuning for both models",
    "   ✓ Use cross-validation for more reliable performance estimates",
    "   ✓ Address potential class imbalance",
    sep="\n",
)

print("\n6. FINAL RECOMMENDATION:")
# A gap below one accuracy point is treated as a tie
if abs(lr_accuracy - svm_accuracy) < 0.01:
    recommendation_lines = (
        "   Both models perform similarly. Choose based on:",
        "   - Logistic Regression for interpretability and speed",
        "   - SVM for robustness to outliers",
    )
elif lr_accuracy > svm_accuracy:
    recommendation_lines = (
        "   Recommend Logistic Regression due to:",
        "   - Higher accuracy",
        "   - Better interpretability",
        "   - Faster computation",
    )
else:
    recommendation_lines = (
        "   Recommend SVM due to higher accuracy, but consider:",
        "   - Longer training time",
        "   - More complex interpretation",
    )
print(*recommendation_lines, sep="\n")

# ================ BONUS: ADDITIONAL ANALYSIS ================
# Reports the class balance of the encoded target and shows the first ten test
# predictions of both models next to the true labels.
print("\n" + "="*60)
print("BONUS: Additional Analysis")
print("="*60)

# Class distribution: share of each encoded class, ordered by class code
class_dist = pd.Series(y).value_counts(normalize=True).sort_index()
print("\nClass Distribution in Target Variable:")
print(f"  <=50K (Class 0): {class_dist[0]:.2%}")
print(f"  >50K (Class 1): {class_dist[1]:.2%}")

# Check for class imbalance: flag it when the two shares differ by more than
# 20 percentage points
if abs(class_dist[0] - class_dist[1]) > 0.2:
    print("\n⚠️  WARNING: Significant class imbalance detected!")
    print("   Consider using class_weight='balanced' in models")
    print("   or applying SMOTE for better minority class performance")
else:
    print("\n✓ Class distribution is reasonably balanced")

# Sample predictions comparison
print("\nSample Predictions Comparison (first 10 test samples):")
sample_comparison = pd.DataFrame({
    'True_Income': ['<=50K' if val == 0 else '>50K' for val in y_test[:10]],
    'LR_Prediction': ['<=50K' if val == 0 else '>50K' for val in y_pred_lr[:10]],
    'SVM_Prediction': ['<=50K' if val == 0 else '>50K' for val in y_pred_svm[:10]],
    'LR_Correct': y_test[:10] == y_pred_lr[:10],
    'SVM_Correct': y_test[:10] == y_pred_svm[:10]
})
print(sample_comparison)

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
# The stray trailing ':' after this call was a SyntaxError that prevented the
# whole cell from running.
print("="*60)

#### Домашнее задание № 2

1. Из данных исключите объекты класса 2.
2. Отмасштабируйте признаки, используя класс `StandardScaler` с гиперпараметрами по умолчанию. 
3. Обучите логистическую регрессию и оцените важность признаков. 
4. Укажите название признака, который оказался наименее значимым.
5. Напишите выводы.

Обратите внимание, целевое значение лежит по ключу `'target'`, матрица объекты-признаки лежит по ключу `'data'`

In [None]:
# Load the wine dataset via load_wine(); per the task text above, the feature
# matrix is under the 'data' key and the class labels under the 'target' key.
data = load_wine()



In [None]:
### YOUR CODE:

#### Домашнее задание № 3
В этой части мы будем работать с данными UCI Bank Marketing Dataset. Этот датасет содержит информацию о банковском телефонном маркетинге.

Объектом здесь является телефонный звонок потенциальному клиенту с предложением некоторой услуги (утверждается, что это краткосрочный депозит). В качестве признакового описания используются характеристики клиента (образование, брак и т.д.), более подробная информация представлена в файле bank-additional-names.txt. Целевая переменная — ответ клиента (согласился ли он открыть депозит).

1. Закодируйте категориальные признаки 
2. Выберите метрику классификации, которая вам кажется подходящей, и обучите логистическую регрессию
3. Как вы считаете, что для вашего бизнеса важнее — хороший precision или recall модели? Почему?

In [None]:
# Read the semicolon-delimited bank marketing CSV and preview its first rows.
# NOTE(review): this rebinds `df`, which the earlier cells use for the adult
# data - re-run those cells before going back to homework 1.
df = pd.read_csv('bank-additional-full.csv', delimiter=';')
df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
### YOUR CODE: