In [2]:
%matplotlib inline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Classification with Regularization Demo

## Income dataset

> Prediction task is to determine whether a person makes over 50K a year.

Задачата е да се предскаже дали годишният доход на човек е под или над 50 000 долара. [Данни](https://archive.ics.uci.edu/ml/datasets/adult)

### Prepare data

In [14]:
# Load the raw Adult file; it has no header row, so columns are numbered 0..14.
# NOTE(review): string fields keep a leading space (e.g. " <=50K") because the
# file separates values with ", " -- consider skipinitialspace=True if no later
# cell relies on the padded labels.
income_data = pd.read_csv(
    "adult/adult.data",
    header=None,
)
income_data.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [15]:
# Name the 15 positional columns of the header-less file. The last one,
# "income_class", is the label that the models below predict.
# NOTE(review): "education-num" and "marital-status" use hyphens while the
# other names use underscores; left as-is because the dummy-column names
# created later are derived from them.
income_data.columns = [
    "age", "workclass", "final_weight",
    "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week",
    "native_country", "income_class",
]

In [16]:
# Separate the label column from the predictor columns.
income_target = income_data["income_class"]
income_attributes = income_data.drop(columns=["income_class"])

In [17]:
# Sanity check: both objects must have the same number of rows.
(income_attributes.shape, income_target.shape)

((32561, 14), (32561,))

In [18]:
# One-hot encode the categorical columns. drop_first=True drops one indicator
# per category so the remaining indicators are not perfectly collinear.
income_attributes = pd.get_dummies(
    data=income_attributes,
    drop_first=True,
)
income_attributes.shape

(32561, 100)

In [19]:
# Confirm that only numeric dtypes remain after the one-hot encoding.
column_dtypes = income_attributes.dtypes
column_dtypes.unique()

array([dtype('int64'), dtype('uint8')], dtype=object)

In [20]:
# Rescale every feature with MinMaxScaler so all columns share a common range.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-set minima/maxima influence the scaling (data leakage).
# Fitting on the training rows only would require reordering these cells.
scaler = MinMaxScaler().fit(income_attributes)
income_attributes = scaler.transform(income_attributes)
income_attributes.shape

(32561, 100)

### Model data

In [25]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and every number derived from it, identical after Restart & Run All.
(
    income_attributes_train,
    income_attributes_test,
    income_target_train,
    income_target_test,
) = train_test_split(
    income_attributes,
    income_target,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)


# Print the shape of each part of the split, one per line.
split_parts = (
    income_attributes_train,
    income_attributes_test,
    income_target_train,
    income_target_test,
)
print(*(part.shape for part in split_parts), sep="\n")

(26048, 100)
(6513, 100)
(26048,)
(6513,)


#### Effect of BIG regularization

In [42]:
# Sweep C (the inverse of the regularization strength) over nine values from
# 1e-6 (very strong penalty) to 1e2 (very weak penalty). For every fit, record
# one coefficient and the set of classes the model predicts on the training set.
Cs = np.logspace(-6, 2, 9)
coef_n = 0  # index of the coefficient (feature column) that is tracked

# With the earlier cap of 500 iterations the solver reported
# "TOTAL NO. of ITERATIONS REACHED LIMIT" for at least one fit, so the recorded
# weight came from a non-converged model. A higher cap gives the optimizer room
# to converge; if the warning still appears, raise it further.
max_iter = 5000

result = []
for C in Cs:
    model = LogisticRegression(C=C, max_iter=max_iter)
    model.fit(income_attributes_train, income_target_train)

    # coef_ has one row per decision function; row 0 is the only row for a
    # binary problem such as this one.
    weight = model.coef_[0][coef_n]

    # Collapse the predictions to the distinct labels to see whether the model
    # ever predicts more than one class.
    predicted_targets = set(model.predict(income_attributes_train))

    result.append((C, weight, predicted_targets))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
# Tabulate the sweep: one row per value of C.
columns = ["C", "weight", "predicted_targets"]
regularization_summary = pd.DataFrame(result, columns=columns)
regularization_summary

Unnamed: 0,C,weight,predicted_targets
0,1e-06,0.000484,{ <=50K}
1,1e-05,0.004748,{ <=50K}
2,0.0001,0.040077,{ <=50K}
3,0.001,0.199999,"{ <=50K, >50K}"
4,0.01,0.735861,"{ <=50K, >50K}"
5,0.1,1.59177,"{ <=50K, >50K}"
6,1.0,1.835464,"{ <=50K, >50K}"
7,10.0,1.862918,"{ <=50K, >50K}"
8,100.0,1.864926,"{ <=50K, >50K}"


При силна регуляризация (т.е. малка стойност на `C`):
1) тегловният коефициент w на практика изчезва – клони към 0.
2) моделът предсказва САМО един клас – мнозинствения (majority) клас "<=50K", т.е. този с повече записи. Виж долу.

In [44]:
# Class balance of the training labels: shows which class is the majority.
train_class_counts = income_target_train.value_counts()
train_class_counts

 <=50K    19805
 >50K      6243
Name: income_class, dtype: int64