In [2]:
import pandas as pd
from math import sqrt
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss


# Silence pandas' PerformanceWarning for the rest of the notebook.
# NOTE(review): presumably needed because one_hot_encoding inserts many columns
# one at a time (DataFrame fragmentation) — confirm before removing.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)



In [3]:
# Read the dataset from 'adult.csv' (path is relative to the current working
# directory) and display the resulting frame.
df = pd.read_csv('./adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


Data cleaning

In [4]:
# The dataset marks unknown entries with a literal '?'; turn those into real
# missing values so pandas' isna()/notna() can see them.
df = df.replace('?', pd.NA)

# Drop both capital columns. errors='ignore' drops whichever of them is still
# present, so the cell stays re-runnable (the previous
# `'capital.gain' and 'capital.loss' in df.columns` only tested 'capital.loss').
df = df.drop(columns=['capital.gain', 'capital.loss'], errors='ignore')

# Keep only labelled rows. The explicit copy makes the column assignment below
# act on an independent frame instead of a view of the previous one.
df = df[df['income'].notna()].copy()

# Encode the target as 0/1 by assigning the column back (the previous chained
# `df['income'].replace(..., inplace=True)` mutates a temporary Series and is
# not guaranteed to reach `df`). Already-encoded values pass through unchanged
# so re-running the cell is harmless; any unexpected label makes astype(int)
# fail loudly instead of leaking into the model.
income_codes = {'<=50K': 0, '>50K': 1}
df['income'] = df['income'].map(lambda label: income_codes.get(label, label)).astype(int)
df


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,40,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,40,United-States,0
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,0
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,1
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,0


Categorical and numerical features

In [5]:
# Columns holding string categories; these are the ones passed to the
# categorical summary and to the one-hot encoder.
categorical_features = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Columns holding numeric measurements.
numerical_features = [
    'age',
    'fnlwgt',
    'education.num',
    'hours.per.week',
]

In [6]:
def get_categorical_column_info(df, categorical_features):
    """Summarise the given columns of ``df``.

    Args:
        df: DataFrame containing every column named in ``categorical_features``.
        categorical_features: List of column names to inspect.

    Returns:
        A DataFrame with one row per inspected column and three columns:
        'column name', 'unique values' (``nunique()`` of the column) and
        'number of nan values' (count of ``isna()`` entries).
    """
    summary = {
        'column name': categorical_features,
        'unique values': [df[name].nunique() for name in categorical_features],
        'number of nan values': [df[name].isna().sum() for name in categorical_features],
    }
    return pd.DataFrame(summary)

# Show the distinct-value count and missing-value count of each categorical column.
get_categorical_column_info(df, categorical_features)

Unnamed: 0,column name,unique values,number of nan values
0,workclass,8,1836
1,education,16,0
2,marital.status,7,0
3,occupation,14,1843
4,relationship,6,0
5,race,5,0
6,sex,2,0
7,native.country,41,583


In [7]:
# Summary statistics of the numeric columns, displayed as the cell output.
df[numerical_features].describe()

Unnamed: 0,age,fnlwgt,education.num,hours.per.week
count,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,40.437456
std,13.640433,105550.0,2.57272,12.347429
min,17.0,12285.0,1.0,1.0
25%,28.0,117827.0,9.0,40.0
50%,37.0,178356.0,10.0,40.0
75%,48.0,237051.0,12.0,45.0
max,90.0,1484705.0,16.0,99.0


In [8]:
def one_hot_encoding(df, categorical_columns):
    """One-hot encode the given columns of ``df`` in place.

    For every distinct value of each listed column a new 0/1 integer column
    named ``"<column>_<value>"`` is appended to ``df``; missing values get the
    indicator column ``"<column>_nan"``. The source column is dropped once its
    indicators have been added.

    Args:
        df: DataFrame to encode. It is modified in place.
        categorical_columns: Iterable of column names to encode.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in categorical_columns:
        for val in df[col].unique():
            if pd.isna(val):
                # Recognise every missing marker (pd.NA, None, NaN) instead of
                # comparing against the text '<NA>', which only matches pd.NA.
                df[f"{col}_nan"] = df[col].isna().astype(int)
            else:
                df[f"{col}_{val}"] = df[col].isin([val]).astype(int)
        # The raw column is redundant once all of its indicators exist.
        df.drop(columns=col, inplace=True)

    return df


In [9]:
# Encode the categorical columns. one_hot_encoding modifies `df` in place, so
# the return value is only used for display here.
one_hot_encoding(df, categorical_features)

Unnamed: 0,age,fnlwgt,education.num,hours.per.week,income,workclass_nan,workclass_Private,workclass_State-gov,workclass_Federal-gov,workclass_Self-emp-not-inc,...,native.country_Guatemala,native.country_Jamaica,native.country_Ecuador,native.country_France,native.country_Yugoslavia,native.country_Scotland,native.country_Portugal,native.country_Laos,native.country_Thailand,native.country_Outlying-US(Guam-USVI-etc)
0,90,77053,9,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,82,132870,9,18,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,66,186061,10,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,54,140359,4,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41,264663,10,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,310152,10,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32557,27,257302,12,38,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32558,40,154374,9,40,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32559,58,151910,9,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Z-score normalization

Info from https://www.geeksforgeeks.org/data-normalization-in-data-mining/

In [10]:
def normalize(obj):
    """Z-score normalise a one-dimensional sequence of numbers.

    Each value is transformed to ``(x - mean) / std`` where ``std`` is the
    population standard deviation (divisor ``len(obj)``).

    Args:
        obj: Sized sequence of numbers (list, pandas Series, numpy array, ...).

    Returns:
        A list of floats of the same length as ``obj``. An empty input yields
        an empty list; an input whose values are all equal yields all zeros,
        since there is no spread to scale by.
    """
    values = np.asarray(obj, dtype=float)
    if values.size == 0:
        return []
    # Constant input has zero deviation; avoid dividing by zero.
    if values.min() == values.max():
        return [0.0] * values.size
    # Centre first and then measure the spread. The E[x^2] - E[x]^2 shortcut
    # loses precision for values with a large offset and can even go negative.
    centered = values - values.mean()
    return (centered / values.std()).tolist()

Mean scaling (values are centered on 0 and fall within [-1, 1])

Info from https://wellsr.com/python/data-scaling-and-normalization-with-python/

In [11]:
def mean_scale(obj):
    """Mean-normalise a one-dimensional sequence of numbers.

    Each value is transformed to ``(x - mean) / (max - min)``, so the result is
    centred on 0 and every value lies within [-1, 1].

    Args:
        obj: Sized sequence of numbers (list, pandas Series, numpy array, ...).

    Returns:
        A list of floats of the same length as ``obj``. An empty input yields
        an empty list; an input whose values are all equal yields all zeros,
        since there is no spread to scale by.
    """
    values = np.asarray(obj, dtype=float)
    if values.size == 0:
        return []
    # Named `spread` rather than `range` so the builtin is not shadowed.
    spread = values.max() - values.min()
    if spread == 0:
        return [0.0] * values.size
    return ((values - values.mean()) / spread).tolist()

In [12]:
# Three of the numeric columns are rescaled with the z-score helper ...
for zscore_col in ('age', 'education.num', 'hours.per.week'):
    df[zscore_col] = normalize(df[zscore_col])

# ... while 'fnlwgt' goes through mean_scale instead.
df['fnlwgt'] = mean_scale(df['fnlwgt'])
df

Unnamed: 0,age,fnlwgt,education.num,hours.per.week,income,workclass_nan,workclass_Private,workclass_State-gov,workclass_Federal-gov,workclass_Self-emp-not-inc,...,native.country_Guatemala,native.country_Jamaica,native.country_Ecuador,native.country_France,native.country_Yugoslavia,native.country_Scotland,native.country_Portugal,native.country_Laos,native.country_Thailand,native.country_Outlying-US(Guam-USVI-etc)
0,3.769612,-0.076558,-0.420060,-0.035429,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.183112,-0.038650,-0.420060,-1.817204,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.010110,-0.002525,-0.031360,-0.035429,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.130359,-0.033563,-2.363558,-0.035429,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.177296,0.050858,-0.031360,-0.035429,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-1.215643,0.081752,-0.031360,-0.035429,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32557,-0.849080,0.045859,0.746039,-0.197409,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32558,0.103983,-0.024045,-0.420060,-0.035429,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32559,1.423610,-0.025718,-0.420060,-0.035429,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Target is the 'income' column; every remaining column is used as a feature.
y = df['income']
X = df.drop(columns='income')

# 70/30 split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

classificator = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
classificator.fit(X_train, y_train)
y_pred = classificator.predict(X_test)

# Score the held-out predictions; zero_division=0 reports 0 instead of warning
# when a metric's denominator is empty.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
F1_score = f1_score(y_test, y_pred, zero_division=0)

report_template = "Accuracy: {:.2f} \nPrecision: {:.2f}  \nRecall: {:.2f}  \nF1-score: {:.2f}"
print(report_template.format(accuracy, precision, recall, F1_score))

Accuracy: 0.83 
Precision: 0.70  
Recall: 0.54  
F1-score: 0.61


In [43]:
from sklearn import datasets

# Switch to the iris dataset: X and y are rebound to iris.data and iris.target,
# replacing the X and y built from the income frame.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [44]:
def softmax(logits):
    """Row-wise softmax of a 2-D collection of logits.

    The maximum of each row is subtracted before exponentiating so that large
    logits do not overflow; this shift does not change the result.

    Args:
        logits: 2-D array-like with one row of scores per sample.

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    row_max = np.max(logits, axis=1, keepdims=True)
    unnormalized = np.exp(np.subtract(logits, row_max))
    return unnormalized / unnormalized.sum(axis=1, keepdims=True)

In [45]:

# 80/20 split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

classificator = LogisticRegression(max_iter=1000, random_state=42)
classificator.fit(X_train, y_train)

# Convert the raw decision scores to per-class probabilities with the
# hand-written softmax, then take the index of the largest probability in each
# row as the predicted label.
y_probs = softmax(classificator.decision_function(X_test))
y_pred = y_probs.argmax(axis=1)

# Weighted averaging combines the per-class scores in proportion to class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
F1_score = f1_score(y_test, y_pred, average='weighted', zero_division=0)

report_template = "Accuracy: {:.2f} \nPrecision: {:.2f}  \nRecall: {:.2f}  \nF1-score: {:.2f}"
print(report_template.format(accuracy, precision, recall, F1_score))


Accuracy: 1.00 
Precision: 1.00  
Recall: 1.00  
F1-score: 1.00


In [33]:
# Print the predicted labels for a side-by-side check against y_test.
# NOTE(review): this cell's execution count (33) is lower than that of the cell
# computing y_pred (45), so the saved output may be stale — re-run to refresh.
print(y_pred)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [34]:
# Print the true test labels to compare with the predictions printed above.
print(y_test)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
