# Logistic Regression

### Apply logistic regression to predict household income using PyTorch

##### Helper class - logistic_regression.py

In [1]:
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable

import numpy as np
import pandas as pd

In [2]:
# PyTorch Logistic Regression Class
# See logistic_regression.py for a detailed explanation

from logistic_regression import LogisticRegression

### Read data from .csv file

In [3]:
# Load the census records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='census.csv')

In [4]:
# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


**Columns of the dataset**

In [5]:
# Column labels of the loaded frame, as a plain Python list.
data.columns.tolist()

['age',
 'workclass',
 'education_level',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

**Separate labels from data**

In [6]:
# Pull out the target column; the list selector keeps it a one-column DataFrame.
y = data.loc[:, ['income']]

**Drop labels column from data**

In [7]:
# Feature frame: every column except the target.
# errors='ignore' makes this a no-op if 'income' is not present.
x = data.drop(columns=['income'], errors='ignore')

**Categorical columns, i.e. the ones we want to convert to one-hot encoding**

In [8]:
# Categorical features that will be expanded into one-hot indicator columns.
one_hot_encode_columns = [
    'workclass',
    'education_level',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

In [9]:
# One-hot encode the categorical columns listed in one_hot_encode_columns.
# NOTE: this overwrites x, so the cell is meant to be run once per kernel session.
x = pd.get_dummies(data=x, columns=one_hot_encode_columns)

In [10]:
# Binarise the target: 1 where income is '>50K', 0 otherwise.
# A vectorised comparison replaces the row-wise apply(), which made one
# Python-level call per row and reused the name `x` for its lambda argument.
# The result is a 1-D integer Series aligned with the original index.
y = (y['income'] == '>50K').astype(int)

In [11]:
# Shapes of the encoded feature frame and the label series.
(x.shape, y.shape)

((45222, 103), (45222,))

**Get train and test datasets**

In [12]:
# Positional hold-out split: the first `train_size` rows train the model and
# the remainder are used for testing. No shuffling is done, so the split
# depends on the row order of the source file.
train_size = 44000

# Both feature splits use float32. The test features were previously cast to
# int32, which disagreed with the training dtype and would truncate any
# fractional feature values.
x_train, x_test = (np.asarray(x[:train_size], dtype=np.float32),
                   np.asarray(x[train_size:], dtype=np.float32))

# Label dtypes are kept as they were (float32 for training, int32 for testing).
# NOTE(review): presumably LogisticRegression.test() expects integer class
# labels — confirm against logistic_regression.py before changing.
y_train, y_test = (np.asarray(y[:train_size], dtype=np.float32),
                   np.asarray(y[train_size:], dtype=np.int32))

In [13]:
# Shapes of the four split arrays, in the order x_train, y_train, x_test, y_test.
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((44000, 103), (44000,), (1222, 103), (1222,))

## Train and Test model

**The model and its methods are defined in logistic_regression.py**

In [14]:
# Training hyper-parameters handed to LogisticRegression.train() in the next cell.
num_epochs, batch_size, learning_rate = 50, 100, 1e-3

In [15]:
# Fix the RNG seeds so that re-running this cell reproduces the same result.
# NOTE(review): presumably LogisticRegression initialises its weights (and
# possibly orders its batches) randomly — its source is not visible here;
# confirm in logistic_regression.py.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

# Build the model on the training split, fit it with the configured
# hyper-parameters, then evaluate on the held-out split (prints the accuracy).
logistic_model = LogisticRegression(x_train, y_train)
logistic_model.train(batch_size, num_epochs, learning_rate)
logistic_model.test(x_test, y_test)

Accuracy of the model : 78 %
