In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as plt
%matplotlib inline 

In [20]:
df = pd.read_csv('adult 2.csv')

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [22]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [23]:
df['country'] = df['native.country'].replace(' ?',np.nan)
df['workclass'] = df['workclass'].replace(' ?',np.nan)
df['occupation'] = df['occupation'].replace(' ?',np.nan)

df.dropna(how='any',inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 16 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
country           32561 non-null object
dtypes: int64(6), object(10)
memory usage: 4.2+ MB


In [26]:
#convert dependent variable to binomial integer 
salary_map={' <=50K':1,' >50K':0}
df['salary']=df['income'].map(salary_map).astype(int)



ValueError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
sns.countplot(x='salary', data = df) 

In [None]:
sns.countplot(df['sex'],hue=df['salary'])

In [None]:
sns.boxplot(y='age',x='salary',data=df)

In [None]:
df = pd.get_dummies(df, drop_first=True)

In [None]:
df.info()

In [None]:
X = df.drop(['salary'], axis=1)
y = df['salary']

split_size = 0.3

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=split_size,random_state=0)


In [None]:
print(y.value_counts())

In [None]:
print("Train dataset: {0}{1}".format(X_train.shape, y_train.shape))
print("Test dataset: {0}{1}".format(X_test.shape, y_test.shape))

In [None]:
#create an instance and fit the model 
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

In [None]:
#predictions
prediction = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print('-'*40)
print('Accuracy Score:')
print(accuracy_score(y_test, prediction))

print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, prediction))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, prediction))

#what's happening here? 

In [None]:
#smote
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
print(pd.Series(y_train_resampled).value_counts())

In [None]:
logmodel = LogisticRegression(C=100, solver='liblinear')
logmodel.fit(X_train_resampled, y_train_resampled)

In [None]:
prediction = logmodel.predict(X_test)

In [None]:
print('-'*40)
print('Accuracy Score:')
print(accuracy_score(y_test, prediction))

print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, prediction))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, prediction))

## Regularization (Ridge and Lasso)

Ridge and Lasso regularizations are also known as ‘shrinkage’ methods, because they reduce or shrink the coefficients in the resulting regression. This reduces the variance in the model: as input variables are changed, the model’s prediction changes less than it would have without the regularization. Why would you want to reduce the variance of a model? To avoid overfit.

Regularization is extremely important in logistic regression modeling. Without regularization, the asymptotic nature of logistic regression would keep driving loss towards 0 in high dimensions.

* We must standardize before performing regularization. Why? 

**Advantages of L1 and L2 regularization:**
- L2
    - when you have features with high multicollinearity
    - reduce model complexity
    - features with large coefficients
    
- L1 
    - when you have a lot of small coefficients 
    - when you have LOTS of features 

**LogisticRegression has several optional parameters that define the behavior of the model and approach:**

**penalty**- is a string ('l2' by default) that decides whether there is regularization and which approach to use. Other options are 'l1', 'elasticnet', and 'none'.

**dual**- is a Boolean (False by default) that decides whether to use primal (when False) or dual formulation (when True).

**tol**- is a floating-point number (0.0001 by default) that defines the tolerance for stopping the procedure.

**C**- is a positive floating-point number (1.0 by default) that defines the relative strength of regularization. Smaller values indicate stronger regularization.

**fit_intercept**- is a Boolean (True by default) that decides whether to calculate the intercept 𝑏₀ (when True) or consider it equal to zero (when False).

**intercept_scaling**- is a floating-point number (1.0 by default) that defines the scaling of the intercept 𝑏₀.

**class_weight**- is a dictionary, 'balanced', or None (default) that defines the weights related to each class. When None, all classes have the weight one.

**random_state**- is an integer, an instance of numpy.RandomState, or None (default) that defines what pseudo-random number generator to use.

**solver**- is a string ('liblinear' by default) that decides what solver to use for fitting the model. Other options are 'newton-cg', 'lbfgs', 'sag', and 'saga'.

**max_iter**- is an integer (100 by default) that defines the maximum number of iterations by the solver during model fitting.

**multi_class**- is a string ('ovr' by default) that decides the approach to use for handling multiple classes. Other options are 'multinomial' and 'auto'.

**verbose**- is a non-negative integer (0 by default) that defines the verbosity for the 'liblinear' and 'lbfgs' solvers.

**warm_start**- is a Boolean (False by default) that decides whether to reuse the previously obtained solution.

**n_jobs**- is an integer or None (default) that defines the number of parallel processes to use. None usually means to use one core, while -1 means to use all available cores.

**l1_ratio**- is either a floating-point number between zero and one or None (default). It defines the relative importance of the L1 part in the elastic-net regularization.

**You should carefully match the solver and regularization method for several reasons:**

'liblinear' solver doesn’t work without regularization.
'newton-cg', 'sag', 'saga', and 'lbfgs' don’t support L1 regularization.
'saga' is the only solver that supports elastic-net regularization.

In [None]:
C = [100, 10, 1, .1, .001]
for c in C:
    logmodel = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    logmodel.fit(X_train_resampled, y_train_resampled)
    print('C:', c)
    print('Training accuracy:', logmodel.score(X_train_resampled, y_train_resampled))
    print('Test accuracy:', logmodel.score(X_test, y_test))
    print('')

### Comparing Logistic Regression with Other Models

Advantages of logistic regression:

- Highly interpretable (if you remember how)
- Model training and prediction are fast
- Not many parameters to tune
- Can perform well with a small number of observations
- Outputs well-calibrated predicted probabilities

Disadvantages of logistic regression:

- Presumes a linear relationship between the features and the log-odds of the response
- Performance is (generally) not competitive with the best supervised learning methods
- Can't automatically learn feature interactions