# <span style='color:black'> Logistic Regression, SVC on Census Income Dataset </span>

# <span style='color:red'> 1.0 Importing required libraries </span>

In [1]:
### Pandas and Numpy
import pandas as pd
import numpy as np

### MongoDB Library
import pymongo
from pymongo import MongoClient

### To ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Machine Learning libraries
import sklearn
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

# <span style='color:red'> 2.0 Retrieving data from MongoDB </span>

In [2]:
### Retriving data from Mongodb
### creating connection with MongoDB

client=MongoClient('Localhost',27017)

In [3]:
db=client['census-income']
collection=db['census_income_data']

In [4]:
### Locating our collection and data in MongoDb using find() method

data_from_mongodb = collection.find()

In [5]:
### converting data from MongoDb to Dataframe in pandas

data_mongodb= pd.DataFrame(data_from_mongodb)

In [6]:
data_mongodb.head()

Unnamed: 0,_id,index,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,643d310edc31495c1fcb7f96,0,39,other,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,0
1,643d310edc31495c1fcb7f97,1,50,other,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,0
2,643d310edc31495c1fcb7f98,2,38,Private,215646,HS_grad,9,Divorced,other,Not_in_family,White,Male,0,0,40,United_States,0
3,643d310edc31495c1fcb7f99,3,53,Private,234721,other,7,Married_civ_spouse,other,Husband,other,Male,0,0,40,United_States,0
4,643d310edc31495c1fcb7f9a,4,28,Private,338409,Bachelors,13,Married_civ_spouse,Prof_specialty,other,other,Female,0,0,40,other,0


In [7]:
data_mongodb.drop(['_id','index'],axis=1,inplace=True)
data_mongodb.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,other,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,0
1,50,other,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,0
2,38,Private,215646,HS_grad,9,Divorced,other,Not_in_family,White,Male,0,0,40,United_States,0
3,53,Private,234721,other,7,Married_civ_spouse,other,Husband,other,Male,0,0,40,United_States,0
4,28,Private,338409,Bachelors,13,Married_civ_spouse,Prof_specialty,other,other,Female,0,0,40,other,0


In [8]:
data_mongodb.shape

(48813, 15)

# <span style='color:red'> 3.0 Model and Evaluation </span>

# <span style='color:red'> 3.1 Seperating Independent and Dependent features </span>

In [9]:
### Splitting data into independent feature dataframe and dependent feature series

X= data_mongodb.iloc[:,: :-1]
y=data_mongodb.iloc[:,-1]
X.head()

Unnamed: 0,salary,native_country,hours_per_week,capital_loss,capital_gain,sex,race,relationship,occupation,marital_status,education_num,education,fnlwgt,workclass,age
0,0,United_States,40,0,2174,Male,White,Not_in_family,Adm_clerical,Never_married,13,Bachelors,77516,other,39
1,0,United_States,13,0,0,Male,White,Husband,Exec_managerial,Married_civ_spouse,13,Bachelors,83311,other,50
2,0,United_States,40,0,0,Male,White,Not_in_family,other,Divorced,9,HS_grad,215646,Private,38
3,0,United_States,40,0,0,Male,other,Husband,other,Married_civ_spouse,7,other,234721,Private,53
4,0,other,40,0,0,Female,other,other,Prof_specialty,Married_civ_spouse,13,Bachelors,338409,Private,28


In [10]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

# <span style='color:red'> 3.2 Train Test Split </span>

In [11]:
### random state train test split will be same with all people using random_state=19
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.25,random_state=19)

In [12]:
X_train.head()

Unnamed: 0,salary,native_country,hours_per_week,capital_loss,capital_gain,sex,race,relationship,occupation,marital_status,education_num,education,fnlwgt,workclass,age
34576,0,United_States,40,0,0,Female,White,other,Adm_clerical,Married_civ_spouse,13,Bachelors,156383,Private,33
33148,1,United_States,48,0,7298,Male,White,Husband,Prof_specialty,Married_civ_spouse,15,other,293073,Private,29
2109,1,United_States,40,0,5556,Male,White,Husband,Sales,Married_civ_spouse,10,Some_college,146454,Private,66
33501,0,United_States,50,0,0,Male,White,Own_child,Adm_clerical,Never_married,9,HS_grad,260265,Private,19
47110,0,United_States,50,0,0,Male,White,Not_in_family,other,Divorced,9,HS_grad,86939,Private,23


In [13]:
y_train.head()

34576    0
33148    1
2109     1
33501    0
47110    0
Name: salary, dtype: int64

In [14]:
X_test.head()

Unnamed: 0,salary,native_country,hours_per_week,capital_loss,capital_gain,sex,race,relationship,occupation,marital_status,education_num,education,fnlwgt,workclass,age
3436,0,United_States,40,0,0,Male,White,Husband,Craft_repair,Married_civ_spouse,9,HS_grad,220585,Private,38
16332,0,United_States,60,0,0,Male,White,Not_in_family,other,Never_married,9,HS_grad,37778,other,38
39798,0,United_States,25,0,2407,Male,White,Husband,Craft_repair,Married_civ_spouse,6,other,127539,other,58
12405,0,United_States,8,0,0,Female,White,Own_child,Sales,Never_married,7,other,216137,Private,17
7584,0,United_States,20,0,0,Male,other,Own_child,Adm_clerical,Never_married,7,other,115630,Private,18


In [15]:
y_test.head()

3436     0
16332    0
39798    0
12405    0
7584     0
Name: salary, dtype: int64

In [16]:
X_train.shape,y_train.shape

((36609, 15), (36609,))

In [17]:
X_test.shape,y_test.shape

((12204, 15), (12204,))

In [18]:
data_mongodb.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

# <span style='color:red'> 3.3 Feature Encoding </span>

In [19]:
column_trans=make_column_transformer((OneHotEncoder(), ['workclass','education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']),
                                    remainder='passthrough')

In [20]:
X_train=column_trans.fit_transform(X_train)

In [21]:
X_test=column_trans.transform(X_test)

# <span style='color:red'> 3.4 Feature Scaling </span>

In [22]:
scaler=StandardScaler()
scaler

StandardScaler()

In [23]:
X_train=scaler.fit_transform(X_train)
X_train

array([[ 0.57666687, -0.57666687,  2.24301956, ...,  1.13261956,
        -0.31433281, -0.41221964],
       [ 0.57666687, -0.57666687, -0.44582759, ...,  1.91180699,
         0.98128957, -0.70482448],
       [ 0.57666687, -0.57666687, -0.44582759, ..., -0.03616158,
        -0.40844528,  2.00177035],
       ...,
       [ 0.57666687, -0.57666687, -0.44582759, ..., -0.4257553 ,
         0.02260105, -0.41221964],
       [-1.73410343,  1.73410343, -0.44582759, ..., -0.03616158,
        -0.60295452,  0.46559491],
       [ 0.57666687, -0.57666687,  2.24301956, ...,  1.13261956,
         2.39005735, -1.07058054]])

In [24]:
X_test=scaler.transform(X_test)
X_test

array([[ 0.57666687, -0.57666687, -0.44582759, ..., -0.4257553 ,
         0.29420877, -0.04646358],
       [-1.73410343,  1.73410343, -0.44582759, ..., -0.4257553 ,
        -1.43853569, -0.04646358],
       [-1.73410343,  1.73410343, -0.44582759, ..., -1.59453644,
        -0.58773197,  1.41656066],
       ...,
       [ 0.57666687, -0.57666687, -0.44582759, ..., -0.03616158,
         0.34653982, -0.04646358],
       [-1.73410343,  1.73410343, -0.44582759, ..., -2.37372387,
        -0.07176735,  2.80643369],
       [ 0.57666687, -0.57666687, -0.44582759, ..., -0.4257553 ,
        -0.9322771 , -1.21688297]])

In [25]:
X_train_scaled=pd.DataFrame(X_train)
X_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.576667,-0.576667,2.24302,-0.690827,-0.534593,-0.637338,-0.396985,1.089877,-0.702115,-0.288004,...,-1.417087,0.303837,-0.303837,-0.562366,-0.03188,-0.217955,-0.144306,1.13262,-0.314333,-0.41222
1,0.576667,-0.576667,-0.445828,-0.690827,-0.534593,1.569026,-0.396985,1.089877,-0.702115,-0.288004,...,0.705673,0.303837,-0.303837,1.778203,0.615701,-0.217955,0.846865,1.911807,0.98129,-0.704824
2,0.576667,-0.576667,-0.445828,-0.690827,1.870582,-0.637338,-0.396985,1.089877,-0.702115,-0.288004,...,0.705673,0.303837,-0.303837,1.778203,-0.03188,-0.217955,0.610277,-0.036162,-0.408445,2.00177
3,0.576667,-0.576667,-0.445828,1.447541,-0.534593,-0.637338,-0.396985,-0.917535,1.424268,-0.288004,...,0.705673,0.303837,-0.303837,-0.562366,0.777597,-0.217955,-0.144306,-0.425755,0.670317,-1.436337
4,0.576667,-0.576667,-0.445828,1.447541,-0.534593,-0.637338,2.518989,-0.917535,-0.702115,-0.288004,...,0.705673,0.303837,-0.303837,-0.562366,0.777597,-0.217955,-0.144306,-0.425755,-0.972561,-1.143732


In [26]:
X_test_scaled=pd.DataFrame(X_test)
X_test_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.576667,-0.576667,-0.445828,1.447541,-0.534593,-0.637338,-0.396985,1.089877,-0.702115,-0.288004,...,0.705673,0.303837,-0.303837,-0.562366,-0.03188,-0.217955,-0.144306,-0.425755,0.294209,-0.046464
1,-1.734103,1.734103,-0.445828,1.447541,-0.534593,-0.637338,-0.396985,-0.917535,1.424268,-0.288004,...,0.705673,0.303837,-0.303837,-0.562366,1.587074,-0.217955,-0.144306,-0.425755,-1.438536,-0.046464
2,-1.734103,1.734103,-0.445828,-0.690827,-0.534593,1.569026,-0.396985,1.089877,-0.702115,-0.288004,...,0.705673,0.303837,-0.303837,-0.562366,-1.246096,-0.217955,0.182598,-1.594536,-0.587732,1.416561
3,0.576667,-0.576667,-0.445828,-0.690827,-0.534593,1.569026,-0.396985,-0.917535,1.424268,-0.288004,...,-1.417087,0.303837,-0.303837,-0.562366,-2.622207,-0.217955,-0.144306,-1.204943,0.252048,-1.582639
4,0.576667,-0.576667,-0.445828,-0.690827,-0.534593,1.569026,-0.396985,-0.917535,1.424268,-0.288004,...,0.705673,0.303837,-0.303837,-0.562366,-1.650835,-0.217955,-0.144306,-1.204943,-0.700612,-1.509488


# <span style='color:red'> 3.5 Logestic Regression Model </span>

In [27]:
log_reg=LogisticRegression()
log_reg

LogisticRegression()

In [28]:
log_reg.fit(X_train,y_train)

LogisticRegression()

In [29]:
log_reg_predict=log_reg.predict(X_test)
log_reg_predict

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [30]:
conf_mat = confusion_matrix(y_test,log_reg_predict)
conf_mat

array([[9315,    0],
       [   0, 2889]], dtype=int64)

In [31]:
truely_positive=conf_mat[0,0]
falsely_positive=conf_mat[0,1]
falsely_neagtive=conf_mat[1,0]
truely_negative=conf_mat[1,1]

In [32]:
class_report=classification_report(y_test,log_reg_predict)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9315
           1       1.00      1.00      1.00      2889

    accuracy                           1.00     12204
   macro avg       1.00      1.00      1.00     12204
weighted avg       1.00      1.00      1.00     12204



# <span style='color:red'> 3.6 Support Vector Classifier Model </span>

In [33]:
X_train1,X_test1,y_train1,y_test1=train_test_split(X,y,test_size=0.25,random_state=19)

In [34]:
column_trans_svc= make_column_transformer((OneHotEncoder(),['workclass','education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']),
                                    remainder='passthrough')

In [35]:
X_train1=column_trans_svc.fit_transform(X_train1)

In [36]:
X_test1=column_trans_svc.fit_transform(X_test1)

In [37]:
scaler_svc=StandardScaler()
scaler_svc

StandardScaler()

In [38]:
X_train1=scaler_svc.fit_transform(X_train1)

In [39]:
X_test1=scaler_svc.fit_transform(X_test1)

In [40]:
svc=SVC()
svc

SVC()

In [41]:
svc.fit(X_train1,y_train1)

SVC()

In [42]:
svc_pred=svc.predict(X_test1)
svc_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [43]:
conf_mat=confusion_matrix(y_test,svc_pred)
conf_mat

array([[9315,    0],
       [   0, 2889]], dtype=int64)

In [44]:
truly_positive=conf_mat[0][0]
falsely_positive=conf_mat[0][1]
falsely_negative=conf_mat[1][0]
truly_negative=conf_mat[1][1]

In [45]:
classification_rep_svc=classification_report(y_test1, svc_pred)
print(classification_rep_svc)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9315
           1       1.00      1.00      1.00      2889

    accuracy                           1.00     12204
   macro avg       1.00      1.00      1.00     12204
weighted avg       1.00      1.00      1.00     12204



# <span style='color:red'> 3.7 Hyper-Parameter Tuning Logistic Regression Model </span>

In [46]:
param_grid =[ {'penalty':['l1','l2','elasticnet','none'],
                'C':np.logspace(-4,4,5),
              'solver':['lbfgs','newton-cg','liblinear','sag','saga'],
              'max_iter':[100,500]}]

In [47]:
log_reg_hyp=LogisticRegression()
log_reg_hyp

LogisticRegression()

In [48]:
hyp_log_reg=GridSearchCV(log_reg_hyp, param_grid=param_grid)

In [49]:
best_hyp_log_reg=hyp_log_reg.fit(X_train,y_train)
best_hyp_log_reg

GridSearchCV(estimator=LogisticRegression(),
             param_grid=[{'C': array([1.e-04, 1.e-02, 1.e+00, 1.e+02, 1.e+04]),
                          'max_iter': [100, 500],
                          'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                          'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag',
                                     'saga']}])

In [50]:
### getting best parameters for Logistic Regression model after gridsearchCV
print('best parameter are {} for optimal accuracy'.format(best_hyp_log_reg.best_estimator_))

best parameter are LogisticRegression(C=0.0001, penalty='l1', solver='liblinear') for optimal accuracy


In [51]:
### getting best accuracy for Logistic Regression model after gridsearchCV
print('best accuracy is {}'.format(best_hyp_log_reg.score(X_test,y_test)))

best accuracy is 1.0


# <span style='color:red'> 3.8 Hyper-Parameter Tuning Support Vector Classifier Model </span>

In [52]:
svc_hyp=SVC()
svc_hyp

SVC()

In [53]:
#### using gridsearchcv to increase model efficiency by combining above parameters

param_grid = {'C':[1,2,3],'kernel':['rbf']}

hyp_svc=GridSearchCV(svc_hyp,param_grid= param_grid)


In [54]:
best_hyp_svc= hyp_svc.fit(X_train1,y_train1)
best_hyp_svc

GridSearchCV(estimator=SVC(), param_grid={'C': [1, 2, 3], 'kernel': ['rbf']})

In [55]:
### getting best parameters for Logistic Regression model after gridsearchCV
print('best parameter optimal {}'.format(best_hyp_svc.best_estimator_))

best parameter optimal SVC(C=1)


In [56]:
### getting best accuracy for Logistic Regression model after gridsearchCV
print('best accuracy {}'.format(best_hyp_svc.score(X_test1,y_test1)))

best accuracy 1.0
