In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt


from pandas_profiling import ProfileReport
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_excel("train.xlsx")
test_data = pd.read_excel('test.xlsx')
train_data, validation_data = train_test_split(train_data, test_size = 0.3)

In [3]:
train = train_data.set_index('CITIZEN_ID')
# Split Titles from the name. Titles will then be converted to gender.
train.insert(2,'Gender','')
train[['Gender','Name']] = train['Name'].str.split(' ', n=1, expand = True)
# Check unique titles
train['Gender'].unique()
# Replace with binary 1 = male 0 = female
train['Gender'] = train['Gender'].replace(['Mr.','Miss','Mrs.'],[1,0,1]).astype(np.uint8)

In [4]:
# Age
train.insert(2,'Year','')
train['Year'] = train['Birthday'].str.strip().str[-4:]
train['Year'] = train['Year'].astype(int)
train.insert(2,'Age','')
train['Age'] = 2048-train['Year']

train = train.drop(['Name','Birthday','Year'], axis = 1)

In [5]:
train.isin(['?']).sum(axis=0)

Age                         0
Gender                      0
Native Continent            0
Marital Status              0
Lives with                  0
Base Area                 264
Education Level             0
Years of Education          0
Employment Sector         851
Role                      856
Working Hours per week      0
Money Received              0
Ticket Price                0
Income                      0
dtype: int64

In [6]:
train = train[train['Base Area'] != '?']
train = train[train['Employment Sector'] != '?']
train = train[train['Role'] != '?']

In [7]:
# Work related features
train.insert(10,'Emp Role','')
train["Emp Role"] = train["Employment Sector"] + train["Role"]
train.insert(10,'Base Emp','')
train["Base Emp"] = train["Base Area"] + train["Employment Sector"]
train.insert(10,'Base Role','')
train["Base Role"] = train["Base Area"] + train["Role"]

In [8]:
train.shape

(14573, 17)

In [9]:
# Ticket and Money received binary
train.insert(16,'Ticket Bi','')
train['Ticket Bi'] = train['Ticket Price'].astype(bool).astype(np.uint8)
train.insert(16,'Money Bi','')
train['Money Bi'] = train['Money Received'].astype(bool).astype(np.uint8)

In [10]:
# Check number of 0 before binary
train.isin([0]).sum(axis=0)

Age                           0
Gender                     2135
Native Continent              0
Marital Status                0
Lives with                    0
Base Area                     0
Education Level               0
Years of Education            0
Employment Sector             0
Role                          0
Base Role                     0
Base Emp                      0
Emp Role                      0
Working Hours per week        0
Money Received            13339
Ticket Price              13869
Money Bi                  13339
Ticket Bi                 13869
Income                    10992
dtype: int64

In [11]:
train.head(5)

Unnamed: 0_level_0,Age,Gender,Native Continent,Marital Status,Lives with,Base Area,Education Level,Years of Education,Employment Sector,Role,Base Role,Base Emp,Emp Role,Working Hours per week,Money Received,Ticket Price,Money Bi,Ticket Bi,Income
CITIZEN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
14626,43,1,Europe,Divorced,Other relatives,Northbury,Bachelors + PostGraduation,16,Private Sector - Services,Machine Operators & Inspectors,NorthburyMachine Operators & Inspectors,NorthburyPrivate Sector - Services,Private Sector - Services Machine Operators & ...,40,0,0,0,0,0
20880,53,0,Europe,Single,Other Family,Northbury,Professional School,12,Private Sector - Services,Sales,NorthburySales,NorthburyPrivate Sector - Services,Private Sector - Services Sales,40,0,0,0,0,0
31427,62,1,Europe,Widow,Other Family,Northbury,Professional School,12,Private Sector - Services,Sales,NorthburySales,NorthburyPrivate Sector - Services,Private Sector - Services Sales,32,0,0,0,0,0
14909,34,1,Europe,Single,Other Family,Northbury,High School + PostGraduation,13,Public Sector - Others,Security,NorthburySecurity,NorthburyPublic Sector - Others,Public Sector - OthersSecurity,50,0,0,0,0,1
22542,41,1,Africa,Divorced,Alone,Northbury,Professional School,12,Private Sector - Services,Sales,NorthburySales,NorthburyPrivate Sector - Services,Private Sector - Services Sales,40,0,0,0,0,0


In [12]:
train_base = train.copy()

In [169]:
import phik

In [170]:
#train.phik_matrix().to_excel('phik explore1.xlsx')

In [171]:
train.nunique()

Age                        72
Gender                      2
Native Continent            5
Marital Status              7
Lives with                  6
Base Area                  39
Education Level            16
Years of Education         14
Employment Sector           7
Role                       14
Base Role                 321
Base Emp                  150
Emp Role                   78
Working Hours per week     93
Money Received            106
Ticket Price               76
Money Bi                    2
Ticket Bi                   2
Income                      2
dtype: int64

In [172]:
#train = train.drop(['Education Level'], axis = 1)
#train = train.drop(['Native Continent','Base Area'], axis = 1)

In [173]:
#Neverworked
#train.loc[train['Employment Sector'] == 'Never Worked', 'Role'] = "Never Worked"

In [174]:
train['Employment Sector'].unique()

array(['Private Sector - Services ', 'Self-Employed (Individual)',
       'Private Sector - Others', 'Public Sector - Others',
       'Self-Employed (Company)', 'Public Sector - Government',
       'Unemployed'], dtype=object)

In [175]:
#Drop a reference variable per column if you want to actually use it.
dummies_NC = pd.get_dummies(train['Native Continent'])
dummies_MS = pd.get_dummies(train['Marital Status'])
dummies_LW = pd.get_dummies(train['Lives with'])
dummies_BA = pd.get_dummies(train['Base Area'])
dummies_EL = pd.get_dummies(train['Education Level'])
dummies_ES = pd.get_dummies(train['Employment Sector'])
dummies_Ro = pd.get_dummies(train['Role'])
dummies_BR = pd.get_dummies(train['Base Role'])
dummies_BE = pd.get_dummies(train['Base Emp'])
dummies_ER = pd.get_dummies(train['Emp Role'])



In [176]:
train = train.join(dummies_NC)
train = train.join(dummies_MS)
train = train.join(dummies_LW)
train = train.join(dummies_BA)
train = train.join(dummies_EL)
train = train.join(dummies_ES)
train = train.join(dummies_Ro)
train = train.join(dummies_BR)
train = train.join(dummies_BE)
train = train.join(dummies_ER)

In [177]:
train['Income2'] = train['Income'].astype(np.uint8)
train = train.drop(['Income'], axis = 1)
train.rename(columns={'Income2':'Income'}, inplace=True)

In [178]:
train = train.convert_dtypes()

In [179]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14492 entries, 16807 to 21531
Data columns (total 662 columns):
 #   Column                                                    Dtype 
---  ------                                                    ----- 
 0   Age                                                       Int32 
 1   Gender                                                    UInt8 
 2   Native Continent                                          string
 3   Marital Status                                            string
 4   Lives with                                                string
 5   Base Area                                                 string
 6   Education Level                                           string
 7   Years of Education                                        Int64 
 8   Employment Sector                                         string
 9   Role                                                      string
 10  Base Role                                

In [180]:
train.phik_matrix().to_excel('phik explore-after.xlsx')

interval columns not set, guessing: ['Age', 'Gender', 'Years of Education', 'Working Hours per week', 'Money Received', 'Ticket Price', 'Money Bi', 'Ticket Bi', 'Africa', 'America', 'Asia', 'Europe', 'Oceania', 'Divorced', 'Married', 'Married - Spouse Missing', 'Married - Spouse in the Army', 'Separated', 'Single', 'Widow', 'Alone', 'Children', 'Husband', 'Other Family', 'Other relatives', 'Wife', 'Aberuthven', 'Aerilon', 'Alverton', 'Aroonshire', 'Auchenshuggle', 'Bellenau', 'Bellmoral', 'Butterpond', 'Carlisle', 'Cherrytown', 'Conriston', 'Drumchapel', 'Eelry', 'Fanfoss', "Fool's March", 'Ironforge', 'Kald', "King's Watch", 'Kirkwall', "Knife's Edge", 'Laenteglos', 'Laewaes', 'Lanercost', 'Lewes', 'Marnmouth', 'Mensfield', 'Middlesbrough', 'MillerVille', 'Northbury', 'Orilon', 'Pran', 'Redwick Bush', 'Sharnwick', 'Sharpton', 'Tranmere', 'Watford', 'Wigston', 'Willesden', 'Woodpine', 'Bachelors', 'Bachelors + PostGraduation', 'High School + PostGraduation', 'High School - 1st Cycle', 



In [None]:
train = train.drop(['Native Continent', 'Marital Status','Lives with','Base Area','Education Level','Employment Sector','Role','Base Role','Base Emp','Emp Role'], axis = 1) 

In [159]:
pearsoncorr = train.corr(method='pearson')
pearsoncorr.to_excel('pearson explore.xlsx')

In [123]:
#train_profile=ProfileReport(train, title='Training Dataset')

In [124]:
#train_profile.to_notebook_iframe()

In [125]:
#train_profile.to_file(output_file="Pandas Profiling Report — ML Correl .html")

In [26]:
train = train_base.copy()

In [32]:
train.nunique()

Age                                            71
Gender                                          2
Years of Education                             14
Working Hours per week                         90
Money Bi                                        2
Ticket Bi                                       2
Marital Status_Married                          2
Marital Status_Married - Spouse Missing         2
Marital Status_Married - Spouse in the Army     2
Marital Status_Separated                        2
Marital Status_Single                           2
Marital Status_Widow                            2
Lives with_Children                             2
Lives with_Husband                              2
Lives with_Other Family                         2
Lives with_Other relatives                      2
Lives with_Wife                                 2
Income                                          2
dtype: int64

In [28]:
train = pd.get_dummies(train, columns=['Marital Status', 'Lives with'],drop_first=True)

In [29]:
train = train.drop(['Native Continent','Base Area','Education Level','Employment Sector','Role','Base Role','Base Emp','Emp Role','Money Received','Ticket Price'], axis = 1)

In [30]:
train['Income2'] = train['Income'].astype(np.uint8)
train = train.drop(['Income'], axis = 1)
train.rename(columns={'Income2':'Income'}, inplace=True)

In [31]:
train.columns

Index(['Age', 'Gender', 'Years of Education', 'Working Hours per week',
       'Money Bi', 'Ticket Bi', 'Marital Status_Married',
       'Marital Status_Married - Spouse Missing',
       'Marital Status_Married - Spouse in the Army',
       'Marital Status_Separated', 'Marital Status_Single',
       'Marital Status_Widow', 'Lives with_Children', 'Lives with_Husband',
       'Lives with_Other Family', 'Lives with_Other relatives',
       'Lives with_Wife', 'Income'],
      dtype='object')

In [36]:
!pip install tensorflow
!pip install keras

Collecting tensorflow
  Downloading tensorflow-2.3.1-cp38-cp38-win_amd64.whl (342.5 MB)
Collecting absl-py>=0.7.0
  Downloading absl_py-0.11.0-py3-none-any.whl (127 kB)
Collecting astunparse==1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting keras-preprocessing<1.2,>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting tensorboard<3,>=2.3.0
  Downloading tensorboard-2.4.0-py3-none-any.whl (10.6 MB)
Collecting grpcio>=1.8.6
  Downloading grpcio-1.33.2-cp38-cp38-win_amd64.whl (2.7 MB)
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
Collecting google-pasta>=0.1.8
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collectin

In [19]:
conda update --all


Note: you may need to restart the kernel to use updated packages.Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\gogog\anaconda3


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _anaconda_depends-2020.07  |           py38_0           6 KB
    anaconda-custom            |           py38_1          36 KB
    anaconda-navigator-1.10.0  |           py38_0         4.9 MB
    asn1crypto-1.4.0           |             py_0          80 KB
    astropy-4.0.2              |   py38he774522_0         5.9 MB

    async_generator-1.10       |             py_0          24 KB
    attrs-20.3.0               |     pyhd3eb1b0_0          43 KB
    autopep8-1.5.4             |             py_0          42 KB
    babel-2.8.1                |     pyhd3eb1b0_0         5.3 MB
    backports-1.0   

DEBUG menuinst_win32:__init__(198): Menu: name: 'Anaconda${PY_VER} ${PLATFORM}', prefix: 'C:\Users\gogog\anaconda3', env_name: 'None', mode: 'user', used_mode: 'user'
DEBUG menuinst_win32:create(323): Shortcut cmd is C:\Users\gogog\anaconda3\pythonw.exe, args are ['C:\\Users\\gogog\\anaconda3\\cwp.py', 'C:\\Users\\gogog\\anaconda3', 'C:\\Users\\gogog\\anaconda3\\pythonw.exe', 'C:\\Users\\gogog\\anaconda3\\Scripts\\spyder-script.py']
DEBUG menuinst_win32:create(323): Shortcut cmd is C:\Users\gogog\anaconda3\python.exe, args are ['C:\\Users\\gogog\\anaconda3\\cwp.py', 'C:\\Users\\gogog\\anaconda3', 'C:\\Users\\gogog\\anaconda3\\python.exe', 'C:\\Users\\gogog\\anaconda3\\Scripts\\spyder-script.py', '--reset']
DEBUG menuinst_win32:__init__(198): Menu: name: 'Anaconda${PY_VER} ${PLATFORM}', prefix: 'C:\Users\gogog\anaconda3', env_name: 'None', mode: 'user', used_mode: 'user'
DEBUG menuinst_win32:create(323): Shortcut cmd is C:\Users\gogog\anaconda3\pythonw.exe, args are ['C:\\Users\\gogog\\

    conda-package-handling-1.7.2|   py38h76e460a_0         723 KB
    cryptography-3.2.1         |   py38hcd4344a_1         541 KB
    cython-0.29.21             |   py38hd77b12b_0         1.8 MB
    cytoolz-0.11.0             |   py38he774522_0         289 KB
    dask-2.30.0                |             py_0           5 KB
    dask-core-2.30.0           |             py_0         605 KB
    distributed-2.30.1         |   py38haa95532_0         1.0 MB
    flake8-3.8.4               |             py_0         128 KB
    freetype-2.10.4            |       hd328e21_0         466 KB
    fsspec-0.8.3               |             py_0          72 KB
    gevent-20.9.0              |   py38he774522_0         1.3 MB
    greenlet-0.4.17            |   py38he774522_0          23 KB
    importlib-metadata-2.0.0   |             py_1          35 KB
    importlib_metadata-2.0.0   |                1          11 KB
    iniconfig-1.1.1            |             py_0           9 KB
    intel-openmp-2020.2 

mkl-2020.2           | 109.3 MB  | #8         |  19% 
mkl-2020.2           | 109.3 MB  | #9         |  20% 
mkl-2020.2           | 109.3 MB  | ##         |  20% 
mkl-2020.2           | 109.3 MB  | ##         |  21% 
mkl-2020.2           | 109.3 MB  | ##1        |  21% 
mkl-2020.2           | 109.3 MB  | ##1        |  22% 
mkl-2020.2           | 109.3 MB  | ##2        |  22% 
mkl-2020.2           | 109.3 MB  | ##3        |  23% 
mkl-2020.2           | 109.3 MB  | ##3        |  24% 
mkl-2020.2           | 109.3 MB  | ##4        |  24% 
mkl-2020.2           | 109.3 MB  | ##5        |  25% 
mkl-2020.2           | 109.3 MB  | ##5        |  26% 
mkl-2020.2           | 109.3 MB  | ##6        |  26% 
mkl-2020.2           | 109.3 MB  | ##7        |  27% 
mkl-2020.2           | 109.3 MB  | ##7        |  28% 
mkl-2020.2           | 109.3 MB  | ##8        |  28% 
mkl-2020.2           | 109.3 MB  | ##9        |  29% 
mkl-2020.2           | 109.3 MB  | ##9        |  30% 
mkl-2020.2           | 109.3


pandas-1.1.3         | 7.5 MB    |            |   0% 
pandas-1.1.3         | 7.5 MB    | 6          |   7% 
pandas-1.1.3         | 7.5 MB    | #6         |  16% 
pandas-1.1.3         | 7.5 MB    | ##4        |  25% 
pandas-1.1.3         | 7.5 MB    | ###4       |  34% 
pandas-1.1.3         | 7.5 MB    | ####4      |  45% 
pandas-1.1.3         | 7.5 MB    | #####3     |  54% 
pandas-1.1.3         | 7.5 MB    | ######5    |  65% 
pandas-1.1.3         | 7.5 MB    | #######5   |  75% 
pandas-1.1.3         | 7.5 MB    | ########4  |  84% 
pandas-1.1.3         | 7.5 MB    | #########9 |  99% 
pandas-1.1.3         | 7.5 MB    | ########## | 100% 

bleach-3.2.1         | 112 KB    |            |   0% 
bleach-3.2.1         | 112 KB    | ########## | 100% 
bleach-3.2.1         | 112 KB    | ########## | 100% 

intel-openmp-2020.2  | 1.6 MB    |            |   0% 
intel-openmp-2020.2  | 1.6 MB    | ###5       |  36% 
intel-openmp-2020.2  | 1.6 MB    | ########8  |  88% 
intel-openmp-2020.2  | 1.

scipy-1.5.2          | 11.9 MB   | ##3        |  23% 
scipy-1.5.2          | 11.9 MB   | ###        |  31% 
scipy-1.5.2          | 11.9 MB   | ###8       |  38% 
scipy-1.5.2          | 11.9 MB   | ####4      |  45% 
scipy-1.5.2          | 11.9 MB   | #####2     |  52% 
scipy-1.5.2          | 11.9 MB   | #####8     |  58% 
scipy-1.5.2          | 11.9 MB   | ######4    |  65% 
scipy-1.5.2          | 11.9 MB   | #######    |  71% 
scipy-1.5.2          | 11.9 MB   | #######7   |  78% 
scipy-1.5.2          | 11.9 MB   | ########3  |  84% 
scipy-1.5.2          | 11.9 MB   | #########1 |  91% 
scipy-1.5.2          | 11.9 MB   | #########8 |  99% 
scipy-1.5.2          | 11.9 MB   | ########## | 100% 

pysocks-1.7.1        | 31 KB     |            |   0% 
pysocks-1.7.1        | 31 KB     | ########## | 100% 
pysocks-1.7.1        | 31 KB     | ########## | 100% 

jupyterlab-2.2.6     | 3.4 MB    |            |   0% 
jupyterlab-2.2.6     | 3.4 MB    | #6         |  16% 
jupyterlab-2.2.6     | 3.4

In [1]:
import tensorflow as tf

ImportError: Traceback (most recent call last):
  File "C:\Users\gogog\anaconda3\lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 64, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [13]:
from keras.models import Sequential
from keras.layers import Dense

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [None]:
# Random seed for reproducibility
numpy.random.seed(7)

In [18]:
from platform import python_version

print(python_version())

3.8.3
