In [None]:
"""
Load data from postgres database.
Merge and examine data.

Downloaded csv files are stored locally
on PostgreSQL database 'hr_analytics'
List of relations:
- candidate
- candidate_test
- test_target

"""

In [1]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql

# We are also going to do some basic viz
import matplotlib.pyplot as plt
%matplotlib inline

# There is a bug in matplotlib. You cannot set the rc parameters in the same
# cell that you use the "%matplotlib inline" magic command
plt.style.use('ggplot')
plt.rc('font', size=18)

In [2]:
# Postgres info to connect

connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'hr_analytics',    # DB that we are connecting to
    'port': 5432          
}

connection = pg.connect(**connection_args)
connection.autocommit = False

In [3]:
# Load train data
query_train = "SELECT * FROM candidate;"
df_train = pd_sql.read_sql(query_train, connection)
df_train

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.7759999999999999,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [4]:
# Load test data
query_test = "SELECT * FROM candidate_test;"

df_test= pd_sql.read_sql(query_test, connection)
df_test

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,32403,city_41,0.8270000000000001,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
1,9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
2,31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
3,27385,city_13,0.8270000000000001,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
4,27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,1289,city_103,0.92,Male,No relevent experience,no_enrollment,Graduate,Humanities,16,,Public Sector,4,15
2125,195,city_136,0.897,Male,Has relevent experience,no_enrollment,Masters,STEM,18,,,2,30
2126,31762,city_100,0.887,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,18
2127,7873,city_102,0.804,Male,Has relevent experience,Full time course,High School,,7,100-500,Public Sector,1,84


In [5]:
# Load test target
query_test_target = "SELECT enrollee_id, target FROM test_target;"

df_test_target = pd_sql.read_sql(query_test_target, connection)
df_test_target

Unnamed: 0,enrollee_id,target
0,32403,1.0
1,9858,0.0
2,31806,1.0
3,27385,0.0
4,27724,1.0
...,...,...
2124,1289,0.0
2125,195,1.0
2126,31762,0.0
2127,7873,0.0


In [6]:
# SQL join test data and test target

query_test_target = """
WITH target AS(
    SELECT
        enrollee_id,
        target
    FROM test_target
)

SELECT
    candidate_test.enrollee_id,
    city,
    city_development_index,
    gender,
    relevent_experience,
    enrolled_university,
    education_level,
    major_discipline,
    experience,
    company_size,
    company_type,
    last_new_job,
    training_hours,
    target
FROM candidate_test
    LEFT JOIN target
        ON candidate_test.enrollee_id = target.enrollee_id;
"""

result = pd_sql.read_sql(query_test_target, connection)
result

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,32403,city_41,0.8270000000000001,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21,1.0
1,9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98,0.0
2,31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15,1.0
3,27385,city_13,0.8270000000000001,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39,0.0
4,27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,1289,city_103,0.92,Male,No relevent experience,no_enrollment,Graduate,Humanities,16,,Public Sector,4,15,0.0
2125,195,city_136,0.897,Male,Has relevent experience,no_enrollment,Masters,STEM,18,,,2,30,1.0
2126,31762,city_100,0.887,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,18,0.0
2127,7873,city_102,0.804,Male,Has relevent experience,Full time course,High School,,7,100-500,Public Sector,1,84,0.0


In [7]:
# Examine data
# Group by target
query_target = """
SELECT COUNT(*)
    FROM candidate
    GROUP BY target;
"""

target_count = pd_sql.read_sql(query_target, connection)
target_count.head()

Unnamed: 0,count
0,14381
1,4777


In [8]:
# Examine data
# Group by major
query_major = """
SELECT COUNT(major_discipline)
    FROM candidate
    GROUP BY major_discipline;
"""

major_count = pd_sql.read_sql(query_major, connection)
major_count

Unnamed: 0,count
0,14492
1,327
2,223
3,253
4,381
5,0
6,669


In [9]:
# Examine data
# Group by enrolled_university and filter by relevent_experience
query_experience = """
SELECT COUNT(major_discipline)
    FROM candidate
    WHERE relevent_experience = 'Has relevent experience'
    GROUP BY enrolled_university;
"""

experience = pd_sql.read_sql(query_experience, connection)
experience

Unnamed: 0,count
0,1243
1,703
2,158
3,10613
