In [1]:
import pandas as pd

In [2]:
# Read the HR CSV (given as a relative path) into a DataFrame named df
df = pd.read_csv('../Data/HRData.csv')

In [3]:
# axes is a two-element list: the row index first, then the column labels
df.axes

[RangeIndex(start=0, stop=311, step=1),
 Index(['Employee_Name', 'EmpID', 'MarriedID', 'MaritalStatusID', 'GenderID',
        'EmpStatusID', 'DeptID', 'PerfScoreID', 'FromDiversityJobFairID',
        'Salary', 'Termd', 'PositionID', 'Position', 'State', 'Zip', 'DOB',
        'Sex', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino', 'RaceDesc',
        'DateofHire', 'DateofTermination', 'TermReason', 'EmploymentStatus',
        'Department', 'ManagerName', 'ManagerID', 'RecruitmentSource',
        'PerformanceScore', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'LastPerformanceReview_Date', 'DaysLateLast30',
        'Absences'],
       dtype='object')]

In [4]:
# shape is a (number of rows, number of columns) tuple
df.shape

(311, 36)

In [5]:
# Compact summary of the dataframe: with verbose=False the per-column listing is
# left out, so only the index range, column count, dtype tally and memory usage are printed
df.info(verbose = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Columns: 36 entries, Employee_Name to Absences
dtypes: float64(2), int64(16), object(18)
memory usage: 87.6+ KB


### Projecting one or more columns of a DataFrame

In [6]:
# Selecting with a single column label returns a Series;
# it keeps the same row index as the original df
df['RecruitmentSource']

0               LinkedIn
1                 Indeed
2               LinkedIn
3                 Indeed
4          Google Search
             ...        
306             LinkedIn
307        Google Search
308    Employee Referral
309    Employee Referral
310             LinkedIn
Name: RecruitmentSource, Length: 311, dtype: object

In [7]:
# Selecting with a one-element list of labels (note the double brackets) returns a
# single-column DataFrame rather than a Series - same row index as the original df
df[['RecruitmentSource']]

Unnamed: 0,RecruitmentSource
0,LinkedIn
1,Indeed
2,LinkedIn
3,Indeed
4,Google Search
...,...
306,LinkedIn
307,Google Search
308,Employee Referral
309,Employee Referral


In [8]:
# Selecting with a list of two labels returns a DataFrame holding those columns,
# in the order listed - same row index as the original df
df[['Department','RecruitmentSource']]

Unnamed: 0,Department,RecruitmentSource
0,Production,LinkedIn
1,IT/IS,Indeed
2,Production,LinkedIn
3,Production,Indeed
4,Production,Google Search
...,...,...
306,Production,LinkedIn
307,Production,Google Search
308,IT/IS,Employee Referral
309,IT/IS,Employee Referral


### Attributes and Methods of a column projected as a Series

In [9]:
# Extract every value of the column as a NumPy ndarray (an array, not a Python list).
# .to_numpy() is the accessor the pandas docs recommend over the older .values
# attribute; for this object-dtype column it yields the same array.
df['RecruitmentSource'].to_numpy()

array(['LinkedIn', 'Indeed', 'LinkedIn', 'Indeed', 'Google Search',
       'LinkedIn', 'LinkedIn', 'Employee Referral', 'Diversity Job Fair',
       'Indeed', 'Diversity Job Fair', 'Diversity Job Fair',
       'Diversity Job Fair', 'Google Search', 'On-line Web application',
       'Google Search', 'Employee Referral', 'Google Search',
       'Google Search', 'LinkedIn', 'Google Search', 'Indeed', 'Indeed',
       'CareerBuilder', 'Google Search', 'LinkedIn', 'Diversity Job Fair',
       'Indeed', 'Google Search', 'Diversity Job Fair', 'Google Search',
       'Diversity Job Fair', 'Google Search', 'Employee Referral',
       'Indeed', 'Google Search', 'Indeed', 'Indeed', 'LinkedIn',
       'LinkedIn', 'Indeed', 'Google Search', 'Indeed', 'Indeed',
       'LinkedIn', 'Employee Referral', 'Indeed', 'Indeed', 'Indeed',
       'Google Search', 'Indeed', 'Employee Referral',
       'Employee Referral', 'LinkedIn', 'CareerBuilder', 'Indeed',
       'Indeed', 'Indeed', 'LinkedIn', 'Employee R

In [10]:
# unique() returns a NumPy ndarray (not a Python list) holding each distinct value
# of the column once; in the output they are listed in order of first appearance
df['RecruitmentSource'].unique()

array(['LinkedIn', 'Indeed', 'Google Search', 'Employee Referral',
       'Diversity Job Fair', 'On-line Web application', 'CareerBuilder',
       'Website', 'Other'], dtype=object)

In [11]:
# nunique() returns the number of distinct values in this column (not in the whole dataframe)
# NOTE(review): per the pandas docs NaN is not counted unless dropna=False is passed - confirm if that matters here
df['RecruitmentSource'].nunique()

9

In [12]:
# value_counts() returns a Series of record counts per distinct value of the column:
# the index holds the distinct values and the Series values hold their counts.
# NOTE: NaN's are not counted, and the output is sorted by count in descending order.
df['ManagerID'].value_counts()

18.0    22
20.0    22
16.0    22
12.0    22
19.0    21
14.0    21
22.0    21
11.0    21
2.0     19
4.0     17
7.0     14
17.0    14
39.0    13
21.0    13
10.0     9
13.0     8
5.0      7
1.0      6
6.0      4
15.0     3
9.0      2
3.0      1
30.0     1
Name: ManagerID, dtype: int64

In [13]:
# To count NaN's as well, pass dropna=False (adds a NaN row to the result).
# To order by the grouping key instead of by count, chain sort_index() - the key is the
# index of the result; na_position='first' places the NaN row at the top.
df['ManagerID'].value_counts(dropna = False).sort_index(na_position = 'first')

NaN      8
1.0      6
2.0     19
3.0      1
4.0     17
5.0      7
6.0      4
7.0     14
9.0      2
10.0     9
11.0    21
12.0    22
13.0     8
14.0    21
15.0     3
16.0    22
17.0    14
18.0    22
19.0    21
20.0    22
21.0    13
22.0    21
30.0     1
39.0    13
Name: ManagerID, dtype: int64