In [1]:
# import packages
import numpy as np
import pandas as pd


In [20]:
# Read the audit dataset into Python
df = pd.read_csv("audit.csv")
df

Unnamed: 0,ID,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted
0,1004641,38,Private,College,Unmarried,Service,81838.00,Female,0.0,72,0,0
1,1010229,35,Private,Associate,Absent,Transport,72099.00,Male,0.0,30,0,0
2,1024587,32,Private,HSgrad,Divorced,Clerical,154676.74,Male,0.0,40,0,0
3,1038288,45,Private,Bachelor,Married,Repair,27743.82,Male,0.0,55,7298,1
4,1044221,60,Private,College,Married,Executive,7568.23,Male,0.0,40,15024,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,9957280,62,Private,HSgrad,Married,Repair,24080.59,Male,0.0,40,0,0
1996,9964393,35,Consultant,Associate,Married,Repair,57497.30,Male,0.0,40,0,0
1997,9972967,32,Private,Bachelor,Married,Sales,30538.18,Male,0.0,44,0,0
1998,9991103,34,Private,College,Unmarried,Sales,113425.67,Male,0.0,45,0,0


In [3]:
# Filtering
# Subsetting rows
# Displaying Row numbers 1,2,8,456
df.iloc[[1, 2, 8, 456]]


Unnamed: 0,ID,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted
1,1010229,35,Private,Associate,Absent,Transport,72099.0,Male,0.0,30,0,0
2,1024587,32,Private,HSgrad,Divorced,Clerical,154676.74,Male,0.0,40,0,0
8,1061323,25,Private,Associate,Divorced,Clerical,126888.91,Female,0.0,40,0,0
456,3274547,40,Private,Vocational,Absent,Clerical,82953.38,Female,0.0,40,0,0


In [4]:
# Subsetting columns
# Display Education and Marital Status
df[["Education", "Marital"]]

Unnamed: 0,Education,Marital
0,College,Unmarried
1,Associate,Absent
2,HSgrad,Divorced
3,Bachelor,Married
4,College,Married
...,...,...
1995,HSgrad,Married
1996,Associate,Married
1997,Bachelor,Married
1998,College,Unmarried


In [5]:
# Subsetting rows and columns
#  Display 2nd row and 4th column
df.iloc[:, [1, 3]]


Unnamed: 0,Age,Education
0,38,College
1,35,Associate
2,32,HSgrad
3,45,Bachelor
4,60,College
...,...,...
1995,62,HSgrad
1996,35,Associate
1997,32,Bachelor
1998,34,College


In [6]:
# Displaying Row numbers 1,2,8,456 and Column numbers 1,3,6
df.iloc[[1, 2, 8, 456], [1, 3, 6]]


Unnamed: 0,Age,Education,Income
1,35,Associate,72099.0
2,32,HSgrad,154676.74
8,25,Associate,126888.91
456,40,Vocational,82953.38


In [7]:
# We can simply extract columns from a dataframe
# Displaying first 5 rows of column "Education"
df["Education"].head()


0      College
1    Associate
2       HSgrad
3     Bachelor
4      College
Name: Education, dtype: object

In [8]:
# Display all rows and columns for male private employed employees
df.query(""" Gender == "Male" and Occupation == "Service" """)

Unnamed: 0,ID,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted
5,1047095,74,Private,HSgrad,Married,Service,33144.40,Male,0.0,30,0,0
22,1126025,30,Private,HSgrad,Absent,Service,88125.97,Male,0.0,30,0,0
41,1172752,77,Private,HSgrad,Married,Service,39950.92,Male,0.0,25,0,0
74,1310818,33,Private,Yr11,Absent,Service,32550.53,Male,0.0,40,0,0
91,1398832,42,Private,Vocational,Married,Service,22787.48,Male,0.0,30,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1871,9490694,22,Private,HSgrad,Absent,Service,116599.30,Male,0.0,80,0,0
1887,9538363,37,Private,Yr1t4,Married,Service,11180.76,Male,0.0,53,0,0
1889,9540165,20,Private,HSgrad,Absent,Service,156308.00,Male,0.0,50,0,0
1940,9747618,46,Private,Yr10,Married-spouse-absent,Service,82756.91,Male,0.0,37,0,0


In [9]:
# Get only the columns Age and Marital status of male private employees
df.query(""" Occupation == "Service" """)[["Age", "Marital"]]


Unnamed: 0,Age,Marital
0,38,Unmarried
5,74,Married
10,48,Divorced
12,21,Absent
22,30,Absent
...,...,...
1940,46,Married-spouse-absent
1974,48,Absent
1979,17,Absent
1986,35,Divorced


In [10]:
# Get all the columns except Age and Marital status of male private employees
df.query(""" Gender == "Male" and Occupation == "Service" """).drop(["Age", "Marital"], axis=1)

Unnamed: 0,ID,Employment,Education,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted
5,1047095,Private,HSgrad,Service,33144.40,Male,0.0,30,0,0
22,1126025,Private,HSgrad,Service,88125.97,Male,0.0,30,0,0
41,1172752,Private,HSgrad,Service,39950.92,Male,0.0,25,0,0
74,1310818,Private,Yr11,Service,32550.53,Male,0.0,40,0,0
91,1398832,Private,Vocational,Service,22787.48,Male,0.0,30,0,0
...,...,...,...,...,...,...,...,...,...,...
1871,9490694,Private,HSgrad,Service,116599.30,Male,0.0,80,0,0
1887,9538363,Private,Yr1t4,Service,11180.76,Male,0.0,53,0,0
1889,9540165,Private,HSgrad,Service,156308.00,Male,0.0,50,0,0
1940,9747618,Private,Yr10,Service,82756.91,Male,0.0,37,0,0


In [11]:
# Adding new columns
# Create a new column LogIncome
df["LogIncome"] = np.log(df["Income"])
df

Unnamed: 0,ID,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted,LogIncome
0,1004641,38,Private,College,Unmarried,Service,81838.00,Female,0.0,72,0,0,11.312497
1,1010229,35,Private,Associate,Absent,Transport,72099.00,Male,0.0,30,0,0,11.185795
2,1024587,32,Private,HSgrad,Divorced,Clerical,154676.74,Male,0.0,40,0,0,11.949093
3,1038288,45,Private,Bachelor,Married,Repair,27743.82,Male,0.0,55,7298,1,10.230768
4,1044221,60,Private,College,Married,Executive,7568.23,Male,0.0,40,15024,1,8.931715
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,9957280,62,Private,HSgrad,Married,Repair,24080.59,Male,0.0,40,0,0,10.089161
1996,9964393,35,Consultant,Associate,Married,Repair,57497.30,Male,0.0,40,0,0,10.959493
1997,9972967,32,Private,Bachelor,Married,Sales,30538.18,Male,0.0,44,0,0,10.326733
1998,9991103,34,Private,College,Unmarried,Sales,113425.67,Male,0.0,45,0,0,11.638903


In [12]:
# Sorting data
# Sort the data in ascending order of income
df.sort_values(by="Income")

Unnamed: 0,ID,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted,LogIncome
298,2474583,31,Private,College,Absent,Executive,609.72,Male,0.0,40,0,0,6.413000
27,1141887,32,Consultant,HSgrad,Married,Sales,1428.27,Male,0.0,60,28235,1,7.264219
1665,8508526,79,Private,Bachelor,Married,Sales,1598.95,Male,0.0,40,0,0,7.377102
1962,9844729,21,Private,College,Absent,Professional,2164.59,Female,0.0,40,0,0,7.679986
706,4396587,24,Private,HSgrad,Absent,Sales,2378.63,Male,0.0,38,0,0,7.774280
...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,2424026,40,Private,College,Married,Professional,385067.54,Female,0.0,40,0,0,12.861174
726,4477040,27,Private,Professional,Married,Professional,393393.42,Female,0.0,50,791,1,12.882565
1089,6078926,44,Private,College,Married,Clerical,411783.25,Female,2415.0,6,3636,1,12.928252
1572,8180215,39,Private,Associate,Married,Clerical,421075.30,Female,0.0,25,0,1,12.950567


In [13]:
# Sort the data in descending order of income
df.sort_values(by="Income", ascending=False)

Unnamed: 0,ID,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,RISK_Adjustment,TARGET_Adjusted,LogIncome
585,3835253,28,Private,Bachelor,Married,Executive,481259.50,Female,0.0,40,6264,1,13.084162
1572,8180215,39,Private,Associate,Married,Clerical,421075.30,Female,0.0,25,0,1,12.950567
1089,6078926,44,Private,College,Married,Clerical,411783.25,Female,2415.0,6,3636,1,12.928252
726,4477040,27,Private,Professional,Married,Professional,393393.42,Female,0.0,50,791,1,12.882565
288,2424026,40,Private,College,Married,Professional,385067.54,Female,0.0,40,0,0,12.861174
...,...,...,...,...,...,...,...,...,...,...,...,...,...
706,4396587,24,Private,HSgrad,Absent,Sales,2378.63,Male,0.0,38,0,0,7.774280
1962,9844729,21,Private,College,Absent,Professional,2164.59,Female,0.0,40,0,0,7.679986
1665,8508526,79,Private,Bachelor,Married,Sales,1598.95,Male,0.0,40,0,0,7.377102
27,1141887,32,Consultant,HSgrad,Married,Sales,1428.27,Male,0.0,60,28235,1,7.264219


In [14]:
# Group by and Summarize Operations
# What is the average income by Gender and Marital Status?

df.groupby(["Gender", "Marital"]).agg({"Income": np.mean}).reset_index()

Unnamed: 0,Gender,Marital,Income
0,Female,Absent,107767.416431
1,Female,Divorced,132759.393439
2,Female,Married,198840.887766
3,Female,Married-spouse-absent,114691.639
4,Female,Unmarried,128970.392368
5,Female,Widowed,92494.5312
6,Male,Absent,112824.620155
7,Male,Divorced,96208.459083
8,Male,Married,35741.630109
9,Male,Married-spouse-absent,76552.35


In [15]:
#What are the minimum, maximum of the average income by Gender and Marital Status?
df.groupby(["Gender", "Marital"]).agg({"Income": [np.min, np.max, np.mean]}).reset_index()

Unnamed: 0_level_0,Gender,Marital,Income,Income,Income
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amin,amax,mean
0,Female,Absent,2164.59,342939.73,107767.416431
1,Female,Divorced,9034.01,345263.49,132759.393439
2,Female,Married,21551.74,481259.5,198840.887766
3,Female,Married-spouse-absent,51958.85,245512.73,114691.639
4,Female,Unmarried,23381.42,267843.59,128970.392368
5,Female,Widowed,5226.03,242869.91,92494.5312
6,Male,Absent,609.72,335039.49,112824.620155
7,Male,Divorced,2382.43,275897.1,96208.459083
8,Male,Married,1428.27,192398.74,35741.630109
9,Male,Married-spouse-absent,21917.9,218597.45,76552.35


In [16]:
# finds mean of all numeric variables for each category of Gender
df.groupby(["Gender"])["Age", "Income", "Deductions",
                       "Hours", "LogIncome"].agg(np.mean).reset_index()


  df.groupby(["Gender"])["Age", "Income", "Deductions", "Hours", "LogIncome"].agg(np.mean).reset_index()


Unnamed: 0,Gender,Age,Income,Deductions,Hours,LogIncome
0,Female,37.30538,127697.72932,29.625,35.802215,11.546
1,Male,39.230263,64818.680673,85.09771,42.048246,10.772202


In [17]:
# Contingency Table between Gender and Marital Status - hint: pd.crosstab()
pd.crosstab(df["Gender"], df["Marital"])


Marital,Absent,Divorced,Married,Married-spouse-absent,Unmarried,Widowed
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,283,157,94,10,38,50
Male,386,109,823,12,29,9
