In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the partially preprocessed training split from disk.
train_df = pd.read_csv(
    'dataset/train_split_partially_preprocessed.csv',
)

In [None]:
# Load the partially preprocessed test split from disk.
test_df = pd.read_csv(
    'dataset/test_split_partially_preprocessed.csv',
)

In [None]:
# Both splits must expose the same columns before they are concatenated.
# Comparing only the column count would let differently named columns slip
# through, and pd.concat would then silently add NaN-filled columns.
assert len(train_df.columns) == len(test_df.columns), (
    f'train has {len(train_df.columns)} columns, test has {len(test_df.columns)}'
)
assert set(train_df.columns) == set(test_df.columns), (
    'train and test splits disagree on these columns: '
    f'{sorted(set(train_df.columns) ^ set(test_df.columns), key=str)}'
)

In [None]:
# Stack the train and test rows into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of both splits are
# kept and the combined frame ends up with duplicate index labels.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Number of columns of each dtype in the combined frame.
(
    full_df
    .dtypes
    .value_counts()
)

In [None]:
# Mean of the INCWAGE_CPIU_2010 column over the combined frame.
full_df['INCWAGE_CPIU_2010'].mean()

In [None]:
def printColsValues(df, dfName, cols):
    """Print a short summary of each listed column of ``df``.

    Args:
        df: DataFrame to summarise.
        dfName: Label printed in the header line.
        cols: Iterable of column names in ``df`` to report on.

    For ``float64``/``int64`` columns the output of ``describe()`` is printed.
    For ``bool`` columns the number of True and False entries is printed.
    For ``object`` columns the number of entries equal to True, equal to
    False, and null is printed.

    Raises:
        Exception: if a column has any other dtype.
    """
    print(f"Values for {dfName} ({len(df)} entries)\n")
    for col in cols:
        series = df[col]
        print(f'Information for column {col}:')
        if(series.dtype == 'float64' or series.dtype == 'int64'):
            print(series.describe())
        elif(series.dtype == 'object'):
            # Count by value. value_counts() is sorted by frequency, so
            # indexing it with 0/1 does not reliably select False/True and
            # fails when only one distinct value is present.
            trueCount = int(series.eq(True).sum())
            falseCount = int(series.eq(False).sum())
            print(f'True: {trueCount}, False: {falseCount}, Null: {series.isnull().values.sum()}')
        elif(series.dtype == 'bool'):
            # A bool column has no nulls, so False is simply the remainder.
            trueCount = int(series.values.sum())
            falseCount = len(series) - trueCount
            print(f'True: {trueCount}, False: {falseCount}')
        else:
            raise Exception(f'Unknown type {series.dtype}')
        print()

In [None]:
# Display the column labels of the combined frame.
full_df.columns

In [None]:
# Columns reported on by the summary cells below, one name per line.
# The grouping comments are only a reading aid; order matches the original list.
cols = [
    # age, wage, sex
    'AGE',
    'INCWAGE_CPIU_2010',
    'isFemale',
    # race
    'isAmericanIndian',
    'isAsian',
    'isBlack',
    'isPacificIslander',
    'isWhite',
    'isOtherRace',
    # health insurance
    'hasHealthInsurance',
    'hasPrivateHealthInsurance',
    'hasPublicHealthInsurance',
    # schooling status, ethnicity
    'isInSchool',
    'isHispanic',
    # birthplace and language
    'bornInUS',
    'speaksEnglish',
    'speaksOnlyEnglish',
    'speaksEnglishWell',
    # marital status
    'isMarried',
    'wasMarried',
    'neverMarried',
    'sameSexMarriage',
    'mixedRaceMarriage',
    # class of worker
    'isSelfEmployed',
    'isPrivateSector',
    'isPublicSector',
    'isUnpaidFamilyWorker',
    # educational attainment
    'noSchooling',
    'maxGrade4',
    'maxGrade8',
    'maxSomeHS',
    'highSchoolDiploma',
    'someCollege',
    'associatesDegree',
    'bachelorsDegree',
    'mastersDegree',
    'bachelorsPlusProfessionalDegree',
    'doctoralDegree',
    'has2ndDegree',
]

In [None]:
# Report every listed column that contains at least one missing value.
for name in cols:
    hasMissing = full_df[name].isna().values.any()
    if not hasMissing:
        continue
    print(f'Column {name} has NaN values')

In [None]:
# Column summary over the combined train + test frame.
printColsValues(df=full_df, dfName='Full DF', cols=cols)

In [None]:
# Column summary over the training split only.
printColsValues(df=train_df, dfName='Train split', cols=cols)

In [None]:
# Column summary over the test split only.
printColsValues(df=test_df, dfName='Test split', cols=cols)

In [None]:
# Rows whose INCWAGE_CPIU_2010 is at most 25,000.
df_0to25 = full_df.loc[full_df['INCWAGE_CPIU_2010'].le(25000)]
printColsValues(df=df_0to25, dfName='Under 25k salary', cols=cols)

In [None]:
# Rows whose INCWAGE_CPIU_2010 is above 25,000 and at most 50,000.
df_25to50 = full_df.loc[
    full_df['INCWAGE_CPIU_2010'].gt(25000)
    & full_df['INCWAGE_CPIU_2010'].le(50000)
]
printColsValues(df=df_25to50, dfName='25k to 50k salary', cols=cols)

In [None]:
# Rows whose INCWAGE_CPIU_2010 is above 50,000 and at most 75,000.
df_50to75 = full_df.loc[
    full_df['INCWAGE_CPIU_2010'].gt(50000)
    & full_df['INCWAGE_CPIU_2010'].le(75000)
]
printColsValues(df=df_50to75, dfName='50k to 75k salary', cols=cols)

In [None]:
# Rows whose INCWAGE_CPIU_2010 is above 75,000.
df_over75 = full_df.loc[full_df['INCWAGE_CPIU_2010'].gt(75000)]
printColsValues(df=df_over75, dfName='Over 75k salary', cols=cols)

In [None]:
# For every boolean column, describe the wage distribution separately for the
# rows where the flag is True and the rows where it is False.
boolCols = [name for name in cols if full_df[name].dtype == 'bool']
for name in boolCols:
    flag = full_df[name]
    assert(not flag.isna().values.any())

    wageWhenTrue = full_df.loc[flag, 'INCWAGE_CPIU_2010']
    wageWhenFalse = full_df.loc[~flag, 'INCWAGE_CPIU_2010']

    print(f'Of the people for whom {name} is true, salary information is:')
    print(wageWhenTrue.describe())
    print(f'Of the people for whom {name} is false, salary information is:')
    print(wageWhenFalse.describe())
    print("\n")

In [None]:
# The four salary bands must account for every row of the combined frame.
bands = (df_0to25, df_25to50, df_50to75, df_over75)
assert(sum(len(band) for band in bands) == len(full_df))

# For every boolean column, count how many True / False rows fall in each band.
boolCols = [name for name in cols if full_df[name].dtype == 'bool']
for name in boolCols:
    assert(not full_df[name].isna().values.any())

    # Number of True entries per band, in band order.
    trueLow, trueLowMid, trueUpMid, trueUp = (band[name].values.sum() for band in bands)

    trueTotal = full_df[name].values.sum()
    assert(trueTotal == trueLow + trueLowMid + trueUpMid + trueUp)

    # The False counts are whatever remains of each band.
    falseTotal = len(full_df) - trueTotal
    falseLow = len(df_0to25) - trueLow
    falseLowMid = len(df_25to50) - trueLowMid
    falseUpMid = len(df_50to75) - trueUpMid
    falseUp = len(df_over75) - trueUp

    print(f'Of the people for whom {name} is true ({trueTotal} people), salary information is:')
    print(f"Number making <=25k: {trueLow}, number making 25-50k: {trueLowMid}, number making 50-75k: {trueUpMid}, number making >75k: {trueUp}")
    print(f'Of the people for whom {name} is false ({falseTotal}), salary information is:')
    print(f"Number making <=25k: {falseLow}, number making 25-50k: {falseLowMid}, number making 50-75k: {falseUpMid}, number making >75k: {falseUp}")
    print("\n")