In [181]:
import pandas as pd
import numpy as np

In [6]:
def load_data(filename):
    """Read a CSV file into a DataFrame with a fixed dtype for each known column.

    The column -> dtype mapping is assembled from per-dtype column lists and
    handed to ``pd.read_csv`` together with the C parsing engine.

    Parameters
    ----------
    filename
        Forwarded unchanged as the first argument of ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``filename``.
    """
    columns_by_dtype = {
        'str': [
            'MachineIdentifier',
            'ProductName',
            'EngineVersion',
            'AppVersion',
            'AvSigVersion',
            'Platform',
            'Processor',
            'OsVer',
            'OsPlatformSubRelease',
            'OsBuildLab',
            'SkuEdition',
            'PuaMode',
            'SmartScreen',
            'Census_MDC2FormFactor',
            'Census_DeviceFamily',
            'Census_ProcessorClass',
            'Census_PrimaryDiskTypeName',
            'Census_ChassisTypeName',
            'Census_PowerPlatformRoleName',
            'Census_InternalBatteryType',
            'Census_OSVersion',
            'Census_OSArchitecture',
            'Census_OSBranch',
            'Census_OSEdition',
            'Census_OSSkuName',
            'Census_OSInstallTypeName',
            'Census_OSWUAutoUpdateOptionsName',
            'Census_GenuineStateName',
            'Census_ActivationChannel',
            'Census_FlightRing',
        ],
        'int8': [
            'IsBeta',
            'IsSxsPassiveMode',
            'HasTpm',
            'AutoSampleOptIn',
            'Census_HasOpticalDiskDrive',
            'Census_IsPortableOperatingSystem',
            'Census_IsSecureBootEnabled',
            'Census_IsTouchEnabled',
            'Census_IsPenCapable',
            'HasDetections',
        ],
        'int16': [
            'CountryIdentifier',
            'LocaleEnglishNameIdentifier',
            'OsBuild',
            'OsSuite',
            'Census_OSBuildNumber',
            'Census_OSUILocaleIdentifier',
        ],
        'int32': [
            'Census_OSBuildRevision',
        ],
        'float16': [
            'AVProductsInstalled',
            'AVProductsEnabled',
            'OrganizationIdentifier',
            'GeoNameIdentifier',
            'IsProtected',
            'SMode',
            'IeVerIdentifier',
            'Firewall',
            'Census_ProcessorCoreCount',
            'Census_ProcessorManufacturerIdentifier',
            'Census_OSInstallLanguageIdentifier',
            'Census_IsFlightingInternal',
            'Census_IsFlightsDisabled',
            'Census_ThresholdOptIn',
            'Census_FirmwareManufacturerIdentifier',
            'Census_IsWIMBootEnabled',
            'Census_IsVirtualDevice',
            'Census_IsAlwaysOnAlwaysConnectedCapable',
            'Wdft_IsGamer',
            'Wdft_RegionIdentifier',
        ],
        'float32': [
            'DefaultBrowsersIdentifier',
            'AVProductStatesIdentifier',
            'CityIdentifier',
            'Census_OEMNameIdentifier',
            'Census_OEMModelIdentifier',
            'Census_ProcessorModelIdentifier',
            'Census_TotalPhysicalRAM',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches',
            'Census_InternalPrimaryDisplayResolutionHorizontal',
            'Census_InternalPrimaryDisplayResolutionVertical',
            'Census_FirmwareVersionIdentifier',
        ],
        'float64': [
            'RtpStateBitfield',
            'UacLuaenable',
            'Census_PrimaryDiskTotalCapacity',
            'Census_SystemVolumeTotalCapacity',
            'Census_InternalBatteryNumberOfCharges',
        ],
    }

    # Flatten the grouped lists into the {column: dtype} dict read_csv expects.
    dtypes = {}
    for dtype_name, column_names in columns_by_dtype.items():
        for column_name in column_names:
            dtypes[column_name] = dtype_name

    return pd.read_csv(filename, dtype=dtypes, engine='c')

In [362]:
def analyze(df):
    """Print a per-column data-quality report for ``df``.

    For every column a header line is printed with its name, dtype, non-null
    count, number of distinct non-null values and whether it has missing
    values. Object/string columns additionally get these checks:

    * values that differ only by letter case,
    * values containing ASCII control characters (0x00-0x1f) or an HTML
      hexadecimal escape such as ``&#x1F;``,
    * more than one capitalisation of "unknown",
    * a listing of up to 50 distinct values.

    The frame passed in is not modified; the categorical conversion used for
    the value listing is done on a temporary copy of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Control characters, or an HTML hex escape (hex digits, not only 0-9).
    illegal_pattern = r'[\x00-\x1f]|&#x[0-9A-Fa-f]+;'

    for col in df.columns:
        column = df[col]
        has_missing = column.hasnans
        # Report the column's own dtype; describe() returns a summary Series
        # whose dtype is float64 for every numeric column.
        print("-- Name: {:}  Type: {:} Count: {:} \t Unique: {:} \t Missing: {:}".format(
            str(col).ljust(50), str(column.dtype).ljust(10),
            column.count(), column.nunique(), has_missing))
        if has_missing:
            print("---- has missing values")

        #
        # the remaining checks only apply to string columns
        #
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue

        lowered = column.str.lower()
        if lowered.nunique(dropna=False) != column.nunique(dropna=False):
            print("---- has strings that differ only in case")

        #
        # look for HTML escape characters (&#x..;) and control characters;
        # na=False so that missing values are not reported as illegal
        #
        illegal_count = int(column.str.contains(illegal_pattern, regex=True, na=False).sum())
        if illegal_count > 0:
            print("---- illegal characters:", illegal_count)

        #
        # find different capitalizations of "unknown"
        # if more than one present, need to read as string, turn to lowercase, then make categorical
        #
        unknowns = column[lowered == 'unknown'].unique()
        if len(unknowns) > 1:
            print("----", unknowns)

        # Convert a temporary copy so the caller's frame keeps its dtypes and
        # a second analyze() call on the same frame gives the same report.
        categories = column.astype('category').cat.categories
        # NOTE: this lists the first 50 categories in category order (sorted
        # where the values are sortable), not the 50 most frequent values.
        print("---- top 50 values:")
        print(np.asarray(categories)[:50])
            

            

In [363]:
# Load data/mini_initial.csv with load_data's fixed dtype map (bound to the
# global `df`) and print the per-column report produced by analyze().
df = load_data("data/mini_initial.csv")
analyze(df)

-- Name: MachineIdentifier                                   Type: object     Count: 66911 	 Unique: 66911 	 Missing: False
---- top 50 values:
['0002b528f6cc0790138f08473045a66d' '00050f9acf73efc5b8453420fc9fdea1'
 '00057be2a0e1d230d3d2ea362f32a91a' '00057e7a01af082a9a7875dbdeecc3f9'
 '00061a248425b6c1a175be850577bc0e' '00066248c83d8287601ff5e8bff2a572'
 '000757f5602ee68e3419d3bfc4be808a' '00077f9e70a3040fc6e63e5d41370f1b'
 '00098e45b04535033a9b2611fcb5f2bc' '00099c7336e50e5569d037f52d26ff9f'
 '000b1d01ab90abe326bf26f8808b7a18' '000b4d6ceb069af6c06d07998727ff7e'
 '000bc62fbf3df38d9d5f21baeb3d403a' '000c42edf4127b5ac5bab84db4100554'
 '000d40a9577fa13cd8a8c7818d23d0c4' '000daf61fe3213b783a0658c151b7375'
 '00105f3d1faeb9c8df51171150a38131' '001082959e16f15139321a36272fffeb'
 '001115035b2c0f3e9402cf2179051592' '001165099cfa34005432f88c0b9c76a3'
 '001243199b2f0ec2a4d61c0ca6d93a54' '0013e9f371d7739b116a8b10bcd63697'
 '0013f1cb818e31a624a1f1e0dc5e2a08' '0014a7de215eb60b9ddc755bb5b8218b'
 '00

In [None]:
# Same report for data/train.csv. This rebinds the global `df`, so any frame
# previously held under that name is no longer reachable through it.
df = load_data("data/train.csv")
analyze(df)


In [None]:
# Same report for data/test.csv; `df` is rebound once more.
# NOTE(review): load_data declares an int8 dtype for 'HasDetections' —
# confirm that test.csv is read correctly if it lacks that column.
df = load_data("data/test.csv")
analyze(df)