In [25]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [26]:
from IPython.display import display, HTML

def pretty_print(df):
    """Show ``df`` as an HTML table in the notebook output.

    The frame is converted with ``to_html()``; every literal backslash-n
    sequence in the resulting markup is replaced by a ``<br>`` tag so that
    multi-line cell text is rendered on separate lines.

    Returns whatever ``display`` returns.
    """
    markup = df.to_html()
    markup_with_breaks = markup.replace("\\n", "<br>")
    rendered = HTML(markup_with_breaks)
    return display(rendered)
def tbl_report(tbl, cols=None, card=10):
    """Print the shape of ``tbl`` and display a per-column summary table.

    The summary has one row per column of ``tbl`` and these report columns:

    * ``dtype``        - the column's dtype
    * ``nulls``        - number of null entries
    * ``uniques``      - array of distinct values (as returned by ``unique()``)
    * ``num_uniques``  - number of distinct values
    * ``value_counts`` - for columns with fewer than ``card`` distinct values,
      a newline-separated ``value:count`` listing (nulls included);
      otherwise the marker ``'HC'`` (high cardinality)

    Parameters
    ----------
    tbl : pandas.DataFrame
        Table to summarise.
    cols : list-like of str, optional
        Report columns to display. ``None`` or an empty selection shows all
        of them.
    card : int, default 10
        Cardinality threshold below which value counts are listed.

    Returns
    -------
    The return value of ``pretty_print`` applied to the (possibly
    column-restricted) report.
    """
    print("Table Shape", tbl.shape)
    dtypes = tbl.dtypes
    nulls = []
    uniques = []
    numuniques = []
    vcs = []
    for col in dtypes.index:
        column = tbl[col]
        nulls.append(column.isnull().sum())
        uniqs = column.unique()
        uniquenums = uniqs.shape[0]
        if uniquenums < card:  # low cardinality
            # Series.value_counts replaces the deprecated top-level
            # pd.value_counts; dropna=False keeps nulls in the listing.
            valcounts = column.value_counts(dropna=False)
            vc = "\n".join("{}:{}".format(k, v) for k, v in valcounts.items())
        else:
            vc = 'HC'  # high cardinality
        uniques.append(uniqs)
        numuniques.append(uniquenums)
        vcs.append(vc)
    nullseries = pd.Series(nulls, index=dtypes.index)
    uniqueseries = pd.Series(uniques, index=dtypes.index)
    numuniqueseries = pd.Series(numuniques, index=dtypes.index)
    vcseries = pd.Series(vcs, index=dtypes.index)
    df = pd.concat([dtypes, nullseries, uniqueseries, numuniqueseries, vcseries], axis=1)
    df.columns = ['dtype', 'nulls', 'uniques', 'num_uniques', 'value_counts']
    # Explicit None/length test: a bare truthiness check raises for
    # array-like or Index selections.
    if cols is not None and len(cols) > 0:
        return pretty_print(df[cols])
    return pretty_print(df)


In [44]:
# Read the CSV archive at Data/training.zip into `train_data`
# (path is relative to the notebook's working directory).
train_data = pd.read_csv('Data/training.zip')
# Preview the first rows as the cell output.
train_data.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [41]:
# Read the CSV archive at Data/test.zip into `test_data`
# (path is relative to the notebook's working directory).
test_data = pd.read_csv('Data/test.zip')
# Preview the first rows as the cell output.
test_data.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,350000,-999.0,79.589,23.916,3.036,-999.0,-999.0,-999.0,0.903,3.036,...,2.022,98.556,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0
1,350001,106.398,67.49,87.949,49.994,-999.0,-999.0,-999.0,2.048,2.679,...,-1.138,176.251,1,47.575,-0.553,-0.849,-999.0,-999.0,-999.0,47.575
2,350002,117.794,56.226,96.358,4.137,-999.0,-999.0,-999.0,2.755,4.137,...,-1.868,111.505,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
3,350003,135.861,30.604,97.288,9.104,-999.0,-999.0,-999.0,2.811,9.104,...,1.172,164.707,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,350004,74.159,82.772,58.731,89.646,1.347,536.663,-0.339,1.028,77.213,...,-0.231,869.614,3,254.085,-1.013,-0.334,185.857,0.335,2.587,599.213


In [42]:
# (rows, columns) of the training table.
train_data.shape

(250000, 33)

In [35]:
# Summarise every training column; the 'uniques' report column is left out
# of the display by listing only the other four.
tbl_report(train_data, cols=['dtype', 'nulls', 'num_uniques', 'value_counts'])

Table Shape (250000, 33)


Unnamed: 0,dtype,nulls,num_uniques,value_counts
EventId,int64,0,250000,HC
DER_mass_MMC,float64,0,108338,HC
DER_mass_transverse_met_lep,float64,0,101637,HC
DER_mass_vis,float64,0,100558,HC
DER_pt_h,float64,0,115563,HC
DER_deltaeta_jet_jet,float64,0,7087,HC
DER_mass_jet_jet,float64,0,68366,HC
DER_prodeta_jet_jet,float64,0,16593,HC
DER_deltar_tau_lep,float64,0,4692,HC
DER_pt_tot,float64,0,59042,HC


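So there are no nulls (NaN values) in the columns reported above. Note, however, that many columns contain the value -999.0 (visible in the previews above), which appears to be a placeholder for missing or undefined measurements — so the absence of NaNs does not by itself mean the data is complete.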

In [43]:
# Feature matrix: every training column except 'Weight' and 'Label',
# indexed by 'EventId'. Built as one chained expression (no in-place edit).
X = train_data.drop(columns=['Weight', 'Label']).set_index('EventId')
# Target column. NOTE(review): Y keeps train_data's original index, whereas
# X is indexed by EventId - confirm before aligning the two by index.
Y = train_data['Label']

In [45]:
# Encode the target as integers: 1 where Label is 'b', 0 otherwise.
is_label_b = train_data['Label'] == 'b'
label = np.where(is_label_b, 1, 0)
# Percentage of training rows whose Label is 'b'.
label.mean() * 100

65.7332