In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 10, 4

Populating the interactive namespace from numpy and matplotlib


## Train dataset

In [2]:
train = pd.read_csv('../data/train.csv')
print(train.columns)
print('Unique locations: %d' % len(train['location'].unique()))
print('train dataset size: %d' % len(train))
train.head()

Index(['id', 'location', 'fault_severity'], dtype='object')
Unique locations: 929
train dataset size: 7381


Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


In [6]:
event_type = pd.read_csv('../data/event_type.csv')
print('Unique types: %d' % len(event_type['event_type'].unique()))
print(event_type['event_type'].unique())
print('event_type dataset size: %d' % len(event_type))
sparse_events = pd.get_dummies(event_type).groupby('id').sum().reset_index()
sparse_events.head()

Unique types: 53
['event_type 11' 'event_type 15' 'event_type 20' 'event_type 7'
 'event_type 34' 'event_type 35' 'event_type 9' 'event_type 2'
 'event_type 54' 'event_type 1' 'event_type 6' 'event_type 30'
 'event_type 29' 'event_type 13' 'event_type 22' 'event_type 23'
 'event_type 18' 'event_type 14' 'event_type 32' 'event_type 3'
 'event_type 5' 'event_type 28' 'event_type 27' 'event_type 10'
 'event_type 21' 'event_type 31' 'event_type 38' 'event_type 24'
 'event_type 40' 'event_type 8' 'event_type 36' 'event_type 12'
 'event_type 39' 'event_type 19' 'event_type 33' 'event_type 37'
 'event_type 25' 'event_type 17' 'event_type 41' 'event_type 50'
 'event_type 46' 'event_type 47' 'event_type 49' 'event_type 53'
 'event_type 44' 'event_type 42' 'event_type 45' 'event_type 26'
 'event_type 4' 'event_type 43' 'event_type 52' 'event_type 51'
 'event_type 48']
event_type dataset size: 31170


Unnamed: 0,id,event_type_event_type 1,event_type_event_type 10,event_type_event_type 11,event_type_event_type 12,event_type_event_type 13,event_type_event_type 14,event_type_event_type 15,event_type_event_type 17,event_type_event_type 18,...,event_type_event_type 5,event_type_event_type 50,event_type_event_type 51,event_type_event_type 52,event_type_event_type 53,event_type_event_type 54,event_type_event_type 6,event_type_event_type 7,event_type_event_type 8,event_type_event_type 9
0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
resource_type = pd.read_csv('../data/resource_type.csv')
print('resource_type dataset size: %d' % len(resource_type))
print('Unique types: %d' % len(resource_type['resource_type'].unique()))
print(resource_type['resource_type'].unique())
sparse_resource = pd.get_dummies(resource_type).groupby('id').sum().reset_index()
sparse_resource.head()

resource_type dataset size: 21076
Unique types: 10
['resource_type 8' 'resource_type 2' 'resource_type 1' 'resource_type 9'
 'resource_type 6' 'resource_type 7' 'resource_type 10' 'resource_type 4'
 'resource_type 3' 'resource_type 5']


Unnamed: 0,id,resource_type_resource_type 1,resource_type_resource_type 10,resource_type_resource_type 2,resource_type_resource_type 3,resource_type_resource_type 4,resource_type_resource_type 5,resource_type_resource_type 6,resource_type_resource_type 7,resource_type_resource_type 8,resource_type_resource_type 9
0,1,0,0,0,0,0,0,1,0,1,0
1,2,0,0,1,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,1,0
3,4,0,0,1,0,0,0,0,0,0,0
4,5,0,0,1,0,0,0,0,0,0,0


In [8]:
severity_type = pd.read_csv('../data/severity_type.csv')
print('severity_type dataset size: %d' % len(severity_type))
print('Unique types: %d' % len(severity_type['severity_type'].unique()))
print(severity_type['severity_type'].unique())
sparse_severity = pd.get_dummies(severity_type).groupby('id').sum().reset_index()
sparse_severity.head()

severity_type dataset size: 18552
Unique types: 5
['severity_type 2' 'severity_type 1' 'severity_type 4' 'severity_type 5'
 'severity_type 3']


Unnamed: 0,id,severity_type_severity_type 1,severity_type_severity_type 2,severity_type_severity_type 3,severity_type_severity_type 4,severity_type_severity_type 5
0,1,1,0,0,0,0
1,2,0,1,0,0,0
2,3,1,0,0,0,0
3,4,0,0,0,1,0
4,5,0,1,0,0,0


## Join all dataframes

In [12]:
df_train = train.merge(sparse_feature, on='id')
df_train = df_train.merge(sparse_events, on='id')
df_train = df_train.merge(sparse_resource, on='id')
df_train = df_train.merge(sparse_severity, on='id')
print('Train count: %d' % len(train))
print('DataFrame shape: %d, %d' % (len(df_train), len(df_train.columns)))
df_train.head()

Train count: 7381
DataFrame shape: 7381, 457


Unnamed: 0,id,location,fault_severity,feature 1,feature 10,feature 100,feature 101,feature 102,feature 103,feature 104,...,resource_type_resource_type 5,resource_type_resource_type 6,resource_type_resource_type 7,resource_type_resource_type 8,resource_type_resource_type 9,severity_type_severity_type 1,severity_type_severity_type 2,severity_type_severity_type 3,severity_type_severity_type 4,severity_type_severity_type 5
0,14121,location 118,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,9320,location 91,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,14394,location 152,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,8218,location 931,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,14804,location 120,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [11]:
df_test = test.merge(sparse_severity, on='id')
print('Test count: %d' % len(test))
print('DataFrame shape: %d, %d' % (len(df_test), len(df_test.columns)))
df_test.head()

Test count: 11171
DataFrame shape: 11171, 7


Unnamed: 0,id,location,severity_type_severity_type 1,severity_type_severity_type 2,severity_type_severity_type 3,severity_type_severity_type 4,severity_type_severity_type 5
0,11066,location 481,0,1,0,0,0
1,18000,location 962,1,0,0,0,0
2,16964,location 491,0,1,0,0,0
3,4795,location 532,0,0,0,0,1
4,3392,location 600,0,1,0,0,0


## Bayes models

In [15]:
df2 = np.ceil(df_train)
df2.groupby('fault_severity').sum()

AttributeError: 'int' object has no attribute 'ceil'