In [1]:
import os

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display, display_html , HTML
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the three splits. Each frame is tagged with a `.name` attribute so the
# loops below can label their output.
# NOTE(review): `.name` is a plain Python attribute set on the DataFrame object;
# presumably none of the CSVs has a column literally called "name" (pandas would
# then route the assignment to that column) -- confirm against the data.
train = pd.read_csv('train.csv')
train.name = 'train'
verify = pd.read_csv('val.csv')
verify.name = 'verify'
test = pd.read_csv('test.csv')
test.name = 'test'

datasets = [train, verify, test]

# Use the full option names: partial names are resolved by regex matching and
# can stop resolving to a single option in other pandas versions.
# The limits are raised to the largest row/column count among the three frames,
# so displaying a whole frame will not be truncated.
pd.set_option('display.max_rows', max(frame.shape[0] for frame in datasets))
pd.set_option('display.max_columns', max(frame.shape[1] for frame in datasets))

for df in datasets:
    print(f"The dataset {df.name} has {df.shape[0]} rows and {df.shape[1]} columns")

# Preview the first rows of every split under its own heading.
for df in datasets:
    display(HTML(f'<h1>{df.name}</h1>'))
    display(df.head())

In [4]:
# Visualizing the dtypes of each dataset, one table per dataset, side by side

def _dtype_table(frame, caption):
    """Return a captioned, inline-displayable Styler listing `frame`'s columns and dtypes.

    Parameters
    ----------
    frame : pd.DataFrame
        Dataset whose column names and dtypes are listed.
    caption : str
        Text shown as the table caption.

    Returns
    -------
    pandas Styler wrapping a two-column table ('features', 'dtype').
    """
    listing = pd.DataFrame({
        'features': frame.columns.values,
        'dtype': frame.dtypes.values,
    })
    return (
        listing.style
        .set_table_styles([{
            'selector': 'caption',
            'props': [
                ('color', '#585858'),
                ('font-size', '30px')
            ]
        }])
        # display:inline lets the three tables sit next to each other.
        .set_table_attributes("style='display:inline'")
        .set_caption(caption)
    )


# Both names are bound to the same Styler object for each dataset.
train_dtypes = train_dtypes_styler = _dtype_table(train, 'train')
verify_dtypes = verify_dtypes_styler = _dtype_table(verify, 'verify')
test_dtypes = test_dtypes_styler = _dtype_table(test, 'test')

# Non-breaking spaces used as a horizontal gap between the inline tables.
space = "\xa0" * 50
display_html(
    space.join(
        styler._repr_html_()
        for styler in (train_dtypes_styler, verify_dtypes_styler, test_dtypes_styler)
    ),
    raw=True,
)

In [5]:
display(HTML('<h1><center>Missing values of the different tables (%)</center></h1>'))


def _missing_rate_table(frame):
    """Build a ('features', 'missing_rate') table with the % of NaN per column of `frame`."""
    pct_missing = round(frame.isna().sum() / frame.shape[0] * 100, 2)
    # Stack names and rates as a 2 x n object array, then flip it to n x 2.
    stacked = np.array((frame.columns, pct_missing), dtype=object)
    return pd.DataFrame(stacked.T, columns=['features', 'missing_rate'])


# a -> train, b -> verify, c -> test
a, b, c = (_missing_rate_table(frame) for frame in (train, verify, test))

def highlight_greaterthan(x):
    """Pick a background colour for a table row from its missing-value rate.

    Intended for ``Styler.apply(..., axis=1)``: the same CSS declaration is
    returned once per cell of the row, so the whole row gets one colour.

    Parameters
    ----------
    x : pd.Series
        One row of the table; must expose a ``missing_rate`` field (in %).

    Returns
    -------
    list of str
        ``len(x)`` identical ``'background-color: ...'`` strings.
        Thresholds are strict: > 80 red, > 40 orange, > 5 yellow, otherwise
        blue (this includes rates for which every comparison is False, e.g. NaN).
    """
    rate = x.missing_rate
    if rate > 80:
        colour = '#FFCECE'
    elif rate > 40:
        colour = '#FFE9CE'
    elif rate > 5:
        colour = '#FFFECE'
    else:
        colour = '#CEFFFC'
    # One entry per cell of the row instead of a hard-coded 2, so the function
    # also works on tables with a different number of columns.
    return ['background-color: ' + colour] * len(x)
    
def _style_missing_table(table, caption):
    """Return `table` as a row-coloured, captioned Styler that displays inline.

    Rows are coloured by `highlight_greaterthan`; `caption` is the table title.
    """
    return (
        table.style
        .apply(highlight_greaterthan, axis=1)
        .set_table_styles([{
            'selector': 'caption',
            'props': [
                ('color', '#585858'),
                ('font-size', '30px')
            ]
        }])
        # display:inline lets the three tables sit next to each other.
        .set_table_attributes("style='display:inline'")
        .set_caption(caption)
    )


# From here on a, b and c hold Stylers (no longer DataFrames); each *_styler
# name is bound to the same object as its short name.
a = a_styler = _style_missing_table(a, 'train')
b = b_styler = _style_missing_table(b, 'verify')
c = c_styler = _style_missing_table(c, 'test')

# Non-breaking spaces used as a horizontal gap between the inline tables.
space = "\xa0" * 50
display_html(
    space.join(styler._repr_html_() for styler in (a_styler, b_styler, c_styler)),
    raw=True,
)

display(HTML('<h3><i>The values highlighted are the ones above a certain threshold of missing values</i></h3>'))
display(HTML('<h3><i>We will get rid of those for the rest of the notebook</i></h3>'))

In [6]:
# Analyzing training data

# Text columns: number of distinct values followed by the values themselves.
for col in train.select_dtypes("object"):
    print('\n')
    # f-string instead of passing `{...}` as an argument, which built a set
    # literal and printed the count wrapped in braces.
    print(f'Number of values in "{col}": {train[col].nunique()}')
    print(train[col].unique())
    print('\n')
    print('------------------------------------------------')

# Row count per value of the 'dep' column.
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 30))
plt.title('Number of accidents in 2019 per Department in training set')
sns.countplot(y=train['dep'])
plt.xlabel("Number of accidents")
plt.ylabel("Department")
plt.show()

# Distribution of every float64/int64 column, five panels per row.
sns.set(font_scale = 1.5)
numeric_cols = list(train.select_dtypes(include=['float64','int64']))
n_panel_cols = 5
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_panel_rows = max(1, -(-len(numeric_cols) // n_panel_cols))
# The grid is sized from the data (6 units of height per row, i.e. 60 for the
# former fixed 10 rows) so it neither overflows nor leaves rows of empty axes.
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_panel_rows, n_panel_cols,
                       figsize=(30, 6 * n_panel_rows), squeeze=False)
for i, col in enumerate(numeric_cols):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the notebook still runs on the seaborn version it was written for.
    sns.distplot(train[col], label=col, ax=ax[i // n_panel_cols][i % n_panel_cols])
# Hide the panels of the last row that received no column.
for unused_ax in ax.flat[len(numeric_cols):]:
    unused_ax.set_visible(False)
plt.show()