In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [55]:
twisterdf=pd.read_csv('1950-2017_actual_tornadoes.csv')
twisterdf.head()

Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,len,wid,ns,sn,sg,f1,f2,f3,f4,fc
0,1,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,...,9.5,150,2,0,1,0,0,0,0,0
1,2,1950,1,3,1950-01-03,11:55:00,3,IL,17,2,...,3.6,130,1,1,1,135,0,0,0,0
2,3,1950,1,3,1950-01-03,16:00:00,3,OH,39,1,...,0.1,10,1,1,1,161,0,0,0,0
3,4,1950,1,13,1950-01-13,05:25:00,3,AR,5,1,...,0.6,17,1,1,1,113,0,0,0,0
4,5,1950,1,25,1950-01-25,19:30:00,3,MO,29,2,...,2.3,300,1,1,1,93,0,0,0,0


In [56]:
twisterdf.isnull().sum()

om       0
yr       0
mo       0
dy       0
date     0
time     0
tz       0
st       0
stf      0
stn      0
mag      0
inj      0
fat      0
loss     0
closs    0
slat     0
slon     0
elat     0
elon     0
len      0
wid      0
ns       0
sn       0
sg       0
f1       0
f2       0
f3       0
f4       0
fc       0
dtype: int64

In [57]:
modeldf = pd.get_dummies(data=twisterdf, columns=['st'])

In [58]:
modeldf.head()

Unnamed: 0,om,yr,mo,dy,date,time,tz,stf,stn,mag,...,st_SD,st_TN,st_TX,st_UT,st_VA,st_VT,st_WA,st_WI,st_WV,st_WY
0,1,1950,1,3,1950-01-03,11:00:00,3,29,1,3,...,0,0,0,0,0,0,0,0,0,0
1,2,1950,1,3,1950-01-03,11:55:00,3,17,2,3,...,0,0,0,0,0,0,0,0,0,0
2,3,1950,1,3,1950-01-03,16:00:00,3,39,1,1,...,0,0,0,0,0,0,0,0,0,0
3,4,1950,1,13,1950-01-13,05:25:00,3,5,1,3,...,0,0,0,0,0,0,0,0,0,0
4,5,1950,1,25,1950-01-25,19:30:00,3,29,2,2,...,0,0,0,0,0,0,0,0,0,0


In [59]:
modeldf = modeldf.drop(['date','time'], axis=1)

In [60]:
modeldf.dtypes

om         int64
yr         int64
mo         int64
dy         int64
tz         int64
stf        int64
stn        int64
mag        int64
inj        int64
fat        int64
loss     float64
closs    float64
slat     float64
slon     float64
elat     float64
elon     float64
len      float64
wid        int64
ns         int64
sn         int64
sg         int64
f1         int64
f2         int64
f3         int64
f4         int64
fc         int64
st_AK      uint8
st_AL      uint8
st_AR      uint8
st_AZ      uint8
          ...   
st_MI      uint8
st_MN      uint8
st_MO      uint8
st_MS      uint8
st_MT      uint8
st_NC      uint8
st_ND      uint8
st_NE      uint8
st_NH      uint8
st_NJ      uint8
st_NM      uint8
st_NV      uint8
st_NY      uint8
st_OH      uint8
st_OK      uint8
st_OR      uint8
st_PA      uint8
st_PR      uint8
st_RI      uint8
st_SC      uint8
st_SD      uint8
st_TN      uint8
st_TX      uint8
st_UT      uint8
st_VA      uint8
st_VT      uint8
st_WA      uint8
st_WI      uin

In [61]:
modeldf.head()

Unnamed: 0,om,yr,mo,dy,tz,stf,stn,mag,inj,fat,...,st_SD,st_TN,st_TX,st_UT,st_VA,st_VT,st_WA,st_WI,st_WV,st_WY
0,1,1950,1,3,3,29,1,3,3,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1950,1,3,3,17,2,3,3,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1950,1,3,3,39,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1950,1,13,3,5,1,3,1,1,...,0,0,0,0,0,0,0,0,0,0
4,5,1950,1,25,3,29,2,2,5,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
modeldf=modeldf.loc[modeldf['mag'] >= 0] 

In [63]:
y = modeldf['mag']

In [64]:
X = modeldf.drop(['mag'], axis=1)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

In [66]:
tree = tree.DecisionTreeClassifier()

In [67]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [68]:
tree.score(X_train, y_train)

1.0

In [69]:
y_predict = tree.predict(X_test)

In [70]:
tree.score(X_test, y_test)

0.6250396447827465

In [72]:
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Predicted 3', 'Predicted 4', 'Predicted 5'],
    index=['True 0', 'True 1', 'True 2', 'True 3', 'True 4', 'True 5']
)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
True 0,4551,1098,163,23,0,1
True 1,1062,2399,730,111,9,0
True 2,180,723,753,184,20,1
True 3,12,98,174,133,54,1
True 4,0,3,21,38,41,13
True 5,0,0,0,3,7,6


In [73]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      5836
           1       0.56      0.56      0.56      4311
           2       0.41      0.40      0.41      1861
           3       0.27      0.28      0.28       472
           4       0.31      0.35      0.33       116
           5       0.27      0.38      0.32        16

   micro avg       0.63      0.63      0.63     12612
   macro avg       0.43      0.46      0.44     12612
weighted avg       0.63      0.63      0.63     12612



In [74]:
modeldf.columns

Index(['om', 'yr', 'mo', 'dy', 'tz', 'stf', 'stn', 'mag', 'inj', 'fat', 'loss',
       'closs', 'slat', 'slon', 'elat', 'elon', 'len', 'wid', 'ns', 'sn', 'sg',
       'f1', 'f2', 'f3', 'f4', 'fc', 'st_AK', 'st_AL', 'st_AR', 'st_AZ',
       'st_CA', 'st_CO', 'st_CT', 'st_DC', 'st_DE', 'st_FL', 'st_GA', 'st_HI',
       'st_IA', 'st_ID', 'st_IL', 'st_IN', 'st_KS', 'st_KY', 'st_LA', 'st_MA',
       'st_MD', 'st_ME', 'st_MI', 'st_MN', 'st_MO', 'st_MS', 'st_MT', 'st_NC',
       'st_ND', 'st_NE', 'st_NH', 'st_NJ', 'st_NM', 'st_NV', 'st_NY', 'st_OH',
       'st_OK', 'st_OR', 'st_PA', 'st_PR', 'st_RI', 'st_SC', 'st_SD', 'st_TN',
       'st_TX', 'st_UT', 'st_VA', 'st_VT', 'st_WA', 'st_WI', 'st_WV', 'st_WY'],
      dtype='object')

In [75]:
modeldf = modeldf.drop(['om','f1','f2','f3','f4','fc','ns','sn','sg','tz'], axis=1)

In [76]:
y = modeldf['mag']

In [77]:
X = modeldf.drop(['mag'], axis=1)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

In [79]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [80]:
tree.score(X_train, y_train)

1.0

In [81]:
y_predict = tree.predict(X_test)

In [82]:
tree.score(X_test, y_test)

0.6230574056454171

In [83]:
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Predicted 3', 'Predicted 4', 'Predicted 5'],
    index=['True 0', 'True 1', 'True 2', 'True 3', 'True 4', 'True 5']
)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
True 0,4568,1078,165,25,0,0
True 1,1085,2366,743,113,4,0
True 2,187,710,739,194,30,1
True 3,14,98,169,142,48,1
True 4,2,6,21,37,38,12
True 5,0,0,0,3,8,5


In [84]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      5836
           1       0.56      0.55      0.55      4311
           2       0.40      0.40      0.40      1861
           3       0.28      0.30      0.29       472
           4       0.30      0.33      0.31       116
           5       0.26      0.31      0.29        16

   micro avg       0.62      0.62      0.62     12612
   macro avg       0.43      0.44      0.44     12612
weighted avg       0.62      0.62      0.62     12612



In [85]:
#okay, turns out magnitudes are pretty hard to guess, unknown "-9" values to be eliminated