In [2]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = (8.25, 6)

<h1>Thunderstorms in Texas with income</h1>

In [10]:
# load top 500 thunderstorms in texas from 1950 - 2018
data = pd.read_csv("texas_thunderstorm.csv", usecols=[1, 2, 3, 4, 6, 10, 11, 29, 30, 31, 32, 33])
data.info()

data = data.sample(frac=1)
data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 12 columns):
CZ_NAME_STR            353 non-null object
BEGIN_LOCATION         353 non-null object
BEGIN_DATE             353 non-null object
BEGIN_TIME             353 non-null int64
MAGNITUDE              353 non-null float64
DAMAGE_PROPERTY_NUM    353 non-null int64
DAMAGE_CROPS_NUM       353 non-null int64
END_LOCATION           353 non-null object
BEGIN_LAT              353 non-null float64
BEGIN_LON              353 non-null float64
END_LAT                353 non-null float64
END_LON                353 non-null float64
dtypes: float64(5), int64(3), object(4)
memory usage: 33.2+ KB


Unnamed: 0,CZ_NAME_STR,BEGIN_LOCATION,BEGIN_DATE,BEGIN_TIME,MAGNITUDE,DAMAGE_PROPERTY_NUM,DAMAGE_CROPS_NUM,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON
251,ELLIS CO.,MILFORD,5/2/07,1745,61.0,250000,0,MILFORD,32.1300,-96.9500,32.1300,-96.9500
181,HALE CO.,BOONE,6/12/15,1713,70.0,500000,2000000,PLAINVIEW,34.1881,-101.8391,34.1278,-101.5755
225,WISE CO.,BRIDGEPORT MUNI ARPT,6/21/11,37,70.0,300000,0,BRIDGEPORT MUNI ARPT,33.1722,-97.8653,33.1722,-97.8653
67,JOHNSON CO.,LILLIAN,4/23/03,2020,96.0,4000000,0,VENUS,32.4800,-97.2000,32.4300,-97.1200
287,SAN PATRICIO CO.,ST PAUL,5/23/17,1840,61.0,250000,700000,TAFT,28.1293,-97.5686,27.9800,-97.3308
342,LIVE OAK CO.,OAKVILLE,2/23/16,311,61.0,200000,0,OAKVILLE,28.4757,-98.1289,28.4308,-98.0842
320,CAMERON CO.,HARLINGEN AIRPARK AR,5/16/08,300,75.0,200000,0,MONTE GRANDE,26.2189,-97.6798,26.2228,-97.5061
77,SAN PATRICIO CO.,MATHIS,6/5/07,800,50.0,2500000,0,MATHIS,28.1000,-97.8300,28.1000,-97.8300
286,BAILEY CO.,MULESHOE WARREN ARPT,6/9/16,1750,52.0,250000,0,MULESHOE WARREN ARPT,34.3005,-102.7225,34.3005,-102.7225
130,WICHITA CO.,BURKBURNETT,5/10/06,410,65.0,750000,0,BURKBURNETT,34.1000,-98.5700,34.1000,-98.5700


In [8]:
# The outcomes
thunderstorm_outcomes = data.iloc[:, [5]]

list(thunderstorm_outcomes)

['DAMAGE_PROPERTY_NUM']

In [9]:
# The predictors
thunderstorm_predictors = data.drop(['DAMAGE_PROPERTY_NUM',
                                     'BEGIN_LOCATION',
                                     'END_LOCATION',
                                     'CZ_NAME_STR',
                                     'BEGIN_DATE'], axis=1)

# NOTE: for right now crops damage is included, but we may want to see both with and without including this

list(thunderstorm_predictors)

['BEGIN_TIME',
 'MAGNITUDE',
 'DAMAGE_CROPS_NUM',
 'BEGIN_LAT',
 'BEGIN_LON',
 'END_LAT',
 'END_LON']

In [4]:
texas_income_data = pd.read_csv("MACHINE_LEARNING_DATA/county_year/texas_median_county_income_all.csv")
texas_income_data.info()
texas_income_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3302 entries, 0 to 3301
Data columns (total 2 columns):
name        3302 non-null object
estimate    3301 non-null float64
dtypes: float64(1), object(1)
memory usage: 51.7+ KB


Unnamed: 0,name,estimate
0,ANDERSON CO.1990,27752.0
1,ANDREWS CO.1990,31106.0
2,ANGELINA CO.1990,31157.0
3,ARANSAS CO.1990,29087.0
4,ARCHER CO.1990,31425.0
5,ARMSTRONG CO.1990,29270.0
6,ATASCOSA CO.1990,25746.0
7,AUSTIN CO.1990,31527.0
8,BAILEY CO.1990,27861.0
9,BANDERA CO.1990,31641.0


In [18]:
# Add column to input features
# 1990, 2000, 2009 forward 
def parse_date(year):
    if int(year) < 95:
        return '1990'
    elif int(year) >= 95 or int(year) < 5:
        return '2000'
    elif int(year) >= 5 and int(year) < 9:
        return '2009'
    elif int(year) >= 9:
        return '20' + year
    
thunderstorm_predictors['MEAN_INCOME'] = 100
years = [date.split('/')[-1] for date in data['BEGIN_DATE']]
years = [parse_date(year) for year in years]
estimates = []
for i in range(len(thunderstorm_predictors)):
    keyword = data['CZ_NAME_STR'][i] + years[i]
    estimate = texas_income_data[texas_income_data['name'] == keyword].values[0][1]
    estimates.append(estimate)
thunderstorm_predictors['MEAN_INCOME'] = estimates
thunderstorm_predictors

Unnamed: 0,BEGIN_TIME,MAGNITUDE,DAMAGE_CROPS_NUM,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,MEAN_INCOME
112,2055,70.0,100000,27.8312,-97.7279,27.8258,-97.3890,29690.0
203,35,75.0,0,32.3311,-96.1436,32.3311,-96.1436,29690.0
206,2225,66.0,10000000,34.5411,-101.7375,34.5411,-101.7375,29690.0
54,1545,60.0,0,29.9300,-97.9300,29.8800,-97.8800,30532.0
0,1947,69.0,0,31.1344,-97.5247,31.1344,-97.5247,31840.0
150,2315,52.0,0,30.7300,-99.2200,30.7300,-99.2200,30413.0
164,1900,61.0,0,35.2000,-101.8200,35.2000,-101.8200,27110.0
102,1720,70.0,0,32.8800,-100.1300,32.8800,-100.1300,29629.0
290,1903,65.0,0,31.8500,-102.3700,31.8500,-102.3700,30356.0
213,100,87.0,0,30.9000,-100.1000,30.8300,-100.1000,25592.0
