In [74]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [75]:

FILE_NAME = "all_new_data.csv" #CSV file that contains the data
NUMBER_OF_ROWS = 296

## Processing Random Weather Data and Tornado Data

In [76]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', NUMBER_OF_ROWS)
pd.set_option('display.min_rows', 20)

In [77]:
df = pd.read_csv(FILE_NAME)

In [78]:
df.columns

Index(['Unnamed: 0', 'datetime', 'temperature', 'wind_speed',
       'surface_solar_radiation', 'relative_humidity', 'surface_pressure',
       'total_precipitation', 'latitude', 'longitude', 'event_id', 'outcome',
       'city'],
      dtype='object')

In [79]:
#df = data.drop(columns=['Unnamed: 0'])
# Get a seto of the event Id's
event_ids = set(df['event_id'].to_numpy())

In [80]:
## Produces a list of dfs
# Each df has the rolling means of the fundamental features
labels = []
list_of_dfs = []
#Use IDs to loop over events 
for i in event_ids:
    # We initialize a df per event
    event_df = pd.DataFrame()
    # Get data for that event 
    event = df.loc[df['event_id'] == i]
    # Exract the label 
    label = event['outcome'].iloc[0]
    # Record the label of the df: 0 or 1
    labels.append(label)
    # Save the dates
#     temp = event['datetime']
    # We only want the features we are processing
    fundamental_features = event.drop(columns=[ 'latitude', 'longitude', 'event_id','outcome', 'Unnamed: 0'], axis = 1)
    # Compute the rolling mean
    event_df = fundamental_features.expanding().mean()  
    # Put the date
#     event_df['Date'] = temp 
    # We want 3 weeks of the last 3 weeks of data
    event_df = event_df.tail(21)
    # We want the last 14 days of this
    event_df = event_df.head(14)
    # Add to list
    list_of_dfs.append(event_df)

In [81]:
list_of_dfs[0]

Unnamed: 0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation
16908,13.474054,3.458378,193.351351,0.75,100303.218649,0.097838
16909,13.443421,3.545,192.947368,0.751053,100246.238158,0.114474
16910,13.447692,3.614359,193.128205,0.749231,100230.75,0.114103
16911,13.47275,3.68625,191.15,0.75225,100209.90675,0.11475
16912,13.509512,3.7,191.634146,0.752439,100197.407317,0.112927
16913,13.568095,3.755238,189.119048,0.754524,100194.999524,0.112143
16914,13.616279,3.82093,190.27907,0.753953,100189.02,0.11
16915,13.639091,3.859091,189.840909,0.753182,100192.401364,0.107955
16916,13.679333,3.823333,189.066667,0.753778,100202.954,0.106222
16917,13.782609,3.796957,190.630435,0.752391,100206.658043,0.10413


In [82]:
len(list_of_dfs)

494

In [83]:
len(list_of_dfs[0])

14

In [84]:
def generateRollingAvgFeatures(df):
    ''' Turns a whole data frame into a line of rolling average features
    '''
    all_lists = []
    #Iterate over ever column
    for column in df.columns[:-1]:
        # Processed list
        # Turn the column of data into a list
        processing_list = df[str(column)].tolist()
        #Loop over the list 
        # Initialize a list to store the 7 processed values 
        processed_list = []
        for i in range(len(processing_list)):
            # Gets the right spots for computing
            if (((i + 1) % 3) == 0):
                feature_point = (processing_list[i] + processing_list[i -1] + processing_list[i-2]) / 3
                processed_list.append(feature_point)
                
        # Here we normalize the list of siumilar data to set up a pattern
        avg = np.average(processed_list)
        std = np.std(processed_list)
        norm = [(float(i) - avg) / std for i in processed_list]
        all_lists += norm
    return all_lists 

In [85]:
# Loop over each Df
processing_data = list_of_dfs.copy()
flag = 0
df = pd.DataFrame()
list_of_lists = []
for df in processing_data:
    if (1):
        #label = getEventType()
        # This will return a row of features for each event 
        x = df
        # TODO:Should this return a list that then becomes a row in the dataframe?
        current = generateRollingAvgFeatures(df)
        list_of_lists.append(current)
        flag +=1

In [86]:
current

[1.1530232174239068,
 0.641874136782381,
 -0.3283674893351349,
 -1.4665298648711642,
 -1.549873229198926,
 -0.02605012979116434,
 0.3663889411419507,
 1.2095344178481395,
 1.499778190295632,
 0.25183684117913646,
 -0.599071200724095,
 -1.152543830750685,
 -1.667212800281316,
 0.1313044882256524,
 0.6592034355036444,
 0.876704876552038,
 0.829240730109786,
 -1.5705354880495443,
 -0.16368357335718117,
 0.9049783312969394]

In [87]:
len(current)

20

In [88]:
len(list_of_lists)

494

In [89]:
nums = range(4)
cols = [f'{v}_{i}' for v in df.columns[:-1] for i in nums]

In [90]:
processed_data = pd.DataFrame(list_of_lists, columns=cols)
# Replace NAN's by 0
processed_data.fillna(0)

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3
0,-1.076066,-0.661297,0.199521,1.537842,-1.596692,-0.086413,0.957058,0.726047,1.521186,-0.476660,-1.198011,0.153484,-1.324223,0.756978,1.153259,-0.586014,1.683832,-0.624051,-0.856771,-0.203009
1,0.939129,-0.497244,-1.398473,0.956587,-1.602543,0.160400,0.293146,1.148997,-1.405262,-0.292440,0.349540,1.348162,1.590765,-0.042888,-0.406741,-1.141136,-0.724939,1.703510,-0.272817,-0.705754
2,-1.112887,-0.863835,0.812928,1.163795,-0.771625,-0.731295,1.684310,-0.181391,-0.925666,-0.620768,-0.110541,1.656976,-0.996492,0.911028,1.085176,-0.999713,-0.265811,0.057275,-1.292930,1.501465
3,-1.691682,0.416287,0.362380,0.913015,-1.499096,-0.318268,0.906362,0.911002,-1.104986,-0.666424,0.266861,1.504550,0.648743,1.054297,-0.138281,-1.564759,-1.012213,-0.298143,-0.351803,1.662159
4,1.235848,0.633330,-0.531690,-1.337489,-1.116156,-0.840759,0.721006,1.235909,1.009420,0.631141,-0.033828,-1.606732,0.708987,1.209636,-1.270408,-0.648215,1.054008,0.875013,-0.601978,-1.327042
5,-1.260756,-0.625114,0.595463,1.290407,-1.155826,-0.808791,1.182384,0.782233,-1.199623,-0.595534,0.352167,1.442990,0.705484,0.811097,0.162145,-1.678726,-0.004002,-0.223275,-1.287130,1.514407
6,1.531201,0.250959,-0.844212,-0.937949,-0.741676,1.389647,0.484984,-1.132955,-1.496036,-0.211382,0.493461,1.213957,1.239936,-0.773545,-1.170059,0.703668,1.123620,0.759341,-0.500965,-1.381996
7,-0.305406,-0.874564,-0.515840,1.695811,1.726882,-0.481264,-0.695195,-0.550424,1.024580,0.896696,-0.573002,-1.348274,-0.738983,-0.955892,0.104513,1.590362,-1.562138,-0.182797,0.916693,0.828242
8,-0.874920,-0.766832,0.015019,1.626733,-0.569896,1.314869,0.540981,-1.285954,-0.793940,-0.922302,0.134838,1.581404,0.983536,0.870988,-0.400759,-1.453764,-0.402239,1.099559,0.743596,-1.440916
9,1.135168,0.706697,-0.413123,-1.428742,-1.226381,-0.444814,0.164149,1.507045,0.344067,-1.711763,0.593879,0.773816,0.517655,1.210312,-0.241792,-1.486175,1.149726,0.728479,-0.501141,-1.377064


In [91]:
len(processed_data)

494

In [92]:
ones = 0 
zeros = 0
for i in range(len(labels)):
    if (labels[i] == 0 ):
        zeros+=1
    else:
        ones +=1
        
print(ones)
print(zeros)

296
198


In [93]:
processed_data['label'] = labels

In [94]:
processed_data_shuffled = processed_data.sample(frac=1)

In [95]:
processed_data_shuffled

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3,label
147,0.382884,1.085619,0.159219,-1.627722,-1.708291,0.588560,0.792718,0.327013,1.520144,0.174281,-0.513109,-1.181317,0.978078,0.878767,-0.405357,-1.451488,1.677487,-0.484580,-0.942651,-0.250256,0.0
24,1.140236,0.671661,-0.354928,-1.456969,1.552852,0.188164,-0.733251,-1.007764,1.168485,0.613050,-0.311165,-1.470370,-1.076039,-0.742648,0.345073,1.473614,1.439829,-0.114489,0.056887,-1.382226,0.0
407,-1.399254,-0.401566,0.540301,1.260519,-0.803739,-0.086797,-0.769216,1.659752,-1.636336,0.050435,0.616432,0.969469,0.987681,-1.502841,-0.305120,0.820279,1.018533,0.620610,-0.034065,-1.605077,1.0
455,0.087096,-1.285616,-0.301123,1.499643,-0.900402,-1.045467,0.654289,1.291580,-0.182778,-1.386299,0.146666,1.422412,-0.260784,1.533900,-0.016586,-1.256530,1.488479,0.293903,-0.657091,-1.125291,1.0
61,-1.425247,-0.381502,0.594379,1.212370,1.171764,0.809113,-1.062448,-0.918428,1.188920,0.789654,-0.936660,-1.041914,-1.455562,-0.389375,0.804506,1.040431,1.378124,0.475860,-0.647971,-1.206013,0.0
71,1.405989,0.428922,-0.638374,-1.196537,-1.709195,0.809386,0.546651,0.353158,1.731676,-0.584391,-0.602380,-0.544905,-0.406557,-0.638163,-0.678002,1.722722,-0.683030,-1.191365,0.514443,1.359952,0.0
12,1.384071,0.477028,-0.680603,-1.180496,-0.688027,-1.114753,0.327397,1.475383,1.036020,0.637843,-0.088964,-1.584899,1.333857,0.560060,-0.708528,-1.185389,-1.284861,-0.664492,0.913065,1.036289,0.0
70,-1.283163,-0.604428,0.622348,1.265244,-0.019617,-1.611663,1.004877,0.626403,-1.432371,-0.256565,0.366853,1.322084,1.722987,-0.650701,-0.664605,-0.407681,0.718674,0.857175,0.080150,-1.656000,0.0
423,-1.056589,-0.732046,1.505922,0.282713,-1.080730,-0.891617,0.771740,1.200607,-1.060628,0.197161,-0.678088,1.541555,0.364808,-0.062771,1.230531,-1.532569,0.698166,0.073467,-1.652596,0.880962,1.0
301,-1.607867,0.015868,0.524630,1.067369,-0.582709,-0.830934,-0.285854,1.699496,-1.253675,-0.536115,0.375661,1.414128,0.182490,1.265940,0.086685,-1.535115,1.398870,0.330738,-0.396827,-1.332780,1.0


## Test Model

### Logistic Regression, no normalization

In [96]:
y = processed_data_shuffled['label']
X = processed_data_shuffled.drop(columns=[ 'label'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [97]:
logisticRegr = LogisticRegression(max_iter=10000)
logisticRegr.fit(x_train, y_train)
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.52


In [98]:
print("regression coefficients: \n", logisticRegr.coef_)

regression coefficients: 
 [[-0.07453049 -0.0943301   0.1109738   0.05788679 -0.13626148  0.18380671
   0.00730941 -0.05485464 -0.49682241  0.11195257  0.63847884 -0.25360901
  -0.27146254  0.23928575  0.16921255 -0.13703577  0.24660715  0.05076118
  -0.43060135  0.13323302]]


### Naive Bayes, no normalization

In [99]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
predictions = gnb.predict(x_test)
score = gnb.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.68


## Helper Function: Computing Features from Ratios

In [57]:
# Takes input two column names of a dataframe and returns the ratio between then returns the ratio of them as a new figure in the dataframe inputed

def create_ratio_columns(numerator, denominator, df):
    ''' numerator, denominator are column names 
        df is the df where the columns are 
    ''' 
    # filter based on names
    f1 = df.filter(like=numerator)
    f2 = df.filter(like=denominator)
    # Init list for each column of ratio
    ratios = list()
    # Iterating through one set of column to find appropriate data
    for i, c in enumerate(f1.columns):
        col = f'{numerator}/{denominator}:{i}'
        r = f1.iloc[:, i].div(f2.iloc[:, i])
        r.name = col
        # ASS 
        ratios.append(r)  
    ratios = pd.concat(ratios, axis=1)
    
    return ratios
    

In [58]:
ratio_df = create_ratio_columns('temp', 'wind', processed_data)
ratio_df.head()

Unnamed: 0,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3
0,0.673935,7.652709,0.208473,2.118102
1,-0.586024,-3.100023,-4.770567,0.832541
2,1.442265,1.181241,0.482647,-6.415955
3,1.128468,-1.307975,0.399818,1.002209
4,-1.107237,-0.753284,-0.737427,-1.082191


In [59]:
new = pd.concat([processed_data, ratio_df], axis=1)

In [60]:
new

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3,label,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3
0,-1.076066,-0.661297,0.199521,1.537842,-1.596692,-0.086413,0.957058,0.726047,1.521186,-0.476660,-1.198011,0.153484,-1.324223,0.756978,1.153259,-0.586014,1.683832,-0.624051,-0.856771,-0.203009,0.0,0.673935,7.652709,0.208473,2.118102
1,0.939129,-0.497244,-1.398473,0.956587,-1.602543,0.160400,0.293146,1.148997,-1.405262,-0.292440,0.349540,1.348162,1.590765,-0.042888,-0.406741,-1.141136,-0.724939,1.703510,-0.272817,-0.705754,0.0,-0.586024,-3.100023,-4.770567,0.832541
2,-1.112887,-0.863835,0.812928,1.163795,-0.771625,-0.731295,1.684310,-0.181391,-0.925666,-0.620768,-0.110541,1.656976,-0.996492,0.911028,1.085176,-0.999713,-0.265811,0.057275,-1.292930,1.501465,0.0,1.442265,1.181241,0.482647,-6.415955
3,-1.691682,0.416287,0.362380,0.913015,-1.499096,-0.318268,0.906362,0.911002,-1.104986,-0.666424,0.266861,1.504550,0.648743,1.054297,-0.138281,-1.564759,-1.012213,-0.298143,-0.351803,1.662159,1.0,1.128468,-1.307975,0.399818,1.002209
4,1.235848,0.633330,-0.531690,-1.337489,-1.116156,-0.840759,0.721006,1.235909,1.009420,0.631141,-0.033828,-1.606732,0.708987,1.209636,-1.270408,-0.648215,1.054008,0.875013,-0.601978,-1.327042,0.0,-1.107237,-0.753284,-0.737427,-1.082191
5,-1.260756,-0.625114,0.595463,1.290407,-1.155826,-0.808791,1.182384,0.782233,-1.199623,-0.595534,0.352167,1.442990,0.705484,0.811097,0.162145,-1.678726,-0.004002,-0.223275,-1.287130,1.514407,0.0,1.090783,0.772899,0.503612,1.649645
6,1.531201,0.250959,-0.844212,-0.937949,-0.741676,1.389647,0.484984,-1.132955,-1.496036,-0.211382,0.493461,1.213957,1.239936,-0.773545,-1.170059,0.703668,1.123620,0.759341,-0.500965,-1.381996,0.0,-2.064514,0.180592,-1.740701,0.827878
7,-0.305406,-0.874564,-0.515840,1.695811,1.726882,-0.481264,-0.695195,-0.550424,1.024580,0.896696,-0.573002,-1.348274,-0.738983,-0.955892,0.104513,1.590362,-1.562138,-0.182797,0.916693,0.828242,0.0,-0.176854,1.817223,0.742008,-3.080919
8,-0.874920,-0.766832,0.015019,1.626733,-0.569896,1.314869,0.540981,-1.285954,-0.793940,-0.922302,0.134838,1.581404,0.983536,0.870988,-0.400759,-1.453764,-0.402239,1.099559,0.743596,-1.440916,0.0,1.535228,-0.583200,0.027762,-1.265001
9,1.135168,0.706697,-0.413123,-1.428742,-1.226381,-0.444814,0.164149,1.507045,0.344067,-1.711763,0.593879,0.773816,0.517655,1.210312,-0.241792,-1.486175,1.149726,0.728479,-0.501141,-1.377064,0.0,-0.925624,-1.588749,-2.516749,-0.948042


In [61]:
## Plot data
## Take any 2 columns and plot how they relate to eachother in a line 
def create_line(df_column1, df_column2):
    ''' We want to be able to visualize the behavior to explore the features more 
    '''
    
    # Find how windspeed and humidity grow together with time. 

In [62]:
## Preparing Model

 ## Find 10 Non Tornadoes 

In [63]:
reduced_dataset = processed_data_shuffled[processed_data_shuffled['label']== 0]

In [64]:
y = reduced_dataset['label']
X = reduced_dataset.drop(columns=[ 'label'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [65]:
predictions = logisticRegr.predict(x_test)

In [66]:
predictions

array([0., 0., 0., 1., 0., 0., 1., 1., 0., 1.])

## Drawing Board

In [67]:
dfb = pd.DataFrame()

In [68]:
type(dfb)

pandas.core.frame.DataFrame

In [69]:
dfb['b'] = [0, 1, 2, 3, 4]

In [70]:
dfb['c'] = [0, 1, 2, 3, 4]

In [71]:
dfb

Unnamed: 0,b,c
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [45]:
dfb.expanding().mean()

Unnamed: 0,b,c
0,0.0,0.0
1,0.5,0.5
2,1.0,1.0
3,1.5,1.5
4,2.0,2.0


In [76]:
x = [0, 1, 2, 3, 4]

In [78]:
(np.average(x))

2.0