# Import Libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read in Raw Data

In [4]:
raw_df = pd.read_csv('raw_df.csv', index_col=[0])

In [5]:
raw_df.columns

Index(['name', 'datetime', 'tempmax', 'tempmin', 'temp', 'feelslikemax',
       'feelslikemin', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'preciptype', 'snow', 'snowdepth', 'windgust',
       'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
       'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'sunrise',
       'sunset', 'moonphase', 'conditions', 'description', 'icon', 'stations'],
      dtype='object')

# Create Target Variable

In [6]:
#creates boolean column that shows if there was any precipitation at all
raw_df['did_it_rain_today'] = raw_df['precip'].apply(lambda x: 1 if x>0 else 0)
#shifts the did it rain column down to show that it will rain tomorrow
raw_df['target'] = raw_df.groupby('name')['did_it_rain_today'].shift()
#remomving values where we dont have the previous day
df = raw_df[raw_df['target'].notnull()]

In [7]:
df.drop('did_it_rain_today',inplace=True,axis=1)

In [8]:
df['datetime'] = pd.to_datetime(df['datetime'])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11964 entries, 1 to 997
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   name              11964 non-null  object        
 1   datetime          11964 non-null  datetime64[ns]
 2   tempmax           11964 non-null  float64       
 3   tempmin           11964 non-null  float64       
 4   temp              11964 non-null  float64       
 5   feelslikemax      11964 non-null  float64       
 6   feelslikemin      11964 non-null  float64       
 7   feelslike         11964 non-null  float64       
 8   dew               11964 non-null  float64       
 9   humidity          11964 non-null  float64       
 10  precip            11964 non-null  float64       
 11  precipprob        888 non-null    float64       
 12  precipcover       11076 non-null  float64       
 13  preciptype        441 non-null    object        
 14  snow              11964 

We will split the data into a train, validation, and test set based on the time. 

In [10]:
training_data = df[(df['datetime'] <= '2021-05-28')]
valid_data = df[(df['datetime'] > '2021-05-28') & (df['datetime'] <= '2021-10-26')]
test_data = df[(df['datetime'] > '2021-10-26')]

In [11]:
training_data.to_csv('training_data.csv')
valid_data.to_csv('valid_data.csv')
test_data.to_csv('test_data.csv')