# Weather Data: Pre-processing

In [1]:
import pandas as pd
from datetime import datetime
from datetime import time

In [2]:
# reading all weather records
weather_22 = pd.read_csv("data/san jose weather 2022.csv")
weather_23 = pd.read_csv("data/san jose weather 2023.csv")
weather_24 = pd.read_csv("data/san jose weather 2024.csv")

In [3]:
weather_data = pd.concat([weather_22, weather_23, weather_24])

### Columns
1. name - city name
2. datetime - date
3. tempmax - predicted actual max temperature of the day          
4. tempmin - predicted actual min temperature of the day
5. temp - actual measured temperature         
6. feelslikemax - max temperature that humans feel
7. feelslikemin - min temperature that humans feel
8. feelslike - temperature that humans feel
9. dew - the temperature point at which the air can hold no more wate       
10. humidity - percentage describing humidity of the day    
11. precip - unknown metrics of measurement of deciamls            
12. precipprob - likelihood of precipitation           
13. precipcover - percent of area it covered     
14. preciptype - rain or snow or hail    
15. snow - checking if it snowed           
16. snowdepth - depth of snow
17. windgust - sudden increase or burst of wind 
18. windspeed - speed of the wind      
19. winddir - direction of the wind    
20. sealevelpressure - unknown units of sealevel pressure
21. cloudcover - percerage of cloudcover
22. visibility - visibility in miles   
23. solarradiation - measured in watts per square meter 
24. solarenergy - measured in kWh/m2/day
25. uvindex - UV Index predicts the ultraviolet radiation levels on a 1-11+ scale
26. severerisk - some kind of risk represented by NaN or 10
27. sunrise - time of sunrise
28. sunset - time of sunset 
29. moonphase - moonphase represented by decimals from 0 to 1
30. conditions - categorical condition about the sky
31. description - more detailed condition of the day
32. icon - icon probably for some weather app or forecast
33. stations - location of the station       

**note: note sure what 'precip' column is**

## 1. Handling Missing Data

In [4]:
weather_data.isna().sum()

name                  0
datetime              0
tempmax               0
tempmin               0
temp                  0
feelslikemax          0
feelslikemin          0
feelslike             0
dew                   0
humidity              0
precip                0
precipprob            0
precipcover           0
preciptype          602
snow                  0
snowdepth             0
windgust              0
windspeed             0
winddir               0
sealevelpressure      0
cloudcover            0
visibility            0
solarradiation        0
solarenergy           0
uvindex               0
severerisk            9
sunrise               0
sunset                0
moonphase             0
conditions            0
description           0
icon                  0
stations              0
dtype: int64

In [5]:
weather_data.preciptype.unique()

array([nan, 'rain'], dtype=object)

In [6]:
# It seems like all the days with no precipitation were filled in NaN values. 
# Let us fill it with string "no precip"

weather_data['preciptype'] = weather_data['preciptype'].fillna('no precip')

In [7]:
# checking what kind of risk measurement this is
weather_data.severerisk.unique()

array([nan, 10.])

In [8]:
# lets check where the severerisk data is missing
weather_data[pd.isna(weather_data.severerisk)]

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,san jose,2022-01-01,55.0,35.1,43.3,55.0,32.5,42.2,31.5,66.8,...,12.1,6,,2022-01-01T07:21:56,2022-01-01T17:00:46,0.97,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
1,san jose,2022-01-02,59.9,33.1,44.0,59.9,29.9,43.1,33.6,68.5,...,12.3,6,,2022-01-02T07:22:04,2022-01-02T17:01:35,0.0,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
2,san jose,2022-01-03,56.0,44.0,50.6,56.0,39.5,49.1,43.4,76.5,...,5.4,3,,2022-01-03T07:22:10,2022-01-03T17:02:24,0.03,Overcast,Cloudy skies throughout the day.,cloudy,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
3,san jose,2022-01-04,61.0,50.0,54.6,61.0,50.0,54.6,49.6,83.5,...,7.4,5,,2022-01-04T07:22:14,2022-01-04T17:03:16,0.07,"Rain, Partially cloudy",Partly cloudy throughout the day with early mo...,rain,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
4,san jose,2022-01-05,62.8,51.1,55.6,62.8,51.1,55.6,50.7,84.1,...,11.1,6,,2022-01-05T07:22:16,2022-01-05T17:04:08,0.1,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
5,san jose,2022-01-06,60.0,53.0,55.8,60.0,53.0,55.8,51.4,85.4,...,6.5,4,,2022-01-06T07:22:16,2022-01-06T17:05:01,0.13,Overcast,Cloudy skies throughout the day.,cloudy,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
6,san jose,2022-01-07,59.0,50.0,53.7,59.0,48.4,53.6,48.3,82.3,...,5.3,3,,2022-01-07T07:22:13,2022-01-07T17:05:56,0.17,Overcast,Cloudy skies throughout the day.,cloudy,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
7,san jose,2022-01-08,57.7,44.0,51.1,57.7,41.4,50.5,45.1,80.2,...,7.8,6,,2022-01-08T07:22:09,2022-01-08T17:06:52,0.2,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."
8,san jose,2022-01-09,61.9,38.9,48.6,61.9,35.5,48.2,41.0,76.6,...,12.5,6,,2022-01-09T07:22:03,2022-01-09T17:07:48,0.25,Partially cloudy,Becoming cloudy in the afternoon.,partly-cloudy-day,"72585093228,KSJC,KLVK,E6873,72492723285,724945..."


Seem like the first 9 days of 2022 are missing the values whereas the others are just simply filled with 10. This column maybe irrelavent information, therefore we are going to drop it the next few lines of code.

## 2. Fixing Datatype

In [9]:
weather_data.dtypes

name                 object
datetime             object
tempmax             float64
tempmin             float64
temp                float64
feelslikemax        float64
feelslikemin        float64
feelslike           float64
dew                 float64
humidity            float64
precip              float64
precipprob            int64
precipcover         float64
preciptype           object
snow                  int64
snowdepth             int64
windgust            float64
windspeed           float64
winddir             float64
sealevelpressure    float64
cloudcover          float64
visibility          float64
solarradiation      float64
solarenergy         float64
uvindex               int64
severerisk          float64
sunrise              object
sunset               object
moonphase           float64
conditions           object
description          object
icon                 object
stations             object
dtype: object

In [10]:
def fix_datatype(df):
    
    
    # datetime column into datetime object
    df['datetime'] = pd.to_datetime(df['datetime'])
    
    
    # get time for sunrise and sunset
    def extract_time(time_string):
        # we only need time since datetime column already has date
        time_string = time_string.split('T')[1]
        return datetime.strptime(time_string, "%H:%M:%S").time()
    
    df['sunrise'] = df['sunrise'].apply(lambda x: extract_time(x))
    df['sunset'] = df['sunset'].apply(lambda x: extract_time(x))
    
    
    # changing percentage to decimal
    df['humidity'] = df['humidity'] / 100
    df['precipcover'] = df['precipcover'] / 100
    df['cloudcover'] = df['cloudcover'] / 100
    
    
    # change into boolean
    df['precipprob'] = df['precipprob'].apply(lambda x: True if x == 100 else False)

In [11]:
# fixing all datatypes
fix_datatype(weather_data)

## 3. Dropping Columns 

In [12]:
# dropping columns that have only one value
# dropping column that doesn't quite help our prediction

useless_columns = ['name', 'dew', 'precipprob', 'solarradiation', 'solarenergy', 
                   'severerisk', 'sealevelpressure', 'icon', 'stations']

weather_data = weather_data.drop(columns=useless_columns)
weather_data.head()

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,humidity,precip,precipcover,...,windspeed,winddir,cloudcover,visibility,uvindex,sunrise,sunset,moonphase,conditions,description
0,2022-01-01,55.0,35.1,43.3,55.0,32.5,42.2,0.668,0.0,0.0,...,5.9,179.7,0.212,8.6,6,07:21:56,17:00:46,0.97,Partially cloudy,Partly cloudy throughout the day.
1,2022-01-02,59.9,33.1,44.0,59.9,29.9,43.1,0.685,0.0,0.0,...,9.1,157.0,0.254,9.9,6,07:22:04,17:01:35,0.0,Partially cloudy,Partly cloudy throughout the day.
2,2022-01-03,56.0,44.0,50.6,56.0,39.5,49.1,0.765,0.0,0.0,...,13.7,143.2,0.919,9.9,3,07:22:10,17:02:24,0.03,Overcast,Cloudy skies throughout the day.
3,2022-01-04,61.0,50.0,54.6,61.0,50.0,54.6,0.835,0.001,0.0417,...,10.2,189.1,0.833,9.9,5,07:22:14,17:03:16,0.07,"Rain, Partially cloudy",Partly cloudy throughout the day with early mo...
4,2022-01-05,62.8,51.1,55.6,62.8,51.1,55.6,0.841,0.0,0.0,...,11.2,327.2,0.861,9.8,6,07:22:16,17:04:08,0.1,Partially cloudy,Partly cloudy throughout the day.


## Final Cleaned Data

In [13]:
weather_data.head()

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,humidity,precip,precipcover,...,windspeed,winddir,cloudcover,visibility,uvindex,sunrise,sunset,moonphase,conditions,description
0,2022-01-01,55.0,35.1,43.3,55.0,32.5,42.2,0.668,0.0,0.0,...,5.9,179.7,0.212,8.6,6,07:21:56,17:00:46,0.97,Partially cloudy,Partly cloudy throughout the day.
1,2022-01-02,59.9,33.1,44.0,59.9,29.9,43.1,0.685,0.0,0.0,...,9.1,157.0,0.254,9.9,6,07:22:04,17:01:35,0.0,Partially cloudy,Partly cloudy throughout the day.
2,2022-01-03,56.0,44.0,50.6,56.0,39.5,49.1,0.765,0.0,0.0,...,13.7,143.2,0.919,9.9,3,07:22:10,17:02:24,0.03,Overcast,Cloudy skies throughout the day.
3,2022-01-04,61.0,50.0,54.6,61.0,50.0,54.6,0.835,0.001,0.0417,...,10.2,189.1,0.833,9.9,5,07:22:14,17:03:16,0.07,"Rain, Partially cloudy",Partly cloudy throughout the day with early mo...
4,2022-01-05,62.8,51.1,55.6,62.8,51.1,55.6,0.841,0.0,0.0,...,11.2,327.2,0.861,9.8,6,07:22:16,17:04:08,0.1,Partially cloudy,Partly cloudy throughout the day.


### Possible Ideas for Feature Engineering:
- Can binzarize all wind related features at a threshold of our choice
- Can find the range of temp (max - min)
- Can bucket some of these columns into broader categories
- Can tokenize the description column to have a smaller number of unique category

## Additional Columns Worth Noting:

In [14]:
weather_data.conditions.value_counts()

Partially cloudy          457
Clear                     180
Rain, Partially cloudy     98
Rain, Overcast             44
Overcast                   11
Rain                        2
Name: conditions, dtype: int64

In [15]:
weather_data.description.value_counts()

Partly cloudy throughout the day.                                             275
Clear conditions throughout the day.                                          179
Clearing in the afternoon.                                                    140
Becoming cloudy in the afternoon.                                              43
Partly cloudy throughout the day with rain.                                    25
Cloudy skies throughout the day with a chance of rain throughout the day.      15
Partly cloudy throughout the day with early morning rain.                      14
Cloudy skies throughout the day with rain.                                     14
Cloudy skies throughout the day.                                               11
Partly cloudy throughout the day with late afternoon rain.                     11
Partly cloudy throughout the day with afternoon rain.                          10
Partly cloudy throughout the day with rain clearing later.                      9
Partly cloudy th

In [16]:
weather_data

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,humidity,precip,precipcover,...,windspeed,winddir,cloudcover,visibility,uvindex,sunrise,sunset,moonphase,conditions,description
0,2022-01-01,55.0,35.1,43.3,55.0,32.5,42.2,0.668,0.000,0.0000,...,5.9,179.7,0.212,8.6,6,07:21:56,17:00:46,0.97,Partially cloudy,Partly cloudy throughout the day.
1,2022-01-02,59.9,33.1,44.0,59.9,29.9,43.1,0.685,0.000,0.0000,...,9.1,157.0,0.254,9.9,6,07:22:04,17:01:35,0.00,Partially cloudy,Partly cloudy throughout the day.
2,2022-01-03,56.0,44.0,50.6,56.0,39.5,49.1,0.765,0.000,0.0000,...,13.7,143.2,0.919,9.9,3,07:22:10,17:02:24,0.03,Overcast,Cloudy skies throughout the day.
3,2022-01-04,61.0,50.0,54.6,61.0,50.0,54.6,0.835,0.001,0.0417,...,10.2,189.1,0.833,9.9,5,07:22:14,17:03:16,0.07,"Rain, Partially cloudy",Partly cloudy throughout the day with early mo...
4,2022-01-05,62.8,51.1,55.6,62.8,51.1,55.6,0.841,0.000,0.0000,...,11.2,327.2,0.861,9.8,6,07:22:16,17:04:08,0.10,Partially cloudy,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2024-02-27,62.9,44.9,54.3,62.9,43.6,54.2,0.579,0.000,0.0000,...,15.8,324.4,0.231,9.9,8,06:41:52,17:59:18,0.61,Partially cloudy,Partly cloudy throughout the day.
58,2024-02-28,69.7,43.9,55.2,69.7,41.5,54.8,0.580,0.000,0.0000,...,13.8,328.6,0.011,9.9,8,06:40:31,18:00:18,0.65,Clear,Clear conditions throughout the day.
59,2024-02-29,62.8,49.8,55.8,62.8,48.3,55.7,0.741,0.189,0.2083,...,18.1,178.3,0.658,9.2,8,06:39:09,18:01:17,0.68,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.
60,2024-03-01,60.1,52.0,55.6,60.1,52.0,55.6,0.743,0.076,0.4583,...,14.9,178.8,0.950,9.3,6,06:37:46,18:02:16,0.71,"Rain, Overcast",Cloudy skies throughout the day with a chance ...
