# 6c - Raw data without NaNs

### Import

In [32]:
import pandas as pd
import warnings 
# Silence every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides pandas deprecation warnings raised by later cells —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
from pathlib import Path

### Set the directory paths

In [33]:
# The data folder is a sibling of the current working directory: <cwd>/../data.
data_dir = Path.cwd().parent.joinpath("data")

### Read raw data

In [34]:
# Load the raw weather CSV from the data directory and display its row count.
df = pd.read_csv(data_dir.joinpath('weather_dataset_raw.csv'))
df.shape[0]

96453

### Cast `Timestamp` into `datetime`

In [35]:
# Parse the `Timestamp` column into timezone-aware (UTC) datetimes.
df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'], utc=True))

In [36]:
# Order the rows chronologically.
# A stable sort keeps rows that share a timestamp in their original relative
# order, so the de-duplication that follows (`keep="last"`) selects the same row
# on every run; the default quicksort gives no ordering guarantee for ties.
# Assigning the result (instead of `inplace=True`) keeps the cell re-runnable.
df = df.sort_values(by='Timestamp', kind='stable')
len(df)

96453

In [37]:
# Keep only the final row for each timestamp; earlier repeats are discarded.
df = df[~df.duplicated(subset=["Timestamp"], keep="last")]
len(df)  # 24 duplicates removed

96429

In [38]:
# Promote `Timestamp` from a column to the frame's index.
df = df.set_index('Timestamp')

In [39]:
# Record the earliest and latest timestamps present in the index and report them.
df_min_timestamp, df_max_timestamp = df.index.min(), df.index.max()
print(f'Minimum index of "df": {df_min_timestamp} \nMaximum index of "df": {df_max_timestamp}')

Minimum index of "df": 2005-12-31 23:00:00+00:00 
Maximum index of "df": 2016-12-31 22:00:00+00:00


In [40]:
# Build the complete hourly grid between the first and last observed timestamps.
# The lowercase alias 'h' is used because the uppercase 'H' alias is deprecated
# in recent pandas releases; with warnings silenced at the top of this notebook
# that deprecation would otherwise go unnoticed.
regular_timestamp_range = pd.date_range(start=df_min_timestamp, end=df_max_timestamp, freq='h')
print(f"Length of the dataframe `df`: {len(df)}\nLength of the datetime index `regular_timestamp_range`: {len(regular_timestamp_range)}")

# The gap between the grid length and the frame length is reported as the
# number of timestamps missing from the data.
diff = len(regular_timestamp_range) - len(df)
if diff == 0:
    print("\nNo timestamp was missing in the dataframe.")
else:
    print(f"\n{diff} timestamp(s) were missing in the dataframe.")

Length of the dataframe `df`: 96429
Length of the datetime index `regular_timestamp_range`: 96432

3 timestamp(s) were missing in the dataframe.


In [41]:
# Align the frame to the full hourly grid; grid timestamps absent from the
# data become rows filled with NaN.
df = df.reindex(index=regular_timestamp_range)
len(df)

96432

In [42]:
# Copy the datetime index back into a regular `Timestamp` column (appended last).
df = df.assign(Timestamp=df.index)

In [43]:
# Replace the datetime index with a fresh 0..n-1 integer index (the old index
# is dropped, not kept as a column), then preview the first row.
df = df.reset_index(drop=True)
df.iloc[:1]

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
0,2880.0,"Port of Turku, Finland",0.577778,-4.05,0.89,17.1143,140.0,9.982,1016.66,rain,2005-12-31 23:00:00+00:00


### Remove `NaN`s

In [44]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

35

In [45]:
# Back-fill only the very first row so that the forward-fill that follows has
# a non-missing starting value in every column.
df.iloc[:1] = df.bfill().iloc[:1]

In [46]:
# Propagate the last valid observation forward into every remaining gap.
df = df.ffill()

In [47]:
# Re-count missing cells after filling.
df.isna().to_numpy().sum()

0

### Split data into `dev` and `prod`

In [48]:
# Development window: timestamps from the start of 2006 up to, but not
# including, the start of 2011.
dev_period = (df["Timestamp"] < "2011") & (df["Timestamp"] >= "2006")
dev_df = df.loc[dev_period]
dev_df.shape[0]

43824

In [49]:
# Preview the first development row.
dev_df.iloc[:1]

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
1,2881.0,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139.0,9.9015,1016.15,rain,2006-01-01 00:00:00+00:00


In [50]:
# Production window: every timestamp from the start of 2011 onward.
prod_period = df["Timestamp"] >= "2011"
prod_df = df.loc[prod_period]
prod_df.shape[0]

52607

In [51]:
# Preview the first production row.
prod_df.iloc[:1]

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
43825,46729.0,"Port of Turku, Finland",-7.1,-7.1,0.96,3.8962,195.0,3.9123,1025.25,snow,2011-01-01 00:00:00+00:00


### Store `dev` and `prod` data as `csv` files

In [52]:
# Persist the development split; the integer row index is not written.
dev_df.to_csv(data_dir.joinpath('weather_dataset_raw_development.csv'), index=False)

In [53]:
# Persist the production split; the integer row index is not written.
prod_df.to_csv(data_dir.joinpath('weather_dataset_raw_production.csv'), index=False)