# 6c - Raw data without NaNs

### Import

In [1]:
import joblib
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
from pathlib import Path

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import (
#     accuracy_score,
#     ConfusionMatrixDisplay,
#     f1_score,
#     precision_score,
#     recall_score,
# )
# from sklearn.svm import LinearSVC, SVC
# from sklearn.tree import DecisionTreeClassifier

# from weather.transformers.skl_transformer_makers import (
#     FeatureNames,
#     TargetChoice,
#     make_dataset_ingestion_transformer,
#     make_target_creation_transformer,
#     make_remove_horizonless_rows_transformer, 
#     make_predictors_feature_engineering_transformer,
# )
# from weather.data.prep_datasets import (
#     prepare_binary_classification_tabular_data, 
#     transform_dataset_and_create_target,
# )
# from weather.helpers.utils import camel_to_snake
# from weather.models.skl_train_models import (
#     score_evaluation,
#     confusion_matrix_evaluation,
#     confusion_matrix_display,
# )

### Set the directory paths

In [2]:
data_dir =  Path.cwd().parent / "data"

### Read raw data

In [3]:
df = pd.read_csv(data_dir / 'weather_dataset_raw.csv')
len(df)

96453

In [4]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], utc=True)

In [5]:
df.sort_values(by='Timestamp', inplace=True)
len(df)

96453

In [6]:
df = df.drop_duplicates(subset=["Timestamp"], keep="last")
len(df)

96429

In [7]:
df.set_index('Timestamp', inplace=True)

In [8]:
df_min_timestamp = df.index.min()
df_max_timestamp = df.index.max()
print(f'Minimum index of "df": {df_min_timestamp} \nMaximum index of "df": {df_max_timestamp}')

Minimum index of "df": 2005-12-31 23:00:00+00:00 
Maximum index of "df": 2016-12-31 22:00:00+00:00


In [9]:
regular_timestamp_range = pd.date_range(start=df_min_timestamp, end=df_max_timestamp,freq='H')
print(f"Length of the dataframe `df`: {len(df)}\nLength of the datetime index `regular_timestamp_range`: {len(regular_timestamp_range)}")
if len(df) == len(regular_timestamp_range):
    print("\nNo timestamp was missing in the dataframe.")

Length of the dataframe `df`: 96429
Length of the datetime index `regular_timestamp_range`: 96432


In [10]:
df = df.reindex(regular_timestamp_range, copy=True)
len(df)

96432

In [11]:
df.isna().sum().sum()

35

In [12]:
df[:1] = df.bfill()[:1]

In [13]:
df.ffill(inplace=True)

In [14]:
df.isna().sum().sum()

0

In [15]:
df["Timestamp"] = df.index

In [16]:

df.head(1)

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
2005-12-31 23:00:00+00:00,2880.0,"Port of Turku, Finland",0.577778,-4.05,0.89,17.1143,140.0,9.982,1016.66,rain,2005-12-31 23:00:00+00:00


In [17]:
development_period = (df["Timestamp"] >= "2006") & (df["Timestamp"] < "2011")
dev_df = df[development_period]
dev_df
len(dev_df)

43824

In [18]:
dev_df.head(1)

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
2006-01-01 00:00:00+00:00,2881.0,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139.0,9.9015,1016.15,rain,2006-01-01 00:00:00+00:00


In [19]:
production_period = df["Timestamp"] >= "2011"
prod_df = df[production_period]
len(prod_df)

52607

In [20]:
prod_df.head(1)

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
2011-01-01 00:00:00+00:00,46729.0,"Port of Turku, Finland",-7.1,-7.1,0.96,3.8962,195.0,3.9123,1025.25,snow,2011-01-01 00:00:00+00:00


In [25]:
dev_df.reset_index(drop=True, inplace=True)
dev_df.head(1)

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
0,2881.0,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139.0,9.9015,1016.15,rain,2006-01-01 00:00:00+00:00


In [26]:
prod_df.reset_index(drop=True, inplace=True)
dev_df.head(1)

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions,Timestamp
0,2881.0,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139.0,9.9015,1016.15,rain,2006-01-01 00:00:00+00:00


In [27]:
dev_df.to_csv(data_dir/'weather_dataset_raw_development.csv', index=True)

In [28]:
prod_df.to_csv(data_dir/'weather_dataset_raw_production.csv', index=True)