# Clean data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/raw/weatherQN_2021_2025.csv')
data.head(10)

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,2021-01-02 00:00:00,20.0,17.4,85.0,,,350.0,20.2,,1017.9,,
1,2021-01-02 03:00:00,20.0,17.8,87.0,0.2,,343.0,17.6,,1016.1,,
2,2021-01-02 06:00:00,19.8,17.8,88.0,0.1,,339.0,17.6,,1016.7,,
3,2021-01-02 09:00:00,20.2,18.3,89.0,0.2,,347.0,18.7,,1018.4,,
4,2021-01-02 12:00:00,22.0,18.0,78.0,0.1,,3.0,28.4,,1016.4,,
5,2021-01-02 15:00:00,21.4,18.2,82.0,0.2,,2.0,27.4,,1015.0,,
6,2021-01-02 18:00:00,20.4,18.5,89.0,0.4,,350.0,18.0,,1016.3,,
7,2021-01-02 21:00:00,20.2,18.5,90.0,0.2,,344.0,17.6,,1017.6,,
8,2021-01-03 00:00:00,20.1,18.6,91.0,0.3,,345.0,16.2,,1017.3,,
9,2021-01-03 03:00:00,19.9,18.4,91.0,0.1,,333.0,14.0,,1015.2,,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12131 entries, 0 to 12130
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    12131 non-null  object 
 1   temp    12131 non-null  float64
 2   dwpt    12131 non-null  float64
 3   rhum    12131 non-null  float64
 4   prcp    12098 non-null  float64
 5   snow    0 non-null      float64
 6   wdir    12131 non-null  float64
 7   wspd    12131 non-null  float64
 8   wpgt    0 non-null      float64
 9   pres    12131 non-null  float64
 10  tsun    0 non-null      float64
 11  coco    7890 non-null   float64
dtypes: float64(11), object(1)
memory usage: 1.1+ MB


## Drop Null columns

In [4]:
data.drop(columns=['snow', 'wpgt', 'tsun'], inplace=True)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12131 entries, 0 to 12130
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    12131 non-null  object 
 1   temp    12131 non-null  float64
 2   dwpt    12131 non-null  float64
 3   rhum    12131 non-null  float64
 4   prcp    12098 non-null  float64
 5   wdir    12131 non-null  float64
 6   wspd    12131 non-null  float64
 7   pres    12131 non-null  float64
 8   coco    7890 non-null   float64
dtypes: float64(8), object(1)
memory usage: 853.1+ KB


In [6]:
data[data['coco'].isnull()]

Unnamed: 0,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco
0,2021-01-02 00:00:00,20.0,17.4,85.0,,350.0,20.2,1017.9,
1,2021-01-02 03:00:00,20.0,17.8,87.0,0.2,343.0,17.6,1016.1,
2,2021-01-02 06:00:00,19.8,17.8,88.0,0.1,339.0,17.6,1016.7,
3,2021-01-02 09:00:00,20.2,18.3,89.0,0.2,347.0,18.7,1018.4,
4,2021-01-02 12:00:00,22.0,18.0,78.0,0.1,3.0,28.4,1016.4,
...,...,...,...,...,...,...,...,...,...
11466,2024-12-12 04:00:00,24.9,23.5,92.0,,50.0,7.2,1008.4,
11467,2024-12-12 13:00:00,26.2,23.5,85.0,,20.0,10.8,1009.5,
11468,2024-12-12 22:00:00,26.2,23.1,83.0,,360.0,3.6,1012.4,
11469,2024-12-13 07:00:00,25.8,23.7,88.0,,20.0,10.8,1012.0,


## Fill missing values

### Fill coco with nearest values because it is the weather condition.

In [7]:
data['coco'].bfill(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['coco'].bfill(inplace=True)


In [8]:
data[data['coco'].isnull()]

Unnamed: 0,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco


### Fill prcp with median values

In [9]:
data[data['prcp'].isnull()].shape

(33, 9)

In [10]:
data['prcp'] = data['prcp'].fillna(data['prcp'].median())

In [11]:
data[data['prcp'].isnull()]

Unnamed: 0,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco


## Save cleaned data

In [12]:
data.to_csv('../data/processed/weatherQN_2021_2025_processed.csv', index=False)