# 01 - Create the raw `development` and `production` datasets

__Goal__:
1. Read the project's raw dataset: `"weather_dataset_raw.csv"`;
2. Sort it according to the `Timestamp` variable (after converting the mixed `UTC offsets` to UTC);
3. Split it into two datasets:
  - the development dataset, spanning the years `2006-2010`;
  - the production dataset, spanning the years `2011-2016`;
4. Save the `development` and `production` raw datasets, as `"weather_dataset_raw_development.csv"` and `"weather_dataset_raw_production.csv"` respectively.

### Import

In [1]:
import pandas as pd
from pathlib import Path

# 1. Read the raw dataset

In [2]:
# Load the project's raw weather CSV from the local "datasets" folder.
df = pd.read_csv(Path('datasets', 'weather_dataset_raw.csv'))

In [3]:
# Column overview of the freshly loaded frame: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   S_No                    96453 non-null  int64  
 1   Timestamp               96453 non-null  object 
 2   Location                96453 non-null  object 
 3   Temperature_C           96453 non-null  float64
 4   Apparent_Temperature_C  96453 non-null  float64
 5   Humidity                96453 non-null  float64
 6   Wind_speed_kmph         96453 non-null  float64
 7   Wind_bearing_degrees    96453 non-null  int64  
 8   Visibility_km           96453 non-null  float64
 9   Pressure_millibars      96453 non-null  float64
 10  Weather_conditions      96448 non-null  object 
dtypes: float64(6), int64(2), object(3)
memory usage: 8.1+ MB


In [4]:
# First rows in file order; `Timestamp` is still a plain string carrying its UTC offset here.
df.head()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
0,0,2006-04-01 00:00:00+02:00,"Port of Turku, Finland",9.472222,7.388889,0.89,14.1197,251,15.8263,1015.13,rain
1,1,2006-04-01 01:00:00+02:00,"Port of Turku, Finland",9.355556,7.227778,0.86,14.2646,259,15.8263,1015.63,rain
2,2,2006-04-01 02:00:00+02:00,"Port of Turku, Finland",9.377778,9.377778,0.89,3.9284,204,14.9569,1015.94,rain
3,3,2006-04-01 03:00:00+02:00,"Port of Turku, Finland",8.288889,5.944444,0.83,14.1036,269,15.8263,1016.41,
4,4,2006-04-01 04:00:00+02:00,"Port of Turku, Finland",8.755556,6.977778,0.83,11.0446,259,15.8263,1016.51,rain


In [5]:
# Last rows in file order; in the recorded output they are not chronological,
# so the frame needs the sort performed in section 2.
df.tail()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
96448,96269,2016-09-03 08:00:00+02:00,"Port of Turku, Finland",19.483333,19.483333,0.71,7.2772,2,15.1501,1018.9,clear
96449,96272,2016-09-03 11:00:00+02:00,"Port of Turku, Finland",27.616667,27.1,0.36,6.8425,350,15.5526,1019.14,clear
96450,96295,2016-09-30 10:00:00+02:00,"Port of Turku, Finland",17.988889,17.988889,0.64,7.7763,182,15.7297,1020.58,clear
96451,96298,2016-09-30 13:00:00+02:00,"Port of Turku, Finland",25.0,25.0,0.39,12.6063,202,16.1,1019.87,clear
96452,96309,2016-09-04 00:00:00+02:00,"Port of Turku, Finland",17.083333,17.083333,0.73,3.2039,329,16.1,1018.25,clear


# 2. Sort the raw dataset according to the `Timestamp` variable

### A. Convert `Timestamp` variable's type to `datetime` and normalize the `UTC offsets` to UTC

In [6]:
# Parse the string timestamps and express all of them in UTC; the resulting
# column is timezone-aware (datetime64[ns, UTC]), not offset-free.
df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'], utc=True))

### B. Sort the raw dataset

In [7]:
# Order the rows chronologically. Rebinding `df` to the sorted result (rather
# than sorting with `inplace=True`) avoids mutating the frame in place and
# keeps the call chainable; the resulting `df` is the same.
df = df.sort_values(by='Timestamp')

In [8]:
# Earliest records after sorting (timestamps are now expressed in UTC).
df.head()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
2828,2880,2005-12-31 23:00:00+00:00,"Port of Turku, Finland",0.577778,-4.05,0.89,17.1143,140,9.982,1016.66,rain
2829,2881,2006-01-01 00:00:00+00:00,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139,9.9015,1016.15,rain
2830,2882,2006-01-01 01:00:00+00:00,"Port of Turku, Finland",1.666667,-3.155556,0.82,20.2538,140,9.9015,1015.87,rain
2831,2883,2006-01-01 02:00:00+00:00,"Port of Turku, Finland",1.711111,-2.194444,0.82,14.49,140,9.9015,1015.56,rain
2832,2884,2006-01-01 03:00:00+00:00,"Port of Turku, Finland",1.183333,-2.744444,0.86,13.9426,134,9.9015,1014.98,rain


In [9]:
# Latest records after sorting.
df.tail()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
86323,89728,2016-12-31 18:00:00+00:00,"Port of Turku, Finland",0.488889,-2.644444,0.86,9.7566,167,8.0178,1020.03,rain
86324,89729,2016-12-31 19:00:00+00:00,"Port of Turku, Finland",0.072222,-3.05,0.88,9.4185,169,7.245,1020.27,rain
86325,89730,2016-12-31 20:00:00+00:00,"Port of Turku, Finland",-0.233333,-3.377778,0.89,9.2736,175,9.5795,1020.5,snow
86326,89731,2016-12-31 21:00:00+00:00,"Port of Turku, Finland",-0.472222,-3.644444,0.91,9.2414,182,8.4042,1020.65,snow
86327,89732,2016-12-31 22:00:00+00:00,"Port of Turku, Finland",-0.677778,-3.888889,0.92,9.2253,189,8.8711,1020.72,snow


# 3. Split the raw dataset

### A. The `development` dataset

In [10]:
# Half-open window on the UTC timestamps: 2006 inclusive, 2011 exclusive.
# NOTE(review): per the recorded outputs, the earliest sorted row
# (2005-12-31 23:00 UTC) lies before this window, so it ends up in neither the
# development nor the production split — confirm this is intended.
development_period = df["Timestamp"].between("2006", "2011", inclusive="left")
development_df = df.loc[development_period]
development_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43848 entries, 2829 to 44744
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   S_No                    43848 non-null  int64              
 1   Timestamp               43848 non-null  datetime64[ns, UTC]
 2   Location                43848 non-null  object             
 3   Temperature_C           43848 non-null  float64            
 4   Apparent_Temperature_C  43848 non-null  float64            
 5   Humidity                43848 non-null  float64            
 6   Wind_speed_kmph         43848 non-null  float64            
 7   Wind_bearing_degrees    43848 non-null  int64              
 8   Visibility_km           43848 non-null  float64            
 9   Pressure_millibars      43848 non-null  float64            
 10  Weather_conditions      43843 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(6), int6

In [11]:
# Report how many rows fall inside the development window.
print(f'Number of rows of "development_df": {len(development_df)}')

Number of raws of "development_df": 43848


In [12]:
# Sanity check: the development split should begin at the first 2006 timestamp (UTC).
development_df.head()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
2829,2881,2006-01-01 00:00:00+00:00,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139,9.9015,1016.15,rain
2830,2882,2006-01-01 01:00:00+00:00,"Port of Turku, Finland",1.666667,-3.155556,0.82,20.2538,140,9.9015,1015.87,rain
2831,2883,2006-01-01 02:00:00+00:00,"Port of Turku, Finland",1.711111,-2.194444,0.82,14.49,140,9.9015,1015.56,rain
2832,2884,2006-01-01 03:00:00+00:00,"Port of Turku, Finland",1.183333,-2.744444,0.86,13.9426,134,9.9015,1014.98,rain
2833,2885,2006-01-01 04:00:00+00:00,"Port of Turku, Finland",1.205556,-3.072222,0.85,15.9068,149,9.982,1014.08,rain


In [13]:
# Sanity check: the development split should end with the last 2010 timestamp (UTC).
development_df.tail()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
35506,37148,2010-12-31 19:00:00+00:00,"Port of Turku, Finland",-7.222222,-7.222222,0.96,0.1449,230,3.4293,1025.43,snow
35507,37149,2010-12-31 20:00:00+00:00,"Port of Turku, Finland",-7.2,-7.2,0.96,3.1717,258,3.4293,1025.57,snow
35508,37150,2010-12-31 21:00:00+00:00,"Port of Turku, Finland",-7.244444,-7.244444,0.96,3.3327,311,4.2504,1025.36,snow
35509,37151,2010-12-31 22:00:00+00:00,"Port of Turku, Finland",-7.127778,-7.127778,0.96,3.0751,260,3.8801,1025.59,snow
44744,46728,2010-12-31 23:00:00+00:00,"Port of Turku, Finland",-7.105556,-7.105556,0.96,3.2039,249,3.4132,1025.47,snow


### B. The `production` dataset

In [14]:
# Everything from the start of 2011 (UTC) onwards goes to the production
# split; no upper bound is applied.
production_period = df["Timestamp"].ge("2011")
production_df = df.loc[production_period]
production_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52604 entries, 44745 to 86327
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   S_No                    52604 non-null  int64              
 1   Timestamp               52604 non-null  datetime64[ns, UTC]
 2   Location                52604 non-null  object             
 3   Temperature_C           52604 non-null  float64            
 4   Apparent_Temperature_C  52604 non-null  float64            
 5   Humidity                52604 non-null  float64            
 6   Wind_speed_kmph         52604 non-null  float64            
 7   Wind_bearing_degrees    52604 non-null  int64              
 8   Visibility_km           52604 non-null  float64            
 9   Pressure_millibars      52604 non-null  float64            
 10  Weather_conditions      52604 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(6), int

In [15]:
# Sanity check: the production split should begin at the first 2011 timestamp (UTC).
production_df.head()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
44745,46729,2011-01-01 00:00:00+00:00,"Port of Turku, Finland",-7.1,-7.1,0.96,3.8962,195,3.9123,1025.25,snow
44746,46730,2011-01-01 01:00:00+00:00,"Port of Turku, Finland",-7.061111,-7.061111,1.0,3.2039,171,3.4132,1025.07,snow
44747,46731,2011-01-01 02:00:00+00:00,"Port of Turku, Finland",-6.55,-6.55,0.96,3.2039,161,2.4955,1025.14,snow
44748,46732,2011-01-01 03:00:00+00:00,"Port of Turku, Finland",-6.538889,-9.705556,0.96,6.3917,185,2.6082,1024.62,snow
44749,46733,2011-01-01 04:00:00+00:00,"Port of Turku, Finland",-6.038889,-10.788889,0.93,10.8997,190,2.9624,1024.26,snow


In [16]:
# Sanity check: the production split should end with the latest record of the sorted frame.
production_df.tail()

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
86323,89728,2016-12-31 18:00:00+00:00,"Port of Turku, Finland",0.488889,-2.644444,0.86,9.7566,167,8.0178,1020.03,rain
86324,89729,2016-12-31 19:00:00+00:00,"Port of Turku, Finland",0.072222,-3.05,0.88,9.4185,169,7.245,1020.27,rain
86325,89730,2016-12-31 20:00:00+00:00,"Port of Turku, Finland",-0.233333,-3.377778,0.89,9.2736,175,9.5795,1020.5,snow
86326,89731,2016-12-31 21:00:00+00:00,"Port of Turku, Finland",-0.472222,-3.644444,0.91,9.2414,182,8.4042,1020.65,snow
86327,89732,2016-12-31 22:00:00+00:00,"Port of Turku, Finland",-0.677778,-3.888889,0.92,9.2253,189,8.8711,1020.72,snow


# 4. Save the `development` and `production` raw datasets

In [17]:
# Persist the development split as CSV, leaving out the DataFrame index.
development_df.to_csv(Path('datasets', 'weather_dataset_raw_development.csv'), index=False)

In [18]:
# Persist the production split as CSV, leaving out the DataFrame index.
production_df.to_csv(Path('datasets', 'weather_dataset_raw_production.csv'), index=False)