1. Data Ingestion:

installing required packages 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


loading the data set to my jupyter notebook environment

In [8]:
data = pd.read_csv("weather_data.csv")
data.head(15)

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,01/02/2023,New York,,65.0,12.0,Cloudy
2,03-01-2023,New York,7.0,,8.0,Rainy
3,,London,8.0,70.0,15.0,Unknown
4,2023-01-02,London,6.0,75.0,20.0,Snowy
5,01/03/2023,London,,80.0,18.0,Cloudy
6,2023-01-01,Tokyo,10.0,50.0,5.0,Sunny
7,01/02/2023,Tokyo,12.0,55.0,6.0,Cloudy
8,03-01-2023,Tokyo,,60.0,7.0,Unknown
9,01-25-2023,New York,,51.0,20.3,Sunny


2. Data Cleaning and Transformation:

In [9]:
# displaying the columns
print(data.columns)
# displaying the data types of each column
print(data.dtypes)

Index(['date', 'city', 'temperature_celsius', 'humidity_percent',
       'wind_speed_kph', 'weather_condition'],
      dtype='object')
date                    object
city                    object
temperature_celsius    float64
humidity_percent       float64
wind_speed_kph         float64
weather_condition       object
dtype: object


In [14]:
columns = data.columns
missing_values = data.isnull().sum()
print("Missing values in each column:")
print(missing_values)
total_missing_values = missing_values.sum()
print(f'Total missing values from all the dataset = {total_missing_values}')

Missing values in each column:
date                   20
city                    0
temperature_celsius    58
humidity_percent       53
wind_speed_kph         45
weather_condition      16
dtype: int64
Total missing values from all the dataset = 192


Now lets handle the missing values in the data set

In [17]:
# handling missing values with mean imputation for columns with numerical data and store it in a imputed dataset
imputed_data = data.copy()
for column in columns:
    if data[column].dtype == 'float64' or data[column].dtype == 'int64':
        mean_value = data[column].mean()
        imputed_data[column].fillna(mean_value, inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imputed_data[column].fillna(mean_value, inplace=True)


In [18]:
imputed_data

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.000000,10.000000,Sunny
1,01/02/2023,New York,8.4,65.000000,12.000000,Cloudy
2,03-01-2023,New York,7.0,58.978723,8.000000,Rainy
3,,London,8.0,70.000000,15.000000,Unknown
4,2023-01-02,London,6.0,75.000000,20.000000,Snowy
...,...,...,...,...,...,...
95,01-01-2023,London,8.4,58.978723,14.352727,Rainy
96,09-01-2023,London,8.4,58.978723,14.352727,Rainy
97,2023-01-11,Tokyo,8.4,58.978723,14.352727,Sunny
98,15/01/2023,New York,8.4,41.000000,24.300000,


converting date to date time fromat 

In [None]:
# converting the datae column to datetime format
imputed_data['date'] = pd.to_datetime(imputed_data['date'], dayfirst= True errors= 'coerce')
imputed_data

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.000000,10.000000,Sunny
1,2023-02-01,New York,8.4,65.000000,12.000000,Cloudy
2,2023-01-03,New York,7.0,58.978723,8.000000,Rainy
3,NaT,London,8.0,70.000000,15.000000,Unknown
4,2023-01-02,London,6.0,75.000000,20.000000,Snowy
...,...,...,...,...,...,...
95,2023-01-01,London,8.4,58.978723,14.352727,Rainy
96,2023-01-09,London,8.4,58.978723,14.352727,Rainy
97,2023-01-11,Tokyo,8.4,58.978723,14.352727,Sunny
98,2023-01-15,New York,8.4,41.000000,24.300000,


In [28]:
# before storing frahnheit values in the new col lets assign the value to 0 since we are going to sotre numeric values
data[' temperature_fahrenheit'] = 0
data[' temperature_fahrenheit'] = data['temperature_celsius'] * 9/5 + 32
print(data[' temperature_fahrenheit'].head(10))

0    41.00
1    47.12
2    44.60
3    46.40
4    42.80
5    47.12
6    50.00
7    53.60
8    47.12
9    47.12
Name:  temperature_fahrenheit, dtype: float64


In [30]:
# now lets drop where weather condition is null or unknown
# for na values 
cleaned_data = imputed_data.dropna(subset=['weather_condition'])
# for unknown values
cleaned_data = cleaned_data[cleaned_data['weather_condition'] != 'unknown']
cleaned_data

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.000000,10.000000,Sunny
1,2023-02-01,New York,8.4,65.000000,12.000000,Cloudy
2,2023-01-03,New York,7.0,58.978723,8.000000,Rainy
3,NaT,London,8.0,70.000000,15.000000,Unknown
4,2023-01-02,London,6.0,75.000000,20.000000,Snowy
...,...,...,...,...,...,...
93,NaT,London,8.4,76.000000,8.800000,Unknown
95,2023-01-01,London,8.4,58.978723,14.352727,Rainy
96,2023-01-09,London,8.4,58.978723,14.352727,Rainy
97,2023-01-11,Tokyo,8.4,58.978723,14.352727,Sunny


In [None]:
cleaned_data.isnull().any(axis=1)
cleaned_data # only displayes missing values from the date col so no need to drop rows since other columns are not affectedand no country data is missing
