# Weather

## Limpieza

In [1]:
from pathlib import Path

import pandas as pd


In [2]:
# Load the raw CSV and, in the same chain, rewrite every header to
# lowercase with spaces replaced by underscores.
df = (
    pd.read_csv('../data/raw/weather_classification_data.csv')
    .rename(columns=lambda header: header.lower().replace(' ', '_'))
)

In [3]:
# Inspect the dtype pandas inferred for each column when reading the CSV
# (no explicit dtypes were passed to read_csv).
df.dtypes

temperature             float64
humidity                  int64
wind_speed              float64
precipitation_(%)       float64
cloud_cover              object
atmospheric_pressure    float64
uv_index                  int64
season                   object
visibility_(km)         float64
location                 object
weather_type             object
dtype: object

In [4]:
# Transposed view of the object-dtype (text) columns: one row per column,
# which keeps the display narrow enough to read.
df.select_dtypes(include=['object']).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13190,13191,13192,13193,13194,13195,13196,13197,13198,13199
cloud_cover,partly cloudy,partly cloudy,clear,clear,overcast,overcast,overcast,partly cloudy,overcast,clear,...,partly cloudy,clear,clear,overcast,overcast,overcast,cloudy,overcast,overcast,overcast
season,Winter,Spring,Spring,Spring,Winter,Summer,Winter,Winter,Winter,Winter,...,Summer,Summer,Summer,Winter,Spring,Summer,Winter,Autumn,Winter,Autumn
location,inland,inland,mountain,coastal,mountain,inland,inland,inland,mountain,coastal,...,mountain,inland,inland,inland,coastal,mountain,coastal,coastal,inland,mountain
weather_type,Rainy,Cloudy,Sunny,Sunny,Rainy,Cloudy,Snowy,Snowy,Snowy,Sunny,...,Sunny,Sunny,Sunny,Snowy,Cloudy,Rainy,Snowy,Cloudy,Snowy,Rainy


In [5]:
# Transposed view of the numeric columns: one row per column.
df.select_dtypes(include=['number']).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13190,13191,13192,13193,13194,13195,13196,13197,13198,13199
temperature,14.0,39.0,30.0,38.0,27.0,32.0,-2.0,3.0,3.0,28.0,...,30.0,27.0,31.0,-5.0,29.0,10.0,-1.0,30.0,3.0,-5.0
humidity,73.0,96.0,64.0,83.0,74.0,55.0,97.0,85.0,83.0,74.0,...,24.0,48.0,24.0,65.0,62.0,74.0,76.0,77.0,76.0,38.0
wind_speed,9.5,8.5,7.0,1.5,17.0,3.5,8.0,6.0,6.0,8.5,...,3.5,6.5,8.0,15.5,13.0,14.5,3.5,5.5,10.0,0.0
precipitation_(%),82.0,71.0,16.0,82.0,66.0,26.0,86.0,96.0,66.0,107.0,...,16.0,14.0,5.0,50.0,17.0,71.0,23.0,28.0,94.0,92.0
atmospheric_pressure,1010.82,1011.43,1018.72,1026.25,990.67,1010.03,990.87,984.46,999.44,1012.13,...,1017.54,1029.37,1029.61,982.57,1002.81,1003.15,1067.23,1012.69,984.27,1015.37
uv_index,2.0,7.0,5.0,7.0,1.0,2.0,1.0,1.0,0.0,8.0,...,11.0,8.0,8.0,1.0,2.0,1.0,1.0,3.0,0.0,5.0
visibility_(km),3.5,10.0,5.5,1.0,2.5,5.0,4.0,3.5,1.0,7.5,...,6.5,8.0,9.0,5.0,5.0,1.0,6.0,9.0,2.0,10.0


In [6]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

temperature             0
humidity                0
wind_speed              0
precipitation_(%)       0
cloud_cover             0
atmospheric_pressure    0
uv_index                0
season                  0
visibility_(km)         0
location                0
weather_type            0
dtype: int64

In [7]:
# Count rows that exactly repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate).
df.duplicated().sum()

0

In [8]:
# Summary statistics for every column, numeric and non-numeric alike,
# transposed so that each column of the data becomes one row.
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
temperature,13200.0,,,,19.127576,17.386327,-25.0,4.0,21.0,31.0,109.0
humidity,13200.0,,,,68.710833,20.194248,20.0,57.0,70.0,84.0,109.0
wind_speed,13200.0,,,,9.832197,6.908704,0.0,5.0,9.0,13.5,48.5
precipitation_(%),13200.0,,,,53.644394,31.946541,0.0,19.0,58.0,82.0,109.0
cloud_cover,13200.0,4.0,overcast,6090.0,,,,,,,
atmospheric_pressure,13200.0,,,,1005.827896,37.199589,800.12,994.8,1007.65,1016.7725,1199.21
uv_index,13200.0,,,,4.005758,3.8566,0.0,1.0,3.0,7.0,14.0
season,13200.0,4.0,Winter,5610.0,,,,,,,
visibility_(km),13200.0,,,,5.462917,3.371499,0.0,3.0,5.0,7.5,20.0
location,13200.0,3.0,inland,4816.0,,,,,,,


In [9]:
# Count the distinct non-null values in each column
df.nunique()

temperature              126
humidity                  90
wind_speed                97
precipitation_(%)        110
cloud_cover                4
atmospheric_pressure    5456
uv_index                  15
season                     4
visibility_(km)           41
location                   3
weather_type               4
dtype: int64

In [10]:
# List the distinct values found in the weather_type column
df['weather_type'].unique()

array(['Rainy', 'Cloudy', 'Sunny', 'Snowy'], dtype=object)

In [11]:
# Percentage share of each category, printed as one table per
# non-numeric column and separated by a blank line.
for column in ('cloud_cover', 'season', 'location', 'weather_type'):
    shares = df[column].value_counts(normalize=True) * 100
    print(f"{shares}\n")

cloud_cover
overcast         46.136364
partly cloudy    34.545455
clear            16.204545
cloudy            3.113636
Name: proportion, dtype: float64

season
Winter    42.500000
Spring    19.681818
Autumn    18.939394
Summer    18.878788
Name: proportion, dtype: float64

location
inland      36.484848
mountain    36.462121
coastal     27.053030
Name: proportion, dtype: float64

weather_type
Rainy     25.0
Cloudy    25.0
Sunny     25.0
Snowy     25.0
Name: proportion, dtype: float64



In [12]:
# Persist the frame for downstream steps. The cells above only normalize the
# column headers; no rows or values are modified before saving.
# NOTE(review): describe() above reports maxima of 109 for humidity and
# precipitation_(%), i.e. above 100 %; nothing is filtered here - confirm
# that saving these values unchanged is intended.
processed_path = Path('../data/processed/weather_classification_data.csv')

# to_csv does not create missing folders, so make sure the target directory
# exists (a fresh checkout may not contain data/processed yet).
processed_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the positional row index out of the file.
df.to_csv(processed_path, index=False)