In [1]:
import pandas as pd

# Path to the weather observations CSV, relative to the notebook's working directory.
file_path = 'weather.csv'

# Parse the CSV into a DataFrame using pandas' default settings.
weather_data = pd.read_csv(filepath_or_buffer=file_path)

# Show the column labels so later cells can reference them exactly as spelled in the file.
print(weather_data.columns)


Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')


In [2]:
# Count the null entries in each column and show the per-column totals.
missing_values = weather_data.isna().sum()
print(missing_values)

# Discard every row that has a null in at least one column; the frame is rebound
# to the filtered result, so later cells only see complete rows.
weather_data = weather_data.dropna(axis=0, how='any')


Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64


In [5]:
# Re-print the column labels so the names used by later cells can be checked against them.
remaining_columns = weather_data.columns
print("Columns in the dataset after handling missing values:", remaining_columns)

# Integer codes for categorical temperature / humidity labels.
# NOTE(review): the column listing printed by this notebook shows 'Temperature (C)'
# and 'Humidity', not 'temperature' / 'humidity', so these tables only apply to a
# dataset that really carries such label-valued columns -- confirm before relying on them.
temperature_mapping = dict(hot=0, cool=1, mild=2)
humidity_mapping = dict(high=0, normal=1)


Columns in the dataset after handling missing values: Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')


In [6]:
# Remove IQR outliers from the temperature and humidity columns.
#
# The loaded file names these columns 'Temperature (C)' and 'Humidity'; a check for
# 'temperature' / 'humidity' alone never matches, so the filter was silently skipped.
# Each logical column is therefore resolved against a list of candidate names, with the
# original lower-case names tried first so label-encoded datasets keep working.
column_candidates = {
    'temperature': ('temperature', 'Temperature (C)'),
    'humidity': ('humidity', 'Humidity'),
}
label_mappings = {
    'temperature': temperature_mapping,
    'humidity': humidity_mapping,
}

numerical_columns = []
for logical_name, candidates in column_candidates.items():
    column = next((name for name in candidates if name in weather_data.columns), None)
    if column is None:
        # Say so instead of doing nothing, so a skipped step is visible in the output.
        print("Skipping outlier removal: no column found for", logical_name)
        continue

    if not pd.api.types.is_numeric_dtype(weather_data[column]):
        # Only label-valued columns need encoding. Mapping an already numeric column
        # would turn every value into NaN, and would also break re-running this cell.
        weather_data = weather_data.assign(
            **{column: weather_data[column].map(label_mappings[logical_name])}
        )
    numerical_columns.append(column)

for column in numerical_columns:
    Q1 = weather_data[column].quantile(0.25)
    Q3 = weather_data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR

    # Keep rows inside the fences. Comparisons with NaN are False, so rows with a
    # missing value in this column are kept rather than dropped.
    is_outlier = (weather_data[column] < lower_fence) | (weather_data[column] > upper_fence)
    weather_data = weather_data[~is_outlier]

In [7]:
# Per-column lookup tables (outer key = column name, inner dict = label -> integer code)
# in the nested form accepted by DataFrame.replace.
# NOTE(review): none of 'outlook', 'windy' or 'play' appears in the column listing
# printed earlier in this notebook -- confirm these keys match the dataset in use.
mappings = dict(
    outlook=dict(overcast=0, rainy=1, sunny=2),
    windy={False: 0, True: 1},
    play=dict(yes=1, no=0),
)

In [8]:
# Build a separate frame with the label -> code substitutions applied; the result is
# bound to a new name, so weather_data keeps referring to the unconverted frame.
weather_data_numerical = weather_data.replace(to_replace=mappings)

In [9]:
# Preview the first five rows of the converted frame as plain text.
preview_rows = weather_data_numerical.head(n=5)
print(preview_rows)

                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                   251.0          15.8263         0.0               1015.13  