In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load clean_train_data_v3.csv; the file's first column becomes the row index
# (so the index labels are whatever was stored there, not a fresh 0..n-1 range).
df = pd.read_csv('../database/clean_train_data_v3.csv', index_col = 0)

In [3]:
# Overview of the loaded frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 934807 entries, 118582 to 893344
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   LATITUDE                   934807 non-null  float64
 1   LONGITUDE                  934807 non-null  float64
 2   ELEVATION                  934807 non-null  float64
 3   HourlyAltimeterSetting     934807 non-null  float64
 4   HourlyDewPointTemperature  934807 non-null  float64
 5   HourlyDryBulbTemperature   934807 non-null  float64
 6   HourlyPresentWeatherType   398691 non-null  object 
 7   HourlyPressureTendency     934807 non-null  float64
 8   HourlyRelativeHumidity     934807 non-null  float64
 9   HourlySkyConditions        507788 non-null  object 
 10  HourlySeaLevelPressure     934807 non-null  float64
 11  HourlyStationPressure      934807 non-null  float64
 12  HourlyWetBulbTemperature   934807 non-null  float64
 13  HourlyWindDirection     

In [4]:
# Categorical (object-dtype) columns that this notebook inspects for encoding.
encode_cols = [
    "HourlyPresentWeatherType",
    "HourlySkyConditions",
]

In [5]:
# Print a per-column summary (non-null count, dtype, memory) for every
# column listed in encode_cols.
for col in encode_cols:
    print("Feature:", col)
    df[col].info()
    print(end="\n\n")  # two newlines as a visual gap between columns

Feature: HourlyPresentWeatherType
<class 'pandas.core.series.Series'>
Int64Index: 934807 entries, 118582 to 893344
Series name: HourlyPresentWeatherType
Non-Null Count   Dtype 
--------------   ----- 
398691 non-null  object
dtypes: object(1)
memory usage: 14.3+ MB


Feature: HourlySkyConditions
<class 'pandas.core.series.Series'>
Int64Index: 934807 entries, 118582 to 893344
Series name: HourlySkyConditions
Non-Null Count   Dtype 
--------------   ----- 
507788 non-null  object
dtypes: object(1)
memory usage: 14.3+ MB




In [6]:
# Print the frequency table of raw labels for every column in encode_cols.
for col in encode_cols:
    print(f"Feature: {col}", df[col].value_counts(), sep="\n")
    print(end="\n\n")  # two newlines as a visual gap between columns

Feature: HourlyPresentWeatherType
||HZ            282782
||DU             31236
||FG             25866
||FU             23421
||RA             18102
                 ...  
||FU DZ              1
||TS TS              1
||TS DZ              1
||TS FG SHRA         1
||FU RA TS           1
Name: HourlyPresentWeatherType, Length: 72, dtype: int64


Feature: HourlySkyConditions
CLR:00                           106247
0  0                              20604
FEW:02 98                         10672
FEW:02 20                          9725
FEW:02 197                         9181
                                  ...  
BKN:06 1 SCT:03 10 SCT:04 15          1
SCT:04 30 BKN:06 30                   1
SCT:03 34 FEW:01 30 OVC:08 79         1
BKN:05 25 FEW:01 34 BKN:07 79         1
SCT:04s 18 BKN:07s 98                 1
Name: HourlySkyConditions, Length: 15029, dtype: int64




In [7]:
# Distinct labels of the first encode column, in value_counts() order.
cat_vector_1 = (
    df[encode_cols[0]]
    .value_counts()
    .index
)
cat_vector_1

Index(['||HZ', '||DU', '||FG', '||FU', '||RA', '||TS', '||DZ', '||SHRA TS',
       '||SHRA', '||HZ DU', '||FU HZ', '||RA TS', '||HZ RA', '||FG FG', '||SH',
       '||FU FG', '||FG RA', '||TS RA', '||DZ DZ', '||HZ DZ', '||s', '||RA RA',
       '||HZ TS', '||DU RA', '||FC', '||TS SHRA TS', '||DU DU', '||SQ',
       '||DZ RA', '||FG SHRA', '||FG DZ', '||FU TS', '||HZ SHRA', '||DRSN',
       '||DU TS', '||DU SHRA', '||SHRASN', '||FZRA', '||DU TS RA',
       '||FG RA TS', '||HZ FG', '||RA SHRA TS', '|FU |', '||FU DU', '||FZDZ',
       '||SG', '||SHSN', '||DU TS SHRA', '||HZ TS SHRA', '||SHRA TS TS',
       '||DZ TS', '||DU DZ', '||HZ TS RA', '||FU FG RA', '||HZ RA TS',
       '||TS DU', '||FU RA', '||TS SHRA', '||HZ SHRA TS', '||TS DU RA',
       '|DZ |', '||DU FG', '||FG DU', '||HZ DZ DZ', '|HZ |', '||PL',
       '||DZ SHRA TS', '||FU DZ', '||TS TS', '||TS DZ', '||TS FG SHRA',
       '||FU RA TS'],
      dtype='object')

In [8]:
# Convert the pandas Index of labels into a plain Python list (order preserved).
cat_vector_1 = list(cat_vector_1)

In [9]:
# Position-based integer codes for the labels in cat_vector_1.
# cat_vector_1 comes from value_counts(), so a label's position is its
# frequency rank. The first N_MAJOR_CATEGORIES labels keep their own code
# (0-6); every later label shares the single "minor" bucket code, 7.
#
# The position is taken from range() directly instead of looking each label
# up again with list.index(), which rescanned the list for every element.
N_MAJOR_CATEGORIES = 7
enc_vector_1 = [min(rank, N_MAJOR_CATEGORIES) for rank in range(len(cat_vector_1))]

In [11]:
# Pair every raw label with its integer code: label -> code lookup table.
encode_dict_1 = {
    label: code
    for label, code in zip(cat_vector_1, enc_vector_1)
}

In [12]:
# Display the label -> code mapping for inspection.
encode_dict_1

{'||HZ': 0,
 '||DU': 1,
 '||FG': 2,
 '||FU': 3,
 '||RA': 4,
 '||TS': 5,
 '||DZ': 6,
 '||SHRA TS': 7,
 '||SHRA': 7,
 '||HZ DU': 7,
 '||FU HZ': 7,
 '||RA TS': 7,
 '||HZ RA': 7,
 '||FG FG': 7,
 '||SH': 7,
 '||FU FG': 7,
 '||FG RA': 7,
 '||TS RA': 7,
 '||DZ DZ': 7,
 '||HZ DZ': 7,
 '||s': 7,
 '||RA RA': 7,
 '||HZ TS': 7,
 '||DU RA': 7,
 '||FC': 7,
 '||TS SHRA TS': 7,
 '||DU DU': 7,
 '||SQ': 7,
 '||DZ RA': 7,
 '||FG SHRA': 7,
 '||FG DZ': 7,
 '||FU TS': 7,
 '||HZ SHRA': 7,
 '||DRSN': 7,
 '||DU TS': 7,
 '||DU SHRA': 7,
 '||SHRASN': 7,
 '||FZRA': 7,
 '||DU TS RA': 7,
 '||FG RA TS': 7,
 '||HZ FG': 7,
 '||RA SHRA TS': 7,
 '|FU |': 7,
 '||FU DU': 7,
 '||FZDZ': 7,
 '||SG': 7,
 '||SHSN': 7,
 '||DU TS SHRA': 7,
 '||HZ TS SHRA': 7,
 '||SHRA TS TS': 7,
 '||DZ TS': 7,
 '||DU DZ': 7,
 '||HZ TS RA': 7,
 '||FU FG RA': 7,
 '||HZ RA TS': 7,
 '||TS DU': 7,
 '||FU RA': 7,
 '||TS SHRA': 7,
 '||HZ SHRA TS': 7,
 '||TS DU RA': 7,
 '|DZ |': 7,
 '||DU FG': 7,
 '||FG DU': 7,
 '||HZ DZ DZ': 7,
 '|HZ |': 7,
 '||PL': 7,

In [13]:
# Final code scheme runs 0-8: 0-6 are the major categories, 7 is the shared
# "minor" bucket, and 8 marks invalid or missing values.
# The entries below override / extend the rank-based mapping by hand.
encode_dict_1.update({
    '|HZ |': 0,   # same code as '||HZ'
    '|DZ |': 6,   # same code as '||DZ'
    '|FU |': 3,   # same code as '||FU'
    '||s': 8,     # invalid label
    np.nan: 8,    # missing value
})

In [17]:
# Encode the first encode column as integer codes 0-8 via encode_dict_1;
# any value that is not a key of the dictionary (including NaN) becomes 8.
#
# Series.map keeps df's own index on the result. Wrapping a bare map() in
# pd.Series(...) would give the result a fresh 0..n-1 index, and because
# column assignment aligns on index labels (not on position) the codes would
# be written to the wrong rows whenever df's index is not 0..n-1 in order --
# which is the case here, since df is loaded with index_col=0.
#
# NOTE: not idempotent. Run again on an already-encoded column, every value
# maps to 8 (the dictionary keys are the raw labels, not the integer codes).
# Reload df before re-running this cell.
colm = df[encode_cols[0]].map(lambda label: encode_dict_1.get(label, 8))
df[encode_cols[0]] = colm

# Kept as the last expression so the notebook displays the code distribution.
colm.value_counts()

In [23]:
# The 30 most frequent raw labels of the second encode column.
df[encode_cols[1]].value_counts().head(30)

CLR:00                            106247
0  0                               20604
FEW:02 98                          10672
FEW:02 20                           9725
FEW:02 197                          9181
FEW:02 25                           7464
SCT:03 20                           6462
VV:09                               5573
SCT:04 20 BKN:07 100                5433
SCT:04 20                           5227
SCT:03 98                           4802
FEW:02 20 SCT:04 100                4255
SCT:04 197                          4098
SCT:03 25                           4086
SCT:03 197                          4057
FEW:01 98                           3573
FEW:02 30                           3512
FEW:01 20                           3386
FEW:02 100                          3326
SCT:04 26                           3251
FEW:01 25                           3208
15                                  3153
SCT:04 25                           3141
FEW:02 20 SCT:04 98                 3019
SCT:04 20 SCT:04

In [19]:
# Distinct labels of the second encode column, in value_counts() order.
cat_vector_2 = (
    df[encode_cols[1]]
    .value_counts()
    .index
)
cat_vector_2

Index(['CLR:00', '0  0', 'FEW:02 98', 'FEW:02 20', 'FEW:02 197', 'FEW:02 25',
       'SCT:03 20', 'VV:09', 'SCT:04 20 BKN:07 100', 'SCT:04 20',
       ...
       'FEW:01 15 BKN:07 79', 'BKN:05 25 SCT:03 31 OVC:08 79',
       'SCT:03 18 BKN:06 39', 'FEW:02 25 SCT:04 34 SCT:04 197',
       'SCT:04 26 FEW:02 30 BKN:06 98', 'BKN:06 1 SCT:03 10 SCT:04 15',
       'SCT:04 30 BKN:06 30', 'SCT:03 34 FEW:01 30 OVC:08 79',
       'BKN:05 25 FEW:01 34 BKN:07 79', 'SCT:04s 18 BKN:07s 98'],
      dtype='object', length=15029)

In [24]:
# Inspect the frame again (column dtypes and non-null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 934807 entries, 118582 to 893344
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   LATITUDE                   934807 non-null  float64
 1   LONGITUDE                  934807 non-null  float64
 2   ELEVATION                  934807 non-null  float64
 3   HourlyAltimeterSetting     934807 non-null  float64
 4   HourlyDewPointTemperature  934807 non-null  float64
 5   HourlyDryBulbTemperature   934807 non-null  float64
 6   HourlyPresentWeatherType   934807 non-null  int64  
 7   HourlyPressureTendency     934807 non-null  float64
 8   HourlyRelativeHumidity     934807 non-null  float64
 9   HourlySkyConditions        507788 non-null  object 
 10  HourlySeaLevelPressure     934807 non-null  float64
 11  HourlyStationPressure      934807 non-null  float64
 12  HourlyWetBulbTemperature   934807 non-null  float64
 13  HourlyWindDirection     

In [26]:
# Write the current frame to clean_train_data_v4.csv. The row index is written
# as the first column (pandas' default), matching the index_col=0 used on load.
# NOTE(review): per the df.info() output above, HourlySkyConditions is still an
# un-encoded object column with missing values at this point -- confirm that
# saving it in this state is intended.
df.to_csv("../database/clean_train_data_v4.csv")