# Check which columns should be removed

In [1]:
import pandas as pd 

# Load one raw weather export so its schema can be inspected.
data = pd.read_csv('data/ori/seoul_1994_1996.csv')

# Display every column of the first record.
first_row = data.iloc[0]
print(first_row)

name                                                            seoul
datetime                                                   1994-01-01
tempmax                                                          35.2
tempmin                                                          16.4
temp                                                             26.3
feelslikemax                                                     33.4
feelslikemin                                                     13.0
feelslike                                                        24.3
dew                                                              15.5
humidity                                                         65.9
precip                                                            0.0
precipprob                                                          0
precipcover                                                       0.0
preciptype                                                        NaN
snow                

In [2]:
# List the names of all columns in the loaded frame.
column_names = data.columns
print(column_names)

Index(['name', 'datetime', 'tempmax', 'tempmin', 'temp', 'feelslikemax',
       'feelslikemin', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'preciptype', 'snow', 'snowdepth', 'windgust',
       'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
       'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'sunrise',
       'sunset', 'moonphase', 'conditions', 'description', 'icon', 'stations'],
      dtype='object')


In [3]:
# Tally how often each distinct value of the conditions column occurs.
condition_counts = data['conditions'].value_counts()
print(condition_counts)

conditions
Partially cloudy                392
Rain, Partially cloudy          136
Clear                            90
Rain, Overcast                   65
Snow, Rain, Partially cloudy     18
Overcast                         11
Snow, Partially cloudy           10
Snow, Rain, Overcast              5
Rain                              2
Snow                              1
Snow, Rain                        1
Name: count, dtype: int64


In [4]:
# Strip the commas out of the conditions labels.
conditions_without_commas = data['conditions'].str.replace(',', '')
data['conditions'] = conditions_without_commas

# Re-tally the labels to confirm the commas are gone.
print(data['conditions'].value_counts())

conditions
Partially cloudy              392
Rain Partially cloudy         136
Clear                          90
Rain Overcast                  65
Snow Rain Partially cloudy     18
Overcast                       11
Snow Partially cloudy          10
Snow Rain Overcast              5
Rain                            2
Snow                            1
Snow Rain                       1
Name: count, dtype: int64


In [5]:
# Keep only the date, temperature (max/min/mean), dew, humidity,
# precipitation, wind speed, visibility and conditions columns.
selected_columns = [
    'datetime', 'tempmax', 'tempmin', 'temp', 'dew',
    'humidity', 'precip', 'windspeed', 'visibility', 'conditions',
]
data = data[selected_columns]

# Show the first row of the reduced frame.
data.head(1)

Unnamed: 0,datetime,tempmax,tempmin,temp,dew,humidity,precip,windspeed,visibility,conditions
0,1994-01-01,35.2,16.4,26.3,15.5,65.9,0.0,5.5,6.6,Partially cloudy


In [6]:
# For every CSV file in the ori folder:
#   - read it and keep the date, temperature (max/min/mean), dew, humidity,
#     precipitation, wind speed, visibility and conditions columns
#   - strip the commas from the conditions column
#   - save the result under the same file name in the temp folder
# The header row is kept in the saved files (to_csv writes it by default).
# NOTE(review): this cell's comment previously said the header should be
# removed on save, but the code never did that; pass header=False to to_csv
# if headerless files are actually wanted - confirm with the consumer.

import os
import glob

columns = [
    'datetime', 'tempmax', 'tempmin', 'temp', 'dew',
    'humidity', 'precip', 'windspeed', 'visibility', 'conditions',
]
output_dir = 'data/temp'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# glob returns paths in a filesystem-dependent order; sort for repeatable runs.
files = sorted(glob.glob('data/ori/*.csv'))
for file in files:
    data = pd.read_csv(file)
    # .copy() yields an independent frame, so the column assignment below
    # does not operate on a slice of the frame that was read.
    data = data[columns].copy()
    # remove comma in the conditions column (literal match, not a regex)
    data['conditions'] = data['conditions'].str.replace(',', '', regex=False)
    data.to_csv(os.path.join(output_dir, os.path.basename(file)), index=False)
print('Data updated in /data/temp folder')

Data updated in /data/temp folder


In [7]:
# For every CSV file in the ori folder:
#   - read it and keep the date, temperature (max/min/mean), dew, humidity,
#     precipitation, wind speed, visibility and conditions columns
#   - strip the commas from the conditions column
# then save all the data as a single file in the forecast folder.

columns = [
    'datetime', 'tempmax', 'tempmin', 'temp', 'dew',
    'humidity', 'precip', 'windspeed', 'visibility', 'conditions',
]

# glob returns paths in a filesystem-dependent order; sorting makes the row
# order of the combined file repeatable.
# NOTE(review): assumes lexicographic file-name order is the desired row
# order (e.g. names ending in year ranges) - confirm.
files = sorted(glob.glob('data/ori/*.csv'))
if not files:
    # pd.concat on an empty list fails with an unhelpful message; fail early
    # with the actual cause instead.
    raise FileNotFoundError('No CSV files found in data/ori')

frames = []
for file in files:
    data = pd.read_csv(file)
    # .copy() yields an independent frame, so the column assignment below
    # does not operate on a slice of the frame that was read.
    data = data[columns].copy()
    # remove comma in the conditions column (literal match, not a regex)
    data['conditions'] = data['conditions'].str.replace(',', '', regex=False)
    frames.append(data)

# Concatenate all dataframes in the list
all_data = pd.concat(frames, ignore_index=True)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('data/forecast', exist_ok=True)
all_data.to_csv('data/forecast/all.csv', index=False)
print('Data updated in /data/forecast folder')

# print the memory usage of the data in MB
# (default shallow measurement: the contents of object/string columns are not
# inspected; use memory_usage(deep=True) for the full footprint)
print(all_data.memory_usage().sum() / 1024**2, 'MB')

Data updated in /data/forecast folder
0.8372230529785156 MB
