In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def read_file(path):
  df = pd.read_csv(path)
  return df

In [3]:
def fogindex_var(df):
  #coverting type of date_time column
  try:
    df["date_time"]=pd.to_datetime(df.date_time, format='%d-%m-%Y %H:%M')
  except:
    df = df.rename(columns={'date_time_IST': 'date_time'})
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['date_time'] = df['date_time'].dt.strftime('%Y-%m-%d %H:%M')
    df['date_time'] = pd.to_datetime(df['date_time'])
  #df["date_time"]=pd.to_datetime(df.date_time_IST)
  # adding fog duration var  ???
  df["endtime"] = df["date_time"].shift(1)
  #df[['date_time', 'endtime']].head()
  df.loc[df.vsbykm<2.0,'fog_duration']=(df['date_time']-df['endtime']).dt.total_seconds()
  # prompt: difference date time in second
  df['diff_in_seconds'] = (df['date_time'] - df['endtime']).dt.total_seconds()
  df.loc[df.vsbykm>=2.0,'fog_duration']=0
  # adding energy_loss var
  df.loc[df.vsbykm<2.0, 'energy_loss']=df['fog_duration']*(1-np.exp(-0.05*3/df['vsbykm']))
  df.loc[df.vsbykm>=2.0,'energy_loss']=0
  df["fog_index"]= df["energy_loss"]/df["fog_duration"]
  #making datetime as index
  df.set_index("date_time",inplace=True)
  df.index = pd.to_datetime(df.index, errors='coerce')
  #df.index = df.index.strftime('%d-%m-%Y %H:%M')
  return df

**Resampling to 3hr interval**

In [4]:
def resampling_data(df_new,df,re_hour):
  df.index = pd.to_datetime(df.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  df_new["avg_air_temp"]=df["tmpc"].resample(re_hour,offset="4h").mean()
  df_new["avg_dew_point"]=df["dwpc"].resample(re_hour,offset="4h").mean()
  df_new["avg_relative_humidity"] = df["relh"].resample(re_hour,offset="4h").mean()
  df_new["avg_pressure"] = df["pressure"].resample(re_hour,offset="4h").mean()
  df_new["avg_visibility"] = df["vsbykm"].resample(re_hour,offset="4h").mean()
  df_new["energy_loss"]=df.resample(re_hour,offset="4h").agg({'energy_loss':'sum'})
  df_new["fog_duration"]=df.resample(re_hour,offset="4h").agg({'fog_duration':'sum'})
  #df_new.index = pd.to_datetime(df_new.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  #df_new.index = df_new.index.strftime('%d-%m-%Y %H:%M')
  df_new = df_new.sort_index()
  return df_new

In [5]:
def new_var(df_new):
  #Adding Fog_month variable
  df_new.index = pd.to_datetime(df_new.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  # df_new.index = df_new.index.strftime('%d-%m-%Y %H:%M')
  df_new = df_new.sort_index()
  df_new["fog_month"]=0
  df_new.loc[(df_new.index.month==12)|(df_new.index.month==1),"fog_month"]=1
  # fog_index variable for 6 hours
  df_new['fog_index']=df_new['energy_loss']/df_new['fog_duration']
  #df_new = df_new.dropna(subset=['fog_index'])
  df_new = df_new.dropna(subset=['avg_air_temp'])
  df_new.loc[df_new.fog_duration==0,'fog_index']=0
  #df_new.index = df_new.index.strftime('%d-%m-%Y %H:%M')
  df_new = df_new.sort_index()
  return df_new

In [6]:
def target_var(df_new,resampling_type,split_type,location):
  common_path = "/content/drive/MyDrive/Fog Prediction 24/Long Term Fog Prediction/"
  # commenting  - not necessary for 12hr resampling
  df_new['fog_index_6h'] = np.nan
  df_new.index = pd.to_datetime(df_new.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  # df_new.index = df_new.index.strftime('%d-%m-%Y %H:%M')
  df_new = df_new.sort_index()
  for row in df_new.itertuples():
    hr = row.Index + pd.Timedelta('6hours')
    # print(hr)
    #hr = hr.strftime('%d-%m-%Y %H:%M')
    #k = df_new.index.get_loc(hr,method='nearest', tolerance=pd.Timedelta('6hours'))
    k = df_new.index.get_indexer([hr], method='nearest', tolerance=pd.Timedelta('6hours'))[0] # Use get_indexer instead of get_loc
    df_new.loc[row.Index, 'fog_index_6h'] = df_new.iloc[k].fog_index
  df_winter = df_new.loc[(df_new.index.month==12)|(df_new.index.month==1)|(df_new.index.month==11)|(df_new.index.month==2)]
  df_winter.dropna(inplace=True)
  file_name_s6 = common_path+ "6hr_dataset/"+ resampling_type + location +split_type+ "_6h.csv"
  df_winter.to_csv(file_name_s6)
  # Update above for 1d
  df_new['fog_index_1d'] = np.nan
  #df_new.index = pd.to_datetime(df_new.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  df_new = df_new.sort_index()
  for row in df_new.itertuples():
        hr = row.Index + pd.Timedelta('1 day')
        #hr = hr.strftime('%d-%m-%Y %H:%M')
        k = df_new.index.get_indexer([hr], method='nearest', tolerance=pd.Timedelta('6hours'))[0] # Use get_indexer instead of get_loc
        df_new.loc[row.Index, 'fog_index_1d'] = df_new.iloc[k].fog_index
  df_winter = df_new.loc[(df_new.index.month==12)|(df_new.index.month==1)|(df_new.index.month==11)|(df_new.index.month==2)]
  df_winter.dropna(inplace=True)
  file_name_s1d = common_path+ "1day_lead_time/"+ resampling_type + location +split_type+ "_1d.csv"
  df_winter.to_csv(file_name_s1d)
  # update above for 3d
  df_new['fog_index_3d'] = np.nan
  #df_new.index = pd.to_datetime(df_new.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  df_new = df_new.sort_index()
  for row in df_new.itertuples():
        hr = row.Index + pd.Timedelta('3 days')
        #hr = hr.strftime('%d-%m-%Y %H:%M')
        k = df_new.index.get_indexer([hr], method='nearest', tolerance=pd.Timedelta('6hours'))[0] # Use get_indexer instead of get_loc
        df_new.loc[row.Index, 'fog_index_3d'] = df_new.iloc[k].fog_index
  df_winter = df_new.loc[(df_new.index.month==12)|(df_new.index.month==1)|(df_new.index.month==11)|(df_new.index.month==2)]
  df_winter.dropna(inplace=True)
  file_name_s3d = common_path+ "3day_lead_time/"+ resampling_type + location +split_type+ "_3d.csv"
  df_winter.to_csv(file_name_s3d)
  # update above for 5d
  df_new['fog_index_5d'] = np.nan
  #df_new.index = pd.to_datetime(df_new.index, format = '%d-%m-%Y %H:%M',errors='coerce')
  df_new = df_new.sort_index()
  for row in df_new.itertuples():
        hr = row.Index + pd.Timedelta('5 days')
        #hr = hr.strftime('%d-%m-%Y %H:%M')
        k = df_new.index.get_indexer([hr], method='nearest', tolerance=pd.Timedelta('6hours'))[0] # Use get_indexer instead of get_loc
        df_new.loc[row.Index, 'fog_index_5d'] = df_new.iloc[k].fog_index
  df_winter = df_new.loc[(df_new.index.month==12)|(df_new.index.month==1)|(df_new.index.month==11)|(df_new.index.month==2)]
  df_winter.dropna(inplace=True)
  file_name_s5d = common_path+ "5day_lead_time/"+ resampling_type + location +split_type+ "_5d.csv"
  df_winter.to_csv(file_name_s5d)
  return df_new

In [7]:
# paths = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/train_set_VIDP.csv"
# df = read_file(paths)
# df['date_time_IST'] = pd.to_datetime(df['date_time_IST'])
# df['date_time'] = df['date_time_IST'].dt.strftime('%Y-%m-%d %H:%M')
# # rename date_time_IST to date_time

# print(df.head(2))

In [8]:
# # Main file
if __name__=="__main__":
  # train_path = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/train_set.csv"
  # test_path = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/test_set.csv"
  # val_path = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/val_set.csv"
  train_path = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/train_set_VIJP.csv"
  test_path = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/test_set_VIJP.csv"
  val_path = "/content/drive/MyDrive/Fog Prediction 24/Raw_data/val_set_VIJP.csv"
  paths=[train_path,test_path,val_path]
  splits_type = ["train","test","val"]
  # for path in [train_path,test_path,val_path]:
  for (paths, splits_type) in zip(paths, splits_type):
    df = read_file(paths)
    df = fogindex_var(df)
    df_new=pd.DataFrame()
    df_new = resampling_data(df_new,df,"6h")
    df_new = new_var(df_new)
    df_new = target_var(df_new,'6hr_re1',splits_type,"_Jaipur_")
    #df_new = target_var(df_new,2,8,24,40,'3hr_re',splits_type)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_winter.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_winter.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_winter.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_winter.dropna(inplace=True)
A value is trying to be set on a copy of a s