In [39]:
import os
import re
import pathlib
import pandas as pd
import numpy as np
import functools as ft

In [40]:
## Add your path here
# Everything is read from / written to "<working directory>/dataset".
current_path = os.getcwd()
main_path = os.path.join(current_path, 'dataset')

# Destination for the per-year combined CSV files.
out_folder = os.path.join(main_path, 'combined')
# makedirs also creates `main_path` when it is missing, and exist_ok=True makes
# the cell safe to re-run (no separate exists() check that could race).
os.makedirs(out_folder, exist_ok=True)

# Raw input folder; it is expected to hold one sub-folder per year, each
# containing that year's data files.
path = os.path.join(main_path, 'weather data telangana')

In [41]:
def remove_substrings(column_name):
    """Return `column_name` without parenthesised parts and trailing whitespace.

    Every "(...)" group is removed together with the whitespace directly in
    front of it, and any whitespace at the very end of the name is dropped,
    e.g. 'Rain (mm) ' -> 'Rain'. Leading whitespace is left untouched.
    """
    parenthesised_or_trailing_space = re.compile(r'\s*\([^)]*\)|\s+$')
    return parenthesised_or_trailing_space.sub('', column_name)

In [42]:
# (canonical name, spellings that are mapped to it)
_COLUMN_ALIASES = (
    ('rain', ('cumm_rainfall', 'rainfall', 'rain')),
    ('date', ('odate', 'date')),
    ('temp_min', ('min temp', 'min_temp', 'temp_min')),
    ('temp_max', ('max temp', 'max_temp', 'temp_max')),
    ('humidity_min', ('min humidity', 'min_humidity', 'humidity_min')),
    ('humidity_max', ('max humidity', 'max_humidity', 'humidity_max')),
    ('wind_speed_min', ('min wind speed', 'min_wind_speed', 'wind_speed_min')),
    ('wind_speed_max', ('max wind speed', 'max_wind_speed', 'wind_speed_max')),
)


def correct_column_names(column_name):
    """Map a known spelling of a column name to its canonical form.

    The comparison is exact (case-sensitive). A name that matches none of the
    known spellings is returned unchanged.
    """
    for canonical, spellings in _COLUMN_ALIASES:
        if column_name in spellings:
            return canonical
    return column_name

In [43]:
def cleaner(df):
    """Return a copy of `df` with normalised column names and a parsed date.

    Steps:
      1. Drop columns whose name starts with Unnamed, mcode, dcode, dmcode or
         row_id (case-insensitive).
      2. Strip parenthesised parts / trailing whitespace from the column names
         and lower-case them.
      3. Map known alternative spellings to their canonical name.
      4. Convert the 'date' column with `pd.to_datetime`.
      5. Reset the row index.

    Raises:
        KeyError: if no 'date' column exists after the renaming steps.
    """
    is_dropped = df.columns.str.contains(
        '^(?:Unnamed|mcode|dcode|dmcode|row_id)', case=False
    )
    df = df.loc[:, ~is_dropped]

    df = df.rename(columns=remove_substrings)
    # Lower-case BEFORE mapping aliases: correct_column_names compares
    # case-sensitively against lower-case spellings, so a header such as
    # 'ODate' or 'Min Temp' would otherwise never be canonicalised.
    df = df.rename(columns=str.lower)
    df = df.rename(columns=correct_column_names)

    if 'date' not in df.columns:
        raise KeyError(
            "no 'date' column after renaming; columns found: {}".format(list(df.columns))
        )
    # NOTE(review): no format/dayfirst is passed, so pandas infers the date
    # format - confirm the source files are not day-first.
    df = df.assign(date=pd.to_datetime(df['date']))
    return df.reset_index(drop=True)

In [44]:
def _read_weather_file(file_path):
    """Load one data file as a DataFrame; return None if no reader can parse it.

    CSV is tried first and Excel second, except for .xls/.xlsx files where the
    order is reversed. Failures are reported and the file is skipped so one bad
    file does not abort the whole year.
    """
    readers = [pd.read_csv, pd.read_excel]
    if os.path.splitext(file_path)[1].lower() in ('.xls', '.xlsx'):
        readers.reverse()

    failures = []
    for reader in readers:
        try:
            return reader(file_path)
        except Exception as exc:  # best effort; never swallow KeyboardInterrupt
            failures.append('{}: {}'.format(reader.__name__, exc))
    print('Skipping {} ({})'.format(file_path, '; '.join(failures)))
    return None


# Sorted so the processing order (and therefore the row order of the output)
# does not depend on the arbitrary order returned by os.listdir.
years = sorted(os.listdir(path))
for year in years:
    year_dir = os.path.join(path, year)
    if not os.path.isdir(year_dir):
        # Stray files next to the year folders are not data sets.
        continue
    print(year)

    ## Reading and cleaning every file of the year
    monthly_frames = []
    for file_name in sorted(os.listdir(year_dir)):
        raw = _read_weather_file(os.path.join(year_dir, file_name))
        if raw is not None:
            monthly_frames.append(cleaner(raw))

    if not monthly_frames:
        print('No readable files for ' + year + '; nothing written')
        continue

    # One concat per year instead of growing a frame inside the loop.
    final_dataframe = pd.concat(monthly_frames, ignore_index=True)

    # NOTE(review): missing readings become 0, which cannot be told apart from
    # a genuine 0 afterwards and lowers any mean computed from this file -
    # confirm this is intended.
    final_dataframe = final_dataframe.fillna(0)
    # Map any column that still carries an alternative spelling.
    final_dataframe = final_dataframe.rename(columns=correct_column_names)

    year_string = 'TS Weather data ' + year + '.csv'
    final_dataframe.to_csv(os.path.join(out_folder, year_string), index=False)

2018
2019
2020
2021
2022


In [45]:
# Per-district daily means, one output file per combined yearly file.
district_folder = os.path.join(main_path, 'district-group')
os.makedirs(district_folder, exist_ok=True)

for file_name in sorted(os.listdir(out_folder)):
    if not file_name.lower().endswith('.csv'):
        # Ignore anything that is not one of the combined CSV files.
        continue
    yearly = pd.read_csv(os.path.join(out_folder, file_name))
    # numeric_only=True: averaging a text column raises TypeError on
    # pandas >= 2.0, so restrict the mean to the numeric measurements.
    district_daily = yearly.groupby(['district', 'date']).mean(numeric_only=True)
    # The (district, date) index is written on purpose - it holds the keys.
    district_daily.to_csv(os.path.join(district_folder, file_name))

In [46]:
# Stack every per-district yearly file into a single data set.
district_frames = [
    pd.read_csv(os.path.join(district_folder, file_name))
    for file_name in sorted(os.listdir(district_folder))
    if file_name.lower().endswith('.csv')
]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(district_frames, ignore_index=True) if district_frames else pd.DataFrame()

full_merged_dataset_dir = os.path.join(main_path, 'super_merged_data')
full_merged_output_path = os.path.join(full_merged_dataset_dir, 'complete_merged.csv')
os.makedirs(full_merged_dataset_dir, exist_ok=True)

# NOTE(review): the RangeIndex is written too, which shows up as an unnamed
# first column when the file is read back. Kept for compatibility with
# existing readers - switch to index=False once they are confirmed.
df.to_csv(full_merged_output_path)