In [3]:
import pandas as pd
import calendar
import glob

#### Weather Data

In [29]:
def compile_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    # keeping data only of the relevant cities (Adilabad, Nizamabad, Karimnagar, Khammam and Warangal)
    warangal_varaints = ["Warangal (U)", "Warangal (R)", "Warangal", "Warangal Rural", "Warangal Urban"]
    cities = ["Adilabad", "Nizamabad", "Karimnagar", "Khammam"]
    
    warangal_df = df.loc[df["district"].isin(warangal_varaints)]
    df = df.loc[df["district"].isin(cities)]

    # getting month out of odate attribute
    df["month"] = pd.DatetimeIndex(df["odate"], dayfirst=True).month
    df["month"] = df["month"].apply(lambda x: calendar.month_abbr[x])

    # drop unnecessary rows
    df.drop(columns=["mandal","odate"], inplace=True)

    # grouping the df by district and month and getting mean values for every month
    df = df.groupby(by=["district", "month"]).mean().reset_index()

    # doing the same operations for warangal_df
    warangal_df["month"] = pd.DatetimeIndex(warangal_df["odate"], dayfirst=True).month
    warangal_df["month"] = warangal_df["month"].apply(lambda x: calendar.month_abbr[x])
    warangal_df.drop(columns=["mandal", "odate"], inplace=True)
    warangal_df = warangal_df.groupby(by=["district", "month"]).mean().reset_index()

    # getting mean data for warangal
    warangal_df = warangal_df.set_index(["district"]).groupby(by="month").mean().reset_index()
    warangal_df["district"] = "Warangal"

    # appending warangal data back to df
    df = df.append(warangal_df)

    return df

In [40]:
def make_csv_weather(path: str, year: int):
    files_list = glob.glob(path + "/*.csv")
    col_names = ["district", "mandal", "odate", "cumm_rainfall", "temp_min", "temp_max", "humidity_min", "humidity_max", "wind_speed_min", "wind_speed_max"]
    main_df = pd.read_csv(files_list[0], skiprows=1, names=col_names)

    if len(files_list) != 1:
        for i in range(1, len(files_list)):
            data = pd.read_csv(files_list[i], skiprows=1, names=col_names)
            main_df = pd.concat([main_df, data])
    
    main_df.sort_values(by=["district", "mandal", "odate"], inplace=True)
    main_df = compile_weather_data(main_df)
    main_df.to_csv(f'./Weather_Data/monthly_weather_data_{year}.csv', header=True, index=False)

In [41]:
make_csv_weather('./Weather_Data/daily_weather_data_2018/', 2018)
make_csv_weather('./Weather_Data/telangana-weather-data-2019--All-2023-02-14_1908/', 2019)
make_csv_weather('./Weather_Data/telangana-weather-data-2020--All-2023-02-14_1908/', 2020)
make_csv_weather('./Weather_Data/telangana-weather-data-2021-All-2023-02-14_1907/', 2021)
make_csv_weather('./Weather_Data/telangana-weather-data-2022-All-2023-02-14_1907/', 2022)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = pd.DatetimeIndex(df["odate"], dayfirst=True).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["month"].apply(lambda x: calendar.month_abbr[x])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = v

#### Vehicle Purchase Data

In [None]:
def make_csv_vehicles(path: str, year):
    files_list = glob.glob(path + "/*.csv")

    main_df = pd.read_csv(files_list[0], encoding='utf-8')
    for i in range(1, len(files_list)):
        data = pd.read_csv(files_list[i], encoding='utf-8')
        main_df = pd.concat([main_df, data])
    
    main_df.sort_values(by="fromdate", inplace=True)
    main_df.to_csv(f'./Vehicle_Data/vehicle_purchase_data_{year}.csv', header=True, index=False)
    

In [None]:
make_csv_vehicles("./Vehicle_Data/2019/", 2019)
make_csv_vehicles("./Vehicle_Data/2020/", 2020)
make_csv_vehicles("./Vehicle_Data/2021/", 2021)
make_csv_vehicles("./Vehicle_Data/2022/", 2022)

#### Industrial Consumption Data

In [42]:
def compile_industry_data(df: pd.DataFrame) -> pd.DataFrame:
    # dropping unnecessary columns
    df.drop(columns=["Division", "SubDivision", "Section", "Area", "CatCode", "CatDesc", "TotServices", "BilledServices"], inplace=True)

    # keeping only required cities
    df = df.loc[df["Circle"].isin(["ADILABAD", "NIZAMABAD", "KARIMNAGAR", "KHAMMAM", "WARANGAL"])]

    # grouping by district/circle and month and finding total energy consumption
    df = df.groupby(by=["Circle", "Month"]).sum().reset_index()

    return df

In [43]:
def make_csv_industry(path: str, year: int):
    files_list = glob.glob(path + "/*.csv")

    main_df = pd.read_csv(files_list[0])
    main_df["Month"] = files_list[0][67:70]
    for i in range(1, len(files_list)):
        data = pd.read_csv(files_list[i])
        data["Month"] = files_list[i][67:70]
        main_df = pd.concat([main_df, data])
    
    main_df = compile_industry_data(main_df)

    main_df['Circle'] = main_df['Circle'].apply(lambda x: x.title())
    main_df['Month'] = main_df['Month'].apply(lambda x: x.title())

    main_df.to_csv(f'./Industry_Consumption/industrial_consumption_data_{year}.csv', header=["district", "month", "units", "load"], index=False)


In [44]:
make_csv_industry('./Industry_Consumption/2019/', 2019)
make_csv_industry('./Industry_Consumption/2020/', 2020)
make_csv_industry('./Industry_Consumption/2021/', 2021)
make_csv_industry('./Industry_Consumption/2022/', 2022)

#### AQI Data

In [9]:
def compile_aqi_data(sheet_name: str) -> None:
    df = pd.read_excel('./AQI_Data/aqi_data.xlsx', sheet_name=sheet_name, index_col=None)

    # taking average of 2 monitoring stations in warangal
    warangal_df = df.loc[df["Location"].isin(["Kuda, warangal", "Mee-Seva, Warangal"])].drop(columns="Location").mean()
    warangal_df['Location'] = "Warangal"

    df = df.loc[df["Location"].isin(["Nizamabad", "Adilabad", "Karimnagar", "Khammam"])]
    df = df.append(warangal_df, ignore_index=True)

    # taking transpose of df
    df = df.T

    # making first row as the header and dropping it
    df.columns = df.iloc[0]
    df.drop(df.index[0], inplace=True)

    # flattening 2D df to 1D with index as Location and Month
    df = df.T.stack().reset_index()

    df.to_csv(f'./AQI_Data/monthly_aqi_data_{sheet_name}.csv', header=["district", "month", "aqi"], index=False)

In [10]:
compile_aqi_data(sheet_name="2017")
compile_aqi_data(sheet_name="2018")
compile_aqi_data(sheet_name="2019")
compile_aqi_data(sheet_name="2020")
compile_aqi_data(sheet_name="2021")
compile_aqi_data(sheet_name="2022")

#### Combining CSVs

In [13]:
def combine_csv(path: str, data: str) -> None:
    files_list = glob.glob(path + "/*.csv")

    main_df = pd.read_csv(files_list[0])
    main_df["year"] = files_list[0][-8:-4]
    for i in range(1, len(files_list)):
        df = pd.read_csv(files_list[i])
        df["year"] = files_list[i][-8:-4]

        main_df = pd.concat([main_df, df])
    
    main_df.to_csv(f"./monthly_{data}_data.csv", header=True, index=False)

In [12]:
combine_csv("./AQI_Data/", "aqi")
combine_csv("./Industry_Consumption/", "industry_consumption")
combine_csv("./Weather_Data/", "weather")