In [1]:
import pandas as pd

### Extracting proportions of export countries have made

In [2]:
# change these paths as necessary
VOLZA_PATH = "/home/pramod/MS/LabV2/onr_price_prediction/volza/magnesium/magnesium.csv"
SPOT_PRICES_PETROLEUM = "/home/pramod/MS/LabV2/onr_price_prediction/volza/petroleum/petrol_crude_oil_spot_price.csv"

volza = pd.read_csv(VOLZA_PATH)
volza = volza.dropna(subset=["Country of Origin"])
# filter out just the rows where the std. unit is in KG or TNE or TON
volza = volza[volza["Std. Unit"].isin(["KGS", "TNE", "TON"])]
spot_prices = pd.read_csv(SPOT_PRICES_PETROLEUM, sep=";")

In [3]:
volza['Year'] = pd.DatetimeIndex(volza['Date']).year
volza['Month'] = pd.DatetimeIndex(volza['Date']).month

In [4]:
import numpy as np

# defining some constants
proportions_of_sales_by_country_and_months = []
_country_keys = np.unique(volza["Country of Origin"])
months = list(range(1, 13))
years = list(range(2020, 2023))
conversion_factor = {
    "KGS": 1,
    "Kgs": 1,
    "TON": 907.185,
    "TNE": 1000
}

# calculating the proportions
for year in years:
    for month in months:
        volza_month_year = volza[(volza["Month"] == month) & (volza["Year"] == year)]
        curr_row = { country: 0 for country in _country_keys }
        _sum = 0
        for i in range(volza_month_year.shape[0]):
            country = volza.iloc[i]["Country of Origin"]
            quantity = volza.iloc[i]["Std. Quantity"]
            unit = volza.iloc[i]["Std. Unit"]
            curr_row[country] += quantity * conversion_factor[unit]
            _sum += quantity
        for country in _country_keys:
            curr_row[country] = round(curr_row[country] / _sum, 4)
        curr_row["Month"] = month
        curr_row["Year"] = year
        proportions_of_sales_by_country_and_months.append(curr_row)

### Monthly commodity price extraction

In [5]:
commodity_prices = pd.read_csv("/home/pramod/MS/LabV2/onr_price_prediction/volza/magnesium/magnesium_price_2.csv")
commodity_prices['Year'] = pd.DatetimeIndex(commodity_prices['Date']).year
commodity_prices['Month'] = pd.DatetimeIndex(commodity_prices['Date']).month
commodity_prices["Price"] = pd.to_numeric(commodity_prices["Price"].str.replace(",", ""))

_idx = 0
for year in years:
    for month in months:
        monthly_prices = commodity_prices[(commodity_prices["Month"] == month) & (commodity_prices["Year"] == year)]
        proportions_of_sales_by_country_and_months[_idx]["median_commodity_price"] = np.median(monthly_prices["Price"])
        _idx += 1

### Extracting petroleum prices on a monthly basis

In [6]:
START_DATE = "2020-01-01"
END_DATE = "2022-12-31"
spot_prices_filtered = spot_prices[(spot_prices.Date >= START_DATE) & (spot_prices.Date <= END_DATE)]
print(spot_prices_filtered.shape)

(1534, 10)


In [7]:
spot_prices_filtered['Year'] = pd.DatetimeIndex(spot_prices_filtered['Date']).year
spot_prices_filtered['Month'] = pd.DatetimeIndex(spot_prices_filtered['Date']).month

_idx = 0
for year in years:
    for month in months:
        monthly_prices = spot_prices_filtered[(spot_prices_filtered["Month"] == month) & (spot_prices_filtered["Year"] == year)]
        proportions_of_sales_by_country_and_months[_idx]["median_petroleum_price"] = np.median(monthly_prices["Value"])
        _idx += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spot_prices_filtered['Year'] = pd.DatetimeIndex(spot_prices_filtered['Date']).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spot_prices_filtered['Month'] = pd.DatetimeIndex(spot_prices_filtered['Date']).month


### Checking for correlation

In [8]:
proportion_df = pd.DataFrame(proportions_of_sales_by_country_and_months)
proportion_df.drop(columns=["Year", "Month"], inplace=True)

In [9]:
correlations = proportion_df.corr()
correlations.to_csv("/home/pramod/MS/LabV2/onr_price_prediction/analysis/correlations.csv")