In [None]:
# Importing libraries

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Reading data: load the monthly commodity records from a CSV in the working directory

data = pd.read_csv("Monthly_data_cmo.csv")
data.head()  # preview the first rows to confirm the file parsed as expected

In [None]:
# Remove the state_name column from the working frame (mutates `data` in place)
data.drop(columns=["state_name"], inplace=True)

# Data Manipulation

-  Removing Duplicate/Erroneous Records

In [None]:
# Distinct commodity names exactly as they were recorded
data["Commodity"].unique()

**As can be observed, many commodities appear twice under the same name because of inconsistent capitalisation.** <br/>
For example: *'Paddy-Unhusked' and 'PADDY-UNHUSKED', 'Bajri' and 'BAJRI', 'Garlic' and 'GARLIC'*  <br/>
**So, I convert all the commodity names to lowercase and merge the records of identical commodities.** <br/>

In [None]:
# Converting all Commodity names to lowercase in order to avoid duplicate entries.
# The vectorised .str accessor leaves a missing value as NaN instead of raising
# AttributeError the way a per-element x.lower() call does.

data["Commodity"] = data["Commodity"].str.lower()

**I treat records registered under the same commodity name at the same APMC on the same date, with the same minimum, maximum and modal prices, as duplicates of one another and merge them.**

In [None]:
# Every row whose (APMC, Commodity, date, min/max/modal price) combination occurs
# more than once. keep=False flags all members of a duplicate group, not only the repeats.

data.loc[
    data.duplicated(
        subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
        keep=False,
    )
]

**As can be observed, registering commodities both in all capitals and in mixed case has caused duplicate/erroneous entries.** <br/>
<br/>

In [None]:
# (rows, columns) of the subset of rows that belong to a duplicate group
data.loc[
    data.duplicated(
        subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
        keep=False,
    )
].shape


**So, there are 20 such records which need to be merged** <br/>
<br/>
**To merge these records, I add the total arrival in quintals and assign it to a single record and delete the second one.**

In [None]:
# Index labels of every row that belongs to a duplicate group (all members, via keep=False)

j = np.array(
    data.index[
        data.duplicated(
            subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
            keep=False,
        )
    ]
)
d = []  # will hold the index labels of the rows to delete

In [None]:
# Adding and assigning total arrivals in quintal to one record
#
# The flagged rows are grouped by the same key columns that were used to detect
# them, so every duplicate group is merged correctly even when its members are
# not adjacent in `j` or when a group has more than two rows. Pairing j[i] with
# j[i+1] silently assumed both.
#
# NOTE: this cell reads and overwrites arrivals_in_qtl and appends to `d`, so
# running it twice without reloading `data` adds the arrivals a second time.

key_cols = ['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price']

# dropna=False keeps groups whose key contains NaN, matching how duplicated() compares rows
for _, group in data.loc[j].groupby(key_cols, sort=False, dropna=False):
    keeper, *extras = group.index
    # skipna=False: a missing arrival makes the total NaN, exactly as plain `+` would
    data.loc[keeper, "arrivals_in_qtl"] = group["arrivals_in_qtl"].sum(skipna=False)
    d.extend(extras)

data.loc[j]

**As it can be observed, values of alternate records starting from first one have been updated**<br/>

**Size of data before dropping extra rows**

In [None]:
# (rows, columns) while the rows listed in `d` are still present
data.shape

In [None]:
# Delete the rows whose index labels were collected in `d` (mutates `data` in place)
data.drop(index=d, inplace=True)

**Size of data after dropping extra rows**

In [None]:
# (rows, columns) once the rows listed in `d` have been removed
data.shape

**Therefore, 10 duplicate/erroneous records have been dropped**

# Comparison with Minimum Support Price
 - Comparing the commodities whose MS Price is available in data given with their Minimum Price, Maximum Price and Modal Price

In [None]:
# Reading file in which MSPs are given (CSV expected in the working directory)

file = pd.read_csv("CMO_MSP_Mandi.csv")
file.head()  # preview the first rows to confirm the file parsed as expected

In [None]:
# (rows, columns) of the MSP table as loaded
file.shape

In [None]:
# Years covered by the monthly price records
data["Year"].unique()

In [None]:
# Years covered by the MSP table
file["year"].unique()

**As it can be observed, 'file' contains MSPs for years 2012 to 2016, but Min, Max and Modal prices are known for year 2014 to 2016 only.** <br/>
**So, rows having MSPs for years earlier than 2014 can be dropped**

In [None]:
# Remove MSP rows for years before 2014 (in place), then report the remaining (rows, columns)
file.drop(index=file.index[file["year"] < 2014], inplace=True)
file.shape

**More than 50 records dropped**

In [None]:
# Number of missing values in each column of `file`

file.isna().sum()

**10 records where MSP is missing** <br/>
**So we remove these records**

In [None]:
# Rows of the MSP table whose ms_price is missing
file.loc[file["ms_price"].isna()]

In [None]:
# Drop, in place, every row that has a missing value in at least one column.
# NOTE(review): this is broader than "ms_price is missing"; confirm no other
# column contains nulls, or restrict the call with subset=["ms_price"].
file.dropna(axis=0, how="any", inplace=True)

In [None]:
# (rows, columns) after rows with missing values were removed
file.shape

**10 records dropped.**

In [None]:
# Distinct values stored in the msp_filter column
file["msp_filter"].unique()

MSP Filter is same for all records, therefore, it can be dropped.

In [None]:
# Remove the msp_filter column from the MSP table (mutates `file` in place)
file.drop(columns=["msp_filter"], inplace=True)

### Now, to compare the prices, I create a new dataframe which contains data only about the commodities for which all 4 prices(Min, Max, Modal, MSP) are known. <br/>
**So, I perform an inner join on the existing dataframes**

In [None]:
# Converting all Commodity names to lowercase so they match the lowercased
# names in `data`. The vectorised .str accessor is used instead of a
# per-element x.lower() call, which raises AttributeError on a non-string value.

file["commodity"] = file["commodity"].str.lower()

In [None]:
# Rows of the MSP table that share the same (commodity, year, Type); an empty
# result means every such combination occurs once
file.loc[file.duplicated(subset=['commodity', 'year', 'Type'], keep=False)]

Therefore, no duplicate records found

In [None]:
# Renaming column names to be consistent across both files.
# An explicit old -> new mapping does not depend on the column order of the CSV
# or on how many columns remain at this point, so a column can never be
# mislabelled silently. 'Type' and 'ms_price' already carry the names used later.

file.rename(columns={"commodity": "Commodity", "year": "Year"}, inplace=True)

In [None]:
# Inner join: keep only (Commodity, Year) combinations present in both tables
combined = file.merge(data, how="inner", on=["Commodity", "Year"])

In [None]:
# Preview of the joined table
combined.head()

In [None]:
# Organising dataframe: discard Month and district_name, then keep the listed
# columns in a fixed order (any column not listed, such as Type, is left out)

combined = combined.drop(columns=["Month", "district_name"])
combined = combined[
    [
        'Commodity',
        'APMC',
        'Year',
        'date',
        'min_price',
        'modal_price',
        'max_price',
        'ms_price',
        'arrivals_in_qtl',
    ]
]

In [None]:
# Preview of the joined table after the columns were reduced and reordered
combined.head()

### Observations across clusters formed by Commodities and APMCs

In [None]:
# One group per (Commodity, APMC) pair
g = combined.groupby(by=["Commodity", "APMC"])

In [None]:
# Plotting Minimum, Maximum, Modal and MS prices for each cluster: APMC and Commodity

# (column, marker, colour, legend label) for each price line drawn on a figure
price_series = [
    ("min_price", "+", "blue", "Minimum Price"),
    ("max_price", "s", "green", "Maximum Price"),
    ("modal_price", "o", "yellow", "Modal Price"),
    ("ms_price", "*", "red", "Minimum Support Price"),
]

for k, df in g:
    print(k)
    # Sort into a new frame: sorting a groupby slice in place mutates a derived
    # object and can raise pandas' SettingWithCopyWarning.
    ordered = df.sort_values(by="date")
    fig, ax = plt.subplots()
    for column, marker, color, label in price_series:
        ax.plot("date", column, data=ordered, marker=marker, color=color, label=label)
    ax.set_title(k)
    ax.set_xlabel("date")
    ax.set_ylabel("price")
    ax.tick_params(axis="x", labelrotation=45)  # keep the date labels readable
    ax.legend()
    plt.show()
    # Release the figure; otherwise one figure per (Commodity, APMC) pair stays open.
    plt.close(fig)