In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the monthly commodity records from CSV and preview the first rows

data = pd.read_csv(filepath_or_buffer="Monthly_data_cmo.csv")
data.head(n=5)

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name
0,Ahmednagar,Bajri,2015,April,79,1406,1538,1463,2015-04,Ahmadnagar,Maharashtra
1,Ahmednagar,Bajri,2016,April,106,1788,1925,1875,2016-04,Ahmadnagar,Maharashtra
2,Ahmednagar,Wheat(Husked),2015,April,1253,1572,1890,1731,2015-04,Ahmadnagar,Maharashtra
3,Ahmednagar,Wheat(Husked),2016,April,387,1750,2220,1999,2016-04,Ahmadnagar,Maharashtra
4,Ahmednagar,Sorgum(Jawar),2015,April,3825,1600,2200,1900,2015-04,Ahmadnagar,Maharashtra


In [3]:
# Remove the state_name column in place (every row in the preview above shows 'Maharashtra')
data.drop(columns=["state_name"], inplace=True)

In [4]:
# Distinct commodity names exactly as recorded (before any normalisation)
data["Commodity"].unique()

array(['Bajri', 'Wheat(Husked)', 'Sorgum(Jawar)', 'Maize', 'Gram',
       'Horse Gram', 'Matki', 'Pigeon Pea (Tur)', 'Black Gram',
       'Castor Seed', 'Soybean', 'Jaggery', 'Lemon', 'Ginger (Fresh)',
       'Potato', 'Ladies Finger', 'Flower', 'Carrot', 'Cluster Bean',
       'Ghevda', 'Ghosali(Bhaji)', 'Mango(Raw)', 'Cucumber', 'Onion',
       'Bitter Gourd', 'Cabbage', 'Garlic', 'Math (Bhaji)', 'Capsicum',
       'Tomato', 'Brinjal', 'Tamarind', 'Tamarind Seed',
       'Coriander (Dry)', 'Green Chilli', 'Chillies(Red)', 'Mustard',
       'Paddy-Unhusked', 'Hilda', 'Chikoo', 'Cotton',
       'Ground Nut Pods (Dry)', 'Pomegranate', 'Papai', 'Melon',
       'Beet Root', 'Bottle Gourd', 'Dhemse', 'Coriander ', 'Coriander  ',
       'Spinach', 'Shevga', 'Small Gourd', 'Grapes', 'Kharbuj',
       'Green Gram', 'Sunflower', 'Safflower', 'Mango', 'Water Melon',
       'Mosambi', 'Orange', 'Fenugreek', 'Cowpea', 'Green Peas (Dry)',
       'Squash Gourd', 'Maize (Corn.)', 'Chino', 'Curry Lea

**As can be observed, many commodities share the same name but are recorded as two different commodities because of differences in capitalisation.** <br/>
For example: *'Paddy-Unhusked' and 'PADDY-UNHUSKED', 'Bajri' and 'BAJRI', 'Garlic' and 'GARLIC'*  <br/>
**So, I convert all the commodity names to lowercase and merge the records of identical commodities.** <br/>

In [5]:
# Normalising Commodity names so that spelling variants collapse into a single entry:
#   - lower-casing merges variants such as 'Garlic' / 'GARLIC'
#   - stripping surrounding whitespace merges variants such as 'Coriander ' / 'Coriander  '
# The vectorised .str accessor propagates missing values as NaN instead of raising on them.

data["Commodity"] = data["Commodity"].str.strip().str.lower()

**I merge records that are registered under the same commodity name at the same APMC on the same date with the same minimum, maximum and modal prices.**

In [6]:
# Every row that shares APMC, Commodity, date and all three prices with at least one other row.

data.loc[
    data.duplicated(
        subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
        keep=False,
    )
]

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name
29431,Nagpur,garlic,2016,June,4901,5000,10000,8750,2016-06,Nagpur
29449,Nagpur,garlic,2016,June,6897,5000,10000,8750,2016-06,Nagpur
40288,Pune-Pimpri,garlic,2016,April,12,6000,6000,6000,2016-04,Pune
40299,Pune-Pimpri,garlic,2016,April,15,6000,6000,6000,2016-04,Pune
40828,Pune-Pimpri,garlic,2015,August,36,6000,6000,6000,2015-08,Pune
40846,Pune-Pimpri,garlic,2015,August,27,6000,6000,6000,2015-08,Pune
41723,Pune-Pimpri,garlic,2016,February,18,6000,6000,6000,2016-02,Pune
41740,Pune-Pimpri,garlic,2016,February,24,6000,6000,6000,2016-02,Pune
42146,Pune-Pimpri,garlic,2016,January,3,6000,6000,6000,2016-01,Pune
42167,Pune-Pimpri,garlic,2016,January,24,6000,6000,6000,2016-01,Pune


**As can be observed, registering the same commodity both in all capitals and in title case has caused duplicate/erroneous entries.** <br/>
<br/>

In [7]:
# (rows, columns) of the duplicate-row selection
data.loc[
    data.duplicated(
        subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
        keep=False,
    )
].shape

(20, 10)


**So, there are 20 such records which need to be merged** <br/>
<br/>
**To merge these records, I sum their arrivals (in quintals), assign the total to one record of each pair and delete the other one.**

In [8]:
# j: index labels of every row that belongs to a duplicate group
# d: initially empty list that collects the labels of the rows to delete later

d = []
dup_mask = data.duplicated(
    subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
    keep=False,
)
j = np.array(data.index[dup_mask.values])

In [9]:
# Adding and assigning total arrivals in quintal to one record per duplicate group
#
# Rows are grouped by the duplicate key itself rather than paired by their position in j,
# so the totals stay correct when a group has more than two rows or when rows of
# different groups are interleaved in index order.
# The first row of each group receives the group's total; all later rows are queued in d
# for deletion.
# NOTE: like any in-place accumulation, this cell must be run once per fresh kernel;
# re-running it adds the arrivals again.

dup_key_cols = ['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price']
dup_rows = data.loc[j]

# Total arrivals of each group, broadcast back onto every member row.
# assumes the key columns contain no missing values (groupby drops NaN keys by default)
group_totals = dup_rows.groupby(dup_key_cols, sort=False)["arrivals_in_qtl"].transform("sum")

# First occurrence in each group survives; the remaining occurrences are surplus.
is_survivor = ~dup_rows.duplicated(subset=dup_key_cols, keep="first")
survivor_labels = dup_rows.index[is_survivor.values]
surplus_labels = dup_rows.index[(~is_survivor).values]

# Assignment aligns on index labels, so each survivor gets its own group's total.
data.loc[survivor_labels, "arrivals_in_qtl"] = group_totals.loc[survivor_labels]
d.extend(surplus_labels)

data.loc[j]

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name
29431,Nagpur,garlic,2016,June,11798,5000,10000,8750,2016-06,Nagpur
29449,Nagpur,garlic,2016,June,6897,5000,10000,8750,2016-06,Nagpur
40288,Pune-Pimpri,garlic,2016,April,27,6000,6000,6000,2016-04,Pune
40299,Pune-Pimpri,garlic,2016,April,15,6000,6000,6000,2016-04,Pune
40828,Pune-Pimpri,garlic,2015,August,63,6000,6000,6000,2015-08,Pune
40846,Pune-Pimpri,garlic,2015,August,27,6000,6000,6000,2015-08,Pune
41723,Pune-Pimpri,garlic,2016,February,42,6000,6000,6000,2016-02,Pune
41740,Pune-Pimpri,garlic,2016,February,24,6000,6000,6000,2016-02,Pune
42146,Pune-Pimpri,garlic,2016,January,27,6000,6000,6000,2016-01,Pune
42167,Pune-Pimpri,garlic,2016,January,24,6000,6000,6000,2016-01,Pune


**As can be observed, the arrivals of every alternate record, starting from the first one, have been updated.**<br/>

**Size of data before dropping extra rows**

In [10]:
# (rows, columns) of the frame before the surplus duplicate rows are dropped
data.shape

(62429, 10)

In [11]:
# Delete, in place, the rows whose index labels were collected in d
data.drop(index=d, inplace=True)

**Size of data after dropping extra rows**

In [12]:
# (rows, columns) of the frame after the surplus duplicate rows were dropped
data.shape

(62419, 10)

**Therefore, 10 duplicate/erroneous records have been dropped**