In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("Monthly_data_cmo.csv")

In [None]:
data.head()

In [None]:
data.dtypes

In [None]:
data.drop(["state_name"], axis = 1, inplace =True)

In [None]:
data02 = data.copy()

In [None]:
data02["Commodity"] = [x.lower() for x in data02["Commodity"]]
data02.head()

In [None]:
data02[data02.duplicated(['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'], keep=False)]

In [None]:
data02[data02.duplicated(['APMC', 'Year', 'Month', 'Commodity', 'min_price'], keep=False)]

In [None]:
# Converting all Commodity names to lowercase in order to avoid duplicate entries
data["Commodity"] = [x.lower() for x in data["Commodity"]]

# Merging records with same APMC, Commodity, Year and Month


In [None]:
data02.index.name 

In [None]:
data02.head()

In [None]:
# Maximum arrival in qtls for each commodity across all APMCs, Years and Months

df01 = data.groupby(["Commodity"])

for com, com_df in df01:
    print(com, "\n")
    print(com_df[["APMC", "Year", "Month", "arrivals_in_qtl"]][com_df["arrivals_in_qtl"] == com_df["arrivals_in_qtl"].max()], "\n\n")

In [None]:
x = list(df01.groups.keys())
y = df01.arrivals_in_qtl.max()

plt.figure(figsize=(20,15))
sns.barplot(x, y)

In [None]:
# Total arrival in quintals of a particular commodity through out the time period.

df02 = data[["Commodity", "date", "arrivals_in_qtl"]]
df02.sort_values(by='date', inplace = True)

g = df02.groupby(["Commodity"])

for com, com_df in g:
    print(com)

In [None]:
df = data.copy()

df["Commodity"] = [x.lower() for x in df["Commodity"]]
df.head()

g = df.groupby("Commodity")

for com, com_df in g:
    print(com_df.loc[com_df[["APMC", "Year", "Month"]].duplicated(), :])

# Removing rows which have been accidently recorded. Every value accept 'Arrivals_in_Qtl' is similar.
# So, I add both the values and merge them into a single record

In [4]:
# Converting all Commodity names to lower case inorder to avoid any discrepancy

data03 = data.copy()
data03["Commodity"] = [x.lower() for x in data03["Commodity"]]
data03.head()

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name
0,Ahmednagar,bajri,2015,April,79,1406,1538,1463,2015-04,Ahmadnagar,Maharashtra
1,Ahmednagar,bajri,2016,April,106,1788,1925,1875,2016-04,Ahmadnagar,Maharashtra
2,Ahmednagar,wheat(husked),2015,April,1253,1572,1890,1731,2015-04,Ahmadnagar,Maharashtra
3,Ahmednagar,wheat(husked),2016,April,387,1750,2220,1999,2016-04,Ahmadnagar,Maharashtra
4,Ahmednagar,sorgum(jawar),2015,April,3825,1600,2200,1900,2015-04,Ahmadnagar,Maharashtra


In [5]:
# Obtaining the columns which seem to have erronous entries

data03[data03.duplicated(['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'], keep=False)]

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name
29431,Nagpur,garlic,2016,June,4901,5000,10000,8750,2016-06,Nagpur,Maharashtra
29449,Nagpur,garlic,2016,June,6897,5000,10000,8750,2016-06,Nagpur,Maharashtra
40288,Pune-Pimpri,garlic,2016,April,12,6000,6000,6000,2016-04,Pune,Maharashtra
40299,Pune-Pimpri,garlic,2016,April,15,6000,6000,6000,2016-04,Pune,Maharashtra
40828,Pune-Pimpri,garlic,2015,August,36,6000,6000,6000,2015-08,Pune,Maharashtra
40846,Pune-Pimpri,garlic,2015,August,27,6000,6000,6000,2015-08,Pune,Maharashtra
41723,Pune-Pimpri,garlic,2016,February,18,6000,6000,6000,2016-02,Pune,Maharashtra
41740,Pune-Pimpri,garlic,2016,February,24,6000,6000,6000,2016-02,Pune,Maharashtra
42146,Pune-Pimpri,garlic,2016,January,3,6000,6000,6000,2016-01,Pune,Maharashtra
42167,Pune-Pimpri,garlic,2016,January,24,6000,6000,6000,2016-01,Pune,Maharashtra


Total 20 records contain erroneous values

Therefore, we add the 'arrivals_in_qtl' for both the records and delete the extra record

In [6]:
j = np.array(data03.loc[data03.duplicated(['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'], keep=False)].index)

In [7]:
d = []

In [8]:
for i in range(0, len(j)-1, 2):
    data03.loc[j[i], "arrivals_in_qtl"] = data03.loc[j[i]]["arrivals_in_qtl"] + data03.loc[j[i+1]]["arrivals_in_qtl"]
    d.append(j[i+1])


In [9]:
data03.loc[j]

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name
29431,Nagpur,garlic,2016,June,11798,5000,10000,8750,2016-06,Nagpur,Maharashtra
29449,Nagpur,garlic,2016,June,6897,5000,10000,8750,2016-06,Nagpur,Maharashtra
40288,Pune-Pimpri,garlic,2016,April,27,6000,6000,6000,2016-04,Pune,Maharashtra
40299,Pune-Pimpri,garlic,2016,April,15,6000,6000,6000,2016-04,Pune,Maharashtra
40828,Pune-Pimpri,garlic,2015,August,63,6000,6000,6000,2015-08,Pune,Maharashtra
40846,Pune-Pimpri,garlic,2015,August,27,6000,6000,6000,2015-08,Pune,Maharashtra
41723,Pune-Pimpri,garlic,2016,February,42,6000,6000,6000,2016-02,Pune,Maharashtra
41740,Pune-Pimpri,garlic,2016,February,24,6000,6000,6000,2016-02,Pune,Maharashtra
42146,Pune-Pimpri,garlic,2016,January,27,6000,6000,6000,2016-01,Pune,Maharashtra
42167,Pune-Pimpri,garlic,2016,January,24,6000,6000,6000,2016-01,Pune,Maharashtra


In [19]:
data03.drop(d, axis = 1, inplace = True)

ValueError: labels [29449 40299 40846 41740 42167 42694 43631 49605 50294 50613] not contained in axis

In [20]:
data03.loc[j]

Unnamed: 0,APMC,Commodity,Year,Month,arrivals_in_qtl,min_price,max_price,modal_price,date,district_name,state_name
29431,Nagpur,garlic,2016,June,11798,5000,10000,8750,2016-06,Nagpur,Maharashtra
29449,Nagpur,garlic,2016,June,6897,5000,10000,8750,2016-06,Nagpur,Maharashtra
40288,Pune-Pimpri,garlic,2016,April,27,6000,6000,6000,2016-04,Pune,Maharashtra
40299,Pune-Pimpri,garlic,2016,April,15,6000,6000,6000,2016-04,Pune,Maharashtra
40828,Pune-Pimpri,garlic,2015,August,63,6000,6000,6000,2015-08,Pune,Maharashtra
40846,Pune-Pimpri,garlic,2015,August,27,6000,6000,6000,2015-08,Pune,Maharashtra
41723,Pune-Pimpri,garlic,2016,February,42,6000,6000,6000,2016-02,Pune,Maharashtra
41740,Pune-Pimpri,garlic,2016,February,24,6000,6000,6000,2016-02,Pune,Maharashtra
42146,Pune-Pimpri,garlic,2016,January,27,6000,6000,6000,2016-01,Pune,Maharashtra
42167,Pune-Pimpri,garlic,2016,January,24,6000,6000,6000,2016-01,Pune,Maharashtra


In [14]:
d

[29449, 40299, 40846, 41740, 42167, 42694, 43631, 49605, 50294, 50613]

In [None]:
# Ten most busiest APMCs

data.APMC.value_counts().head(10)

In [None]:
# Ten least busiest APMCs

data.APMC.value_counts().tail(10)