In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the monthly APMC price file with 'date' parsed as datetimes and used as the index.
# NOTE(review): the next cell reads the same CSV again and rebinds `data`, so the
# date-indexed frame built here is discarded.
data = pd.read_csv("Monthly_data_cmo.csv", parse_dates=["date"], index_col='date')

In [None]:
# Load the monthly APMC price file, keeping 'date' as an ordinary column (later cells
# select and sort by it) but parsing it as datetimes so sorting and plotting by date
# are chronological rather than textual.
data = pd.read_csv("Monthly_data_cmo.csv", parse_dates=["date"])

In [None]:
data.head()

In [None]:
data.dtypes

In [None]:
# The state name is not used in this analysis; remove the column from `data` itself.
data.drop(columns="state_name", inplace=True)

In [None]:
data02 = data.copy()

In [None]:
# Normalise commodity names to lower case so spellings that differ only in
# capitalisation collapse to one value. `.str.lower()` is vectorised and leaves a
# missing name as NaN instead of raising AttributeError on it.
data02["Commodity"] = data02["Commodity"].str.lower()
data02.head()

In [None]:
# Every row that shares market, commodity, date and all three prices with another row.
data02.loc[
    data02.duplicated(
        subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
        keep=False,
    )
]

In [None]:
# Every row that shares market, year, month, commodity and minimum price with another row.
data02.loc[
    data02.duplicated(subset=['APMC', 'Year', 'Month', 'Commodity', 'min_price'], keep=False)
]

In [None]:
# Lower-case every commodity name so case-only variants are not treated as distinct
# commodities. `.str.lower()` keeps a missing name as NaN rather than failing on it.
data["Commodity"] = data["Commodity"].str.lower()

# TODO: merge records that share APMC, Commodity, Year and Month (nothing in this cell does it yet).


In [None]:
data02.index.name 

In [None]:
data02.head()

In [None]:
df02 = data.groupby(["Commodity", "Year"])

In [None]:
# For every commodity, print the market / year / month row(s) holding its largest
# recorded arrival (in quintals) across the whole data set.

df01 = data.groupby(["Commodity"])

for com, com_df in df01:
    at_peak = com_df["arrivals_in_qtl"] == com_df["arrivals_in_qtl"].max()
    print(com, "\n")
    print(com_df.loc[at_peak, ["APMC", "Year", "Month", "arrivals_in_qtl"]], "\n\n")

In [None]:
# Largest single arrival per commodity, one bar per commodity.
# The bar labels are taken from the index of the aggregated Series itself, so labels
# and heights cannot get out of step, and the data are passed to seaborn by keyword
# (current seaborn releases no longer accept x and y positionally).
y = df01.arrivals_in_qtl.max()
x = list(y.index)

plt.figure(figsize=(20,15))
sns.barplot(x=x, y=y.values)

In [None]:
# Arrivals per commodity over the whole period, ordered by date.
# The sort is chained onto the column selection so `df02` is a fresh frame; sorting a
# slice of `data` in place triggers pandas' SettingWithCopyWarning.

df02 = data[["Commodity", "date", "arrivals_in_qtl"]].sort_values(by='date')

g = df02.groupby(["Commodity"])

# NOTE(review): the loop only lists the commodity names; no total is computed yet.
for com, com_df in g:
    print(com)

In [None]:
data.index

In [None]:
# Work on a copy with lower-cased commodity names (NaN-safe via `.str.lower()`), then,
# commodity by commodity, print the rows whose (APMC, Year, Month) already appeared
# earlier within that commodity.
df = data.copy()
df["Commodity"] = df["Commodity"].str.lower()

g = df.groupby("Commodity")

for com, com_df in g:
    print(com_df.loc[com_df[["APMC", "Year", "Month"]].duplicated(), :])

# Removing rows which have been accidentally recorded. Every value except 'arrivals_in_qtl' is identical.
# So, I add both values and merge them into a single record

In [None]:
# Convert all commodity names to lower case in order to avoid any discrepancy between
# differently capitalised spellings. `.str.lower()` leaves a missing name as NaN
# instead of raising on it.

data03 = data.copy()
data03["Commodity"] = data03["Commodity"].str.lower()
data03.head()

In [None]:
# Show the rows that appear to be erroneous repeats: same market, commodity, date and
# prices as at least one other row.

data03.loc[
    data03.duplicated(
        subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
        keep=False,
    )
]

In total, 20 records contain erroneous values.

Therefore, we add the 'arrivals_in_qtl' for both the records and delete the extra record

In [None]:
# Index labels of every row that belongs to a set of repeated records, as a NumPy array.
is_repeat = data03.duplicated(
    subset=['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price'],
    keep=False,
)
j = data03.index[is_repeat].to_numpy()

In [None]:
d = []

In [None]:
# Fold each set of repeated records into its first row: that row receives the summed
# 'arrivals_in_qtl' and every later row of the set is queued in `d` for deletion.
# Rows are matched on the duplicate key itself rather than on their position in `j`:
# `j` lists the repeats in file order, so members of one set are not guaranteed to be
# adjacent or to come in pairs, and pairing by position would add unrelated rows.
# `dropna=False` keeps sets whose key contains a missing value (needs pandas >= 1.1);
# `skipna=False` keeps a missing arrival missing, as plain `+` did.
dup_keys = ['APMC', 'Commodity', 'date', 'min_price', 'max_price', 'modal_price']

for _, same_record in data03.loc[j].groupby(dup_keys, sort=False, dropna=False):
    keeper, extras = same_record.index[0], same_record.index[1:]
    data03.loc[keeper, "arrivals_in_qtl"] = same_record["arrivals_in_qtl"].sum(skipna=False)
    d.extend(extras)


In [None]:
data03.loc[j]

In [None]:
# Delete the redundant rows collected in `d` from `data03` itself.
data03.drop(index=d, inplace=True)

In [None]:
data03.loc[j]

In [None]:
# Keep only the records that name a market.
data03 = data03.dropna(subset=["APMC"])

In [None]:
data.shape

In [None]:
# Ten most busiest APMCs

data.APMC.value_counts().head(10)

In [None]:
# Ten least busiest APMCs

data.APMC.value_counts().tail(10)

In [None]:
data.head()

# Grouping by [APMC, Commodity]

In [None]:
g = data.groupby(["Commodity"])

In [None]:
data.head()

In [None]:
# For each commodity, split its records by year and locate the record(s) with the
# largest arrival. `GroupBy.get_group` must be told which group to fetch, so the bare
# `h.get_group()` raised TypeError on the first iteration; collecting the per-year
# sub-frames keeps every year available without guessing a single one.
for k, df in g:
    print(k)
    h = df.groupby("Year")
    a = {year: year_df for year, year_df in h}
    i = df[df["arrivals_in_qtl"] == df["arrivals_in_qtl"].max()].index


In [None]:
df

In [None]:
# Put the frame left in `df` by the preceding loop into date order. Rebinding to the
# sorted copy avoids mutating a groupby slice in place (SettingWithCopyWarning).
df = df.sort_values(by = 'date')

In [None]:
# Small random sample (drawn with replacement) used for the exploratory plots below.
# A fixed seed makes the sample, and therefore the figures, reproducible across runs.
ds = data.sample(n = 50, replace = True, random_state = 42)

In [None]:
g = ds.groupby(["APMC", "Commodity"])

In [None]:
# One figure per (APMC, Commodity) group: minimum, maximum and modal price over time.
# Each group is sorted into a new frame (no in-place mutation of a groupby slice), and
# every line carries a label so all three series appear in the legend.
for k, df in g:
    print(k)
    df = df.sort_values(by = 'date')
    plt.figure()
    plt.plot( 'date', 'min_price', data=df, marker='o', markerfacecolor='blue', markersize=12, color='skyblue', linewidth=4, label="min_price")
    plt.plot( 'date', 'max_price', data=df, marker='*', color='olive', linewidth=2, label="max_price")
    plt.plot( 'date', 'modal_price', data=df, marker='+', color='red', linewidth=2, linestyle='dashed', label="modal_price")
    plt.title(k)
    plt.legend()

# Using both data files

In [None]:
file = pd.read_csv("CMO_MSP_Mandi.csv")

In [None]:
file.head()

In [None]:
file.commodity.unique()

In [None]:
# Discard, in place, the MSP rows whose year is earlier than 2014.
before_2014 = file.index[file["year"] < 2014]
file.drop(index=before_2014, inplace=True)

In [None]:
file.shape

In [None]:
file.head()

In [None]:
file.msp_filter.unique()

In [None]:
# Remove the 'msp_filter' column from `file` itself.
file.drop(columns=["msp_filter"], inplace=True)

In [None]:
file.commodity.unique().shape

In [None]:
file.head()

In [None]:
# Lower-case the commodity names in the price data so they can be matched against the
# MSP file. `.str.lower()` leaves a missing name as NaN instead of raising on it.
data["Commodity"] = data["Commodity"].str.lower()

In [None]:
# Lower-case the commodity names in the MSP file as well. `.str.lower()` leaves a
# missing name as NaN instead of raising on it.
file["commodity"] = file["commodity"].str.lower()

In [None]:
# Any MSP rows that repeat the same commodity / year / type combination.
file.loc[file.duplicated(subset=['commodity', 'year', 'Type'], keep=False)]

#Therefore, no duplicate records

In [None]:
# Rename the MSP columns by position so the key columns are spelled 'Commodity' and
# 'Year', the names used for the merge with `data`.
# NOTE(review): assumes `file` has exactly four columns in this order at this point — confirm.
file.columns = ['Commodity', 'Year', 'Type', 'ms_price']

In [None]:
file.head()

In [None]:
# Attach the MSP of each commodity/year to every matching price record; rows without a
# match on both keys are left out.
combines = file.merge(data, how='inner', on=['Commodity', 'Year'])

In [None]:
combines.head()

In [None]:
# Remove the month and district columns from `combines` itself.
combines.drop(columns=["Month", "district_name"], inplace=True)

In [None]:
combines.head()

In [None]:
# Keep just the columns needed for the price-vs-MSP comparison, in reading order.
ordered_columns = [
    'Commodity', 'APMC', 'Year', 'date',
    'min_price', 'modal_price', 'max_price', 'ms_price',
    'arrivals_in_qtl',
]
combines = combines.loc[:, ordered_columns]

In [None]:
combines.head()

In [None]:
g =  combines.groupby(["Commodity", "APMC"])

In [None]:
idx = np.array([])
idx

In [None]:
# One figure per (Commodity, APMC) group: the three market prices and the minimum
# support price over time. `sort_values` returns a new frame, so its result has to be
# kept; previously it was discarded and the lines were drawn in unsorted row order.
for k, df in g:
    print(k)
    df = df.sort_values(by = 'date')
    plt.figure()
    plt.plot( 'date', 'min_price', data=df, marker='+', color='blue', label = "Minimum Price")
    plt.plot( 'date', 'max_price', data=df, marker='s', color='green', label = "Maximum Price")
    plt.plot( 'date', 'modal_price', data=df, marker='o', color='yellow', label="Modal Price")
    plt.plot( 'date', 'ms_price', data=df, marker='*', color='red', label="Minimum Support Price")
    plt.title(k)
    plt.legend()

In [None]:
df

In [None]:
    # Draw the four price series again for whichever group `df` / `k` are currently bound to.
    for column, marker, color, label in (
        ('min_price', '+', 'blue', "Minimum Price"),
        ('max_price', 's', 'green', "Maximum Price"),
        ('modal_price', 'o', 'yellow', "Modal Price"),
        ('ms_price', '*', 'red', "Minimum Support Price"),
    ):
        plt.plot('date', column, data=df, marker=marker, color=color, label=label)
    plt.title(k)
    l = plt.legend()
    l

In [None]:
# Records whose minimum market price is lower than the minimum support price.
below_support = combines["min_price"].lt(combines["ms_price"])
idx = combines.loc[below_support]

In [None]:
idx.head()

In [None]:
idx.shape

In [None]:
idx.Commodity.value_counts()

In [None]:
idx.APMC.value_counts()

In [None]:
file.isnull().sum()

In [None]:
file[file.ms_price.isnull()]

In [None]:
f = file.copy()

In [None]:
# Discard, in place, every row of `f` that has a missing value in any column.
f.dropna(axis=0, how="any", inplace=True)

In [None]:
f.shape

In [None]:
file.shape

In [None]:
# Same commodity/year join as before, this time starting from the NaN-free MSP rows.
c = f.merge(data, on=["Commodity", "Year"])

In [None]:
c.shape

In [None]:
combines.shape

In [None]:
combines.head()

In [None]:
data.head()

In [None]:
data.Month.unique().shape

In [None]:
# Month names split into three four-month groups.
# presumably s = summer, w = winter, m = monsoon — confirm
# NOTE(review): no visible cell reads `s`, `w` or `m`; the season-tagging cell further
# down spells the month names out again instead of using these lists.
s = ["March", "April", "May", "June"]
w = ["November", "December", "January", "February"]
m = ["July", "August", "September", "October"]

In [None]:
z = data.copy()

In [None]:
# Season code for every row of `z`, in row order: "S" for March-June, "W" for
# November-February and "M" for any other value (presumably monsoon — confirm).
# A dictionary lookup replaces the if/elif chain, and the per-row `print(a)` is gone:
# it wrote one output line for every record in the data set.
season_by_month = {}
season_by_month.update(dict.fromkeys(["March", "April", "May", "June"], "S"))
season_by_month.update(dict.fromkeys(["November", "December", "January", "February"], "W"))

sa = [season_by_month.get(month, "M") for month in z.Month]

In [None]:
z["Season"] = sa

In [None]:
z.head()

In [None]:
j = data.copy()

In [None]:
h = data.groupby(["APMC", "Commodity", "Year"])

In [None]:
# Walk every (APMC, Commodity, Year) group and take its modal prices in date order.
# `sort_values` returns a new frame, so the result is kept; previously it was thrown
# away and `pr` held the prices in unsorted row order.
for k, df in h:
    print(k)
    df = df.sort_values(by = 'date')
    pr = df["modal_price"]

In [None]:
# Additive seasonal decomposition (period 12) of the min, max and modal price series of
# every (APMC, Commodity) group that has more than 12 observations.
# NOTE(review): neither `sdata` nor `seasonal_decompose` is defined or imported in any
# visible cell, so this cell fails with NameError on a fresh kernel.
# `seasonal_decompose` is presumably statsmodels.tsa.seasonal.seasonal_decompose — confirm.
# NOTE(review): recent statsmodels releases name the `freq` argument `period` and require
# at least two full periods (24 points here) — confirm against the installed version.
for k, df in sdata.groupby(["APMC", "Commodity"]):
    print(k)    
    mi = df["min_price"]
    ma = df["max_price"]
    mo = df["modal_price"]
    if len(mo) > 12:
        midec = seasonal_decompose(mi, model = 'additive', freq = 12)
        madec = seasonal_decompose(ma, model = 'additive', freq = 12)
        modec = seasonal_decompose(mo, model = 'additive', freq = 12)
        midec.plot()
        madec.plot()
        modec.plot()
        # plt.title acts on the current axes only, so a single subplot gets the group name.
        plt.title(k)
        plt.show()

# Seasonal Decomposition

In [None]:
data.head()

In [None]:
# Convert the 'date' column to pandas datetimes, overwriting the column in place.
data["date"] = pd.to_datetime(data["date"])

In [None]:
# Index the frame by date while keeping 'date' available as a regular column too.
data = data.set_index("date", drop=False)

In [None]:
data.head()

In [None]:
# Work on a random subset of at most 10000 rows to keep the plots below manageable.
# The size is capped at the number of rows available (sampling without replacement
# cannot return more rows than exist) and the seed is fixed so reruns give the same subset.
data = data.sample(n = min(10000, len(data)), random_state = 42)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# Index entries (dates) of the 2014 records. The mask is built from `data` itself; the
# earlier version built it from `df`, a leftover loop variable holding a different
# frame, so mask and index did not line up.
data.index[data["Year"] == 2014]

In [None]:
# Line plots for every year: one figure per (APMC, Commodity) group with the modal price
# against the date index, one line per year.
# `data` was randomly sampled beforehand, so each group is put back into date order
# first; otherwise the lines zig-zag between out-of-order dates. The three per-year
# copies of the plotting code are folded into one loop over (marker, colour) styles.
year_styles = {2014: ('+', 'blue'), 2015: ('s', 'green'), 2016: ('o', 'yellow')}

for k, df in data.groupby(["APMC", "Commodity"]):
    df = df.sort_index()

    plt.figure()

    for year, (marker, color) in year_styles.items():
        # Plain boolean array: selects rows by position, independent of index labels.
        in_year = (df["Year"] == year).to_numpy()
        plt.plot(df.index[in_year], df["modal_price"][in_year],
                 marker=marker, color=color, label="Modal Price for {}".format(year))

    plt.title(k)
    plt.legend()

In [None]:
# Shrink the working set further, to at most 7000 rows. Capped at the rows available
# (sampling without replacement cannot exceed them) and seeded for reproducibility.
data = data.sample(min(7000, len(data)), random_state = 42)

In [None]:
# Rolling (window of 3) mean and standard deviation of the modal price for every
# (APMC, Commodity) group.
# The prices are sorted by the date index first, because a rolling window over randomly
# ordered rows is meaningless. Each group gets its own figure and a title naming the
# group; previously every group was drawn onto one shared set of axes.
for k, df in data.groupby(["APMC", "Commodity"]):
    print(k)
    prices = df["modal_price"].sort_index()
    rolmean = prices.rolling(3).mean()
    rolstd = prices.rolling(3).std()

    plt.figure()
    orig = plt.plot(prices, marker='+', color='blue',label='Original')
    mean = plt.plot(rolmean, marker='s', color='red', label='Rolling Mean')
    std = plt.plot(rolstd, marker='o', color='black', label = 'Rolling Std')
    plt.title('Rolling Mean & Standard Deviation: {}'.format(k))
    plt.legend()

In [None]:
# Multiplicative seasonal decomposition (period 12) of the modal price of every
# (APMC, Commodity) group, plotted group by group.
# NOTE(review): `seasonal_decompose` is not imported in any visible cell (presumably
# statsmodels.tsa.seasonal — confirm), and no minimum-length check guards short groups
# here, so short groups will likely fail — confirm.
for k, df in data.groupby(["APMC", "Commodity"]):
    dec = seasonal_decompose(df["modal_price"], model = 'multiplicative', freq = 12)
    # NOTE(review): 0 is passed positionally to plot(); confirm which option it is meant to set.
    dec.plot(0)

In [None]:
data.shape[0]

In [None]:
# All records for husked wheat. Commodity names were lower-cased in earlier cells, so a
# comparison against the capitalised spelling "Wheat(Husked)" matches nothing; comparing
# lower-cased text works whichever way the names are currently stored.
a = data[data["Commodity"].str.lower() == "wheat(husked)"]

In [None]:
a.APMC.value_counts()

In [None]:
# Narrow the wheat records down to the Kalamnuri market.
z = a.loc[a["APMC"].eq("Kalamnuri")]

In [None]:
z

In [None]:
# Multiplicative seasonal decomposition (period 12) of the minimum price in `z`, then
# plot its components.
# NOTE(review): `seasonal_decompose` is not imported in any visible cell (presumably
# statsmodels.tsa.seasonal — confirm); newer statsmodels names `freq` as `period` — confirm.
x = seasonal_decompose(z["min_price"], model = 'multiplicative', freq = 12)
x.plot()

# Outlier Detection

In [None]:
def outliers_iqr(ys):
    """Locate outliers with the 1.5 * IQR rule.

    A value is an outlier when it lies more than 1.5 interquartile ranges below the
    first quartile or above the third quartile.

    Parameters
    ----------
    ys : array-like of numbers (list, NumPy array, pandas Series, ...)

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: positional indices of the outliers, one array per
        dimension of the input. The positions are offsets into ``ys``, not index
        labels, so use ``.iloc`` when selecting rows from a pandas object.

    Notes
    -----
    Missing values are ignored when the quartiles are computed and are never reported
    as outliers. Empty or all-missing input yields an empty result instead of an error.
    """
    values = np.asarray(ys, dtype=float)

    # No usable values: nothing can be an outlier (np.percentile would raise on empty input).
    if values.size == 0 or np.isnan(values).all():
        return np.where(np.zeros(values.shape, dtype=bool))

    # nanpercentile keeps a single NaN from turning both bounds into NaN.
    quartile_1, quartile_3 = np.nanpercentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((values > upper_bound) | (values < lower_bound))

In [None]:
# Random subset for the outlier scan. `data` was already cut to at most 7000 rows in an
# earlier cell, and sampling without replacement cannot return more rows than exist, so
# asking for a flat 10000 raised ValueError; the size is capped at the rows available.
# The seed is fixed so reruns give the same subset.
data = data.sample(min(10000, len(data)), random_state = 42)

In [None]:
# Collect the modal-price outliers of every (APMC, Commodity) group.
# `outliers_iqr` returns positions within the group it was given, so the rows are picked
# from that group with `.iloc`; feeding those positions to `data.loc` treated them as
# index labels of the whole frame. The rows are gathered into one frame and shown,
# instead of being computed and discarded on every iteration.
outlier_frames = []
for k, df in data.groupby(["APMC", "Commodity"]):
    positions = outliers_iqr(df["modal_price"])[0]
    if len(positions):
        outlier_frames.append(df.iloc[positions])

# pd.concat refuses an empty list, so fall back to an empty frame with the same columns.
outliers = pd.concat(outlier_frames) if outlier_frames else data.iloc[0:0]
outliers