# Import Modules

In [190]:
import numpy as np
import pandas as pd
import seaborn as sns

import plotly.express as px

***
# Initialize and load dataset

In [2]:
raw_df = pd.read_csv('../dataset/Price of Tomato Karnataka(2016-2018).csv')

***
# Show dataframe in table

In [3]:
raw_df.head(10)

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
0,Bagepalli,2/1/2016,6.0,Tomato,1000,1800,1550
1,Bagepalli,4/1/2016,6.0,Tomato,900,1800,1550
2,Bagepalli,5/1/2016,6.0,Tomato,1400,2500,2200
3,Bagepalli,6/1/2016,4.0,Tomato,1400,3000,2400
4,Bagepalli,7/1/2016,5.0,Tomato,1400,2600,2200
5,Bagepalli,11/1/2016,5.0,Tomato,1400,2800,2200
6,Bagepalli,12/1/2016,6.0,Tomato,1500,2400,2000
7,Bagepalli,13/01/2016,7.0,Tomato,1400,2600,2200
8,Bagepalli,14/01/2016,7.0,Tomato,1200,2200,2000
9,Bagepalli,16/01/2016,5.0,Tomato,1200,2200,1800


In [4]:
raw_df.describe(include='all')

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
count,11456,11321,11321.0,11421,11456.0,11456.0,11456.0
unique,40,844,,2,679.0,934.0,1016.0
top,Kolar,16/08/2016,,Tomato,300.0,1000.0,500.0
freq,954,24,,11196,1179.0,939.0,675.0
mean,,,80.269146,,,,
std,,,154.967086,,,,
min,,,1.0,,,,
25%,,,4.0,,,,
50%,,,19.0,,,,
75%,,,70.0,,,,


***
# Data Clean

***
## - Mismatched Value

Replace value NR with 0

In [5]:
raw_replace_zero_df = raw_df.replace('NR', 0)

Ganti datatype untuk kolom Minimum Price(Rs./Quintal), Maximum Price(Rs./Quintal), dan Modal Price(Rs./Quintal) menjadi int64.

In [6]:
# raw_replace_zero_df[(raw_replace_zero_df['Minimum Price(Rs./Quintal)'] == 'NR') | (raw_replace_zero_df['Maximum Price(Rs./Quintal)'] == 'NR') | (raw_replace_zero_df['Modal Price(Rs./Quintal)'] == 'NR')]
raw_convert_datatype_df =raw_replace_zero_df.astype({'Minimum Price(Rs./Quintal)':'int64',
                                                     'Maximum Price(Rs./Quintal)':'int64',
                                                     'Modal Price(Rs./Quintal)':'int64'},
                                                   )

In [7]:
raw_convert_datatype_df.dtypes

Market                         object
Arrival Date                   object
Arrivals (Tonnes)             float64
Variety                        object
Minimum Price(Rs./Quintal)      int64
Maximum Price(Rs./Quintal)      int64
Modal Price(Rs./Quintal)        int64
dtype: object

In [8]:
raw_convert_datatype_df.isna().sum()

Market                          0
Arrival Date                  135
Arrivals (Tonnes)             135
Variety                        35
Minimum Price(Rs./Quintal)      0
Maximum Price(Rs./Quintal)      0
Modal Price(Rs./Quintal)        0
dtype: int64

***
## - Duplicate Value

Before

In [9]:
print('Number of duplicated data is', raw_convert_datatype_df[raw_convert_datatype_df.duplicated()].shape[0], 'rows')

Number of duplicated data is 76 rows


In [10]:
raw_no_duplicate_df = raw_convert_datatype_df[~raw_convert_datatype_df.duplicated()]

After

In [11]:
print('Number of duplicated data is',raw_no_duplicate_df[raw_no_duplicate_df.duplicated()].shape[0], 'rows')

Number of duplicated data is 0 rows


***
## - Missing Value

In [12]:
raw_no_duplicate_df.shape

(11380, 7)

In [13]:
raw_no_duplicate_df.isna().sum()

Market                         0
Arrival Date                  59
Arrivals (Tonnes)             59
Variety                       35
Minimum Price(Rs./Quintal)     0
Maximum Price(Rs./Quintal)     0
Modal Price(Rs./Quintal)       0
dtype: int64

In [14]:
raw_with_mv_df = raw_no_duplicate_df[(raw_no_duplicate_df['Arrival Date'].isna()) | (raw_no_duplicate_df['Arrivals (Tonnes)'].isna()) | (raw_no_duplicate_df['Variety'].isna())]

In [15]:
pd.set_option('max_rows', None)

In [16]:
raw_with_mv_df.dropna(subset=['Arrival Date', 'Arrivals (Tonnes)'])

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
9037,Bagepalli,21/11/2017,13.0,,0,0,0
9038,Bagepalli,22/11/2017,13.0,,0,0,0
9039,Bagepalli,23/11/2017,14.0,,0,0,0
9040,Bagepalli,25/11/2017,16.0,,0,0,0
9041,Bagepalli,27/11/2017,16.0,,0,0,0
9042,Bagepalli,28/11/2017,16.0,,0,0,0
9043,Bagepalli,29/11/2017,16.0,,0,0,0
9044,Bagepalli,30/11/2017,18.0,,0,0,0
9422,Bagepalli,2/12/2017,20.0,,0,0,0
9423,Bagepalli,4/12/2017,21.0,,0,0,0


In [17]:
raw_no_duplicate_df[raw_no_duplicate_df['Market'] == 'Hoskote'].sort_values(by='Arrival Date')

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
1218,Hoskote,13/04/2016,3.0,Hybrid,700,700,700
470,Hoskote,16/02/2016,5.0,Tomato,900,1000,950
165,Hoskote,18/01/2016,5.0,Hybrid,2000,2400,2300
824,Hoskote,18/03/2016,3.0,Hybrid,800,800,800
825,Hoskote,23/03/2016,3.0,Tomato,800,800,800
826,Hoskote,28/03/2016,3.0,Tomato,800,800,800
822,Hoskote,4/3/2016,5.0,Tomato,800,1000,900
1217,Hoskote,4/4/2016,3.0,Tomato,800,800,800
469,Hoskote,5/2/2016,5.0,Hybrid,2600,2800,2700
823,Hoskote,9/3/2016,6.0,Tomato,800,1000,900


In [18]:
raw_no_duplicate_df.shape

(11380, 7)

In [19]:
raw_no_duplicate_df.isna().sum().to_frame()

Unnamed: 0,0
Market,0
Arrival Date,59
Arrivals (Tonnes),59
Variety,35
Minimum Price(Rs./Quintal),0
Maximum Price(Rs./Quintal),0
Modal Price(Rs./Quintal),0


In [20]:
raw_no_duplicate_df[raw_no_duplicate_df['Market'] == 'Hoskote']

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
165,Hoskote,18/01/2016,5.0,Hybrid,2000,2400,2300
166,Hoskote,,,Tomato,2000,2400,2300
469,Hoskote,5/2/2016,5.0,Hybrid,2600,2800,2700
470,Hoskote,16/02/2016,5.0,Tomato,900,1000,950
822,Hoskote,4/3/2016,5.0,Tomato,800,1000,900
823,Hoskote,9/3/2016,6.0,Tomato,800,1000,900
824,Hoskote,18/03/2016,3.0,Hybrid,800,800,800
825,Hoskote,23/03/2016,3.0,Tomato,800,800,800
826,Hoskote,28/03/2016,3.0,Tomato,800,800,800
1217,Hoskote,4/4/2016,3.0,Tomato,800,800,800


In [21]:
raw_no_duplicate_df[raw_no_duplicate_df['Market'] == 'Kolar']

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
178,Kolar,1/1/2016,181.0,Tomato,834,4000,3000
179,Kolar,2/1/2016,185.0,Tomato,1000,4667,3334
180,Kolar,3/1/2016,110.0,Tomato,1000,4667,3453
181,Kolar,4/1/2016,185.0,Tomato,667,5000,4000
182,Kolar,5/1/2016,195.0,Tomato,1000,5334,4334
183,Kolar,6/1/2016,134.0,Tomato,1166,4534,3667
184,Kolar,8/1/2016,114.0,Tomato,1334,5334,4734
185,Kolar,9/1/2016,176.0,Tomato,1000,3334,2800
186,Kolar,12/1/2016,197.0,Tomato,1000,3000,2400
187,Kolar,13/01/2016,194.0,Tomato,1000,2867,2467


In [22]:
raw_no_duplicate_df[(raw_no_duplicate_df['Arrival Date'].isna()) | (raw_no_duplicate_df['Arrivals (Tonnes)'].isna())]

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
166,Hoskote,,,Tomato,2000,2400,2300
9681,Kolar,,,Tomato,400,1600,1000
9683,Kolar,,,Tomato,500,1167,834
9685,Kolar,,,Tomato,334,1000,667
9687,Kolar,,,Tomato,334,667,500
9691,Kolar,,,Tomato,300,1000,667
9693,Kolar,,,Tomato,300,634,467
9695,Kolar,,,Tomato,300,367,334
9699,Kolar,,,Tomato,334,734,534
9711,Kolar,,,Tomato,334,600,467


In [23]:
raw_no_duplicate_df.isna().sum()

Market                         0
Arrival Date                  59
Arrivals (Tonnes)             59
Variety                       35
Minimum Price(Rs./Quintal)     0
Maximum Price(Rs./Quintal)     0
Modal Price(Rs./Quintal)       0
dtype: int64

In [24]:
raw_no_mv_1_df = raw_no_duplicate_df.dropna(subset=['Arrival Date', 'Arrivals (Tonnes)'])

In [25]:
raw_no_mv_1_df.isna().sum()

Market                         0
Arrival Date                   0
Arrivals (Tonnes)              0
Variety                       35
Minimum Price(Rs./Quintal)     0
Maximum Price(Rs./Quintal)     0
Modal Price(Rs./Quintal)       0
dtype: int64

In [26]:
print('Number of missing value:', raw_no_mv_1_df[raw_no_mv_1_df['Variety'].isna()].shape[0])

Number of missing value: 35


In [27]:
raw_no_mv_1_df[raw_no_mv_1_df['Variety'].isna()]

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
9037,Bagepalli,21/11/2017,13.0,,0,0,0
9038,Bagepalli,22/11/2017,13.0,,0,0,0
9039,Bagepalli,23/11/2017,14.0,,0,0,0
9040,Bagepalli,25/11/2017,16.0,,0,0,0
9041,Bagepalli,27/11/2017,16.0,,0,0,0
9042,Bagepalli,28/11/2017,16.0,,0,0,0
9043,Bagepalli,29/11/2017,16.0,,0,0,0
9044,Bagepalli,30/11/2017,18.0,,0,0,0
9422,Bagepalli,2/12/2017,20.0,,0,0,0
9423,Bagepalli,4/12/2017,21.0,,0,0,0


In [28]:
raw_no_mv_df = raw_no_mv_1_df.fillna('Tomato')

In [29]:
raw_no_mv_df.shape

(11321, 7)

In [30]:
raw_no_mv_df.isna().sum()

Market                        0
Arrival Date                  0
Arrivals (Tonnes)             0
Variety                       0
Minimum Price(Rs./Quintal)    0
Maximum Price(Rs./Quintal)    0
Modal Price(Rs./Quintal)      0
dtype: int64

In [31]:
raw_no_mv_df[raw_no_mv_df['Market'] == 'Kolar']

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
178,Kolar,1/1/2016,181.0,Tomato,834,4000,3000
179,Kolar,2/1/2016,185.0,Tomato,1000,4667,3334
180,Kolar,3/1/2016,110.0,Tomato,1000,4667,3453
181,Kolar,4/1/2016,185.0,Tomato,667,5000,4000
182,Kolar,5/1/2016,195.0,Tomato,1000,5334,4334
183,Kolar,6/1/2016,134.0,Tomato,1166,4534,3667
184,Kolar,8/1/2016,114.0,Tomato,1334,5334,4734
185,Kolar,9/1/2016,176.0,Tomato,1000,3334,2800
186,Kolar,12/1/2016,197.0,Tomato,1000,3000,2400
187,Kolar,13/01/2016,194.0,Tomato,1000,2867,2467


In [98]:
raw_convert_datatype_df.isna().sum()

Market                          0
Arrival Date                  135
Arrivals (Tonnes)             135
Variety                        35
Minimum Price(Rs./Quintal)      0
Maximum Price(Rs./Quintal)      0
Modal Price(Rs./Quintal)        0
dtype: int64

In [35]:
# raw_no_mv.to_csv('../dataset/raw_no_mv.csv')

In [36]:
raw_no_mv_df.columns

Index(['Market', 'Arrival Date', 'Arrivals (Tonnes)', 'Variety',
       'Minimum Price(Rs./Quintal)', 'Maximum Price(Rs./Quintal)',
       'Modal Price(Rs./Quintal)'],
      dtype='object')

In [37]:
raw_no_mv_df[(raw_no_mv_df['Minimum Price(Rs./Quintal)'] == 0) |
             (raw_no_mv_df['Maximum Price(Rs./Quintal)'] == 0) |
             (raw_no_mv_df['Modal Price(Rs./Quintal)'] == 0)
            ]

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
9037,Bagepalli,21/11/2017,13.0,Tomato,0,0,0
9038,Bagepalli,22/11/2017,13.0,Tomato,0,0,0
9039,Bagepalli,23/11/2017,14.0,Tomato,0,0,0
9040,Bagepalli,25/11/2017,16.0,Tomato,0,0,0
9041,Bagepalli,27/11/2017,16.0,Tomato,0,0,0
9042,Bagepalli,28/11/2017,16.0,Tomato,0,0,0
9043,Bagepalli,29/11/2017,16.0,Tomato,0,0,0
9044,Bagepalli,30/11/2017,18.0,Tomato,0,0,0
9422,Bagepalli,2/12/2017,20.0,Tomato,0,0,0
9423,Bagepalli,4/12/2017,21.0,Tomato,0,0,0


In [38]:
round(raw_no_mv_df.describe())

Unnamed: 0,Arrivals (Tonnes),Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
count,11321.0,11321.0,11321.0,11321.0
mean,80.0,863.0,1610.0,1274.0
std,155.0,925.0,1386.0,1126.0
min,1.0,0.0,0.0,0.0
25%,4.0,300.0,600.0,500.0
50%,19.0,500.0,1000.0,900.0
75%,70.0,1000.0,2134.0,1734.0
max,1931.0,9000.0,10000.0,9000.0


In [39]:
raw_no_mv_df['Minimum Price(Rs./Quintal)'].replace(0, 863, inplace=True)

In [40]:
raw_no_mv_df['Maximum Price(Rs./Quintal)'].replace(0, 1610, inplace=True)

In [41]:
raw_no_mv_df['Modal Price(Rs./Quintal)'].replace(0, 1274, inplace=True)

In [42]:
raw_no_mv_df[(raw_no_mv_df['Minimum Price(Rs./Quintal)'] == 0) |
             (raw_no_mv_df['Maximum Price(Rs./Quintal)'] == 0) |
             (raw_no_mv_df['Modal Price(Rs./Quintal)'] == 0)
            ]

Unnamed: 0,Market,Arrival Date,Arrivals (Tonnes),Variety,Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)


In [43]:
raw_no_mv_df.describe()

Unnamed: 0,Arrivals (Tonnes),Minimum Price(Rs./Quintal),Maximum Price(Rs./Quintal),Modal Price(Rs./Quintal)
count,11321.0,11321.0,11321.0,11321.0
mean,80.269146,865.678385,1614.712481,1277.489356
std,154.967086,924.192722,1383.453997,1123.668165
min,1.0,22.0,100.0,70.0
25%,4.0,300.0,600.0,500.0
50%,19.0,500.0,1000.0,900.0
75%,70.0,1000.0,2134.0,1734.0
max,1931.0,9000.0,10000.0,9000.0


***
## - Outlier

***
### Create function

In [44]:
def determine_lower_upper_outliers(column_name):
    quartile_1          = column_name.quantile(0.25)
    quartile_3          = column_name.quantile(0.75)
    interquartilerange  = quartile_3 - quartile_1
    lower_bound_outlier = quartile_1 - 1.5 * interquartilerange if quartile_1 - 1.5 * interquartilerange >= 0 else 0
    upper_bound_outlier = quartile_3 + 1.5 * interquartilerange

    return [lower_bound_outlier, upper_bound_outlier]

def print_limit_outlier(arr_lower_upper):
    print("Lower bound outlier:", arr_lower_upper[0])
    print("Upper bound outlier:", arr_lower_upper[1])

In [45]:
raw_final_df = pd.read_csv('../dataset/raw_no_mv_dataset.csv')

***
### - arrival_tonnes

In [47]:
print_limit_outlier(determine_lower_upper_outliers(raw_final_df['arrivals_tonnes']))

Lower bound outlier: 0
Upper bound outlier: 169.0


In [70]:
print('Number of row with no outlier in feature arrival_tonnes:', raw_final_df[raw_final_df['arrivals_tonnes'] <= 169.0].shape[0])

Number of row with no outlier in feature arrival_tonnes: 9470


In [71]:
raw_final_df.columns

Index(['market', 'arrival_date', 'arrivals_tonnes', 'variety', 'minimum_price',
       'maximum_price', 'modal_price'],
      dtype='object')

### Histogram

In [82]:
fig = px.histogram(raw_final_df, x="arrivals_tonnes")

fig.update_layout(
    title   = 'Histogram with Outlier',
    title_x = 0.5
)

fig.show()

In [83]:
arrivals_tonnes = raw_final_df[raw_final_df['arrivals_tonnes'] <= 169.0]
fig = px.histogram(arrivals_tonnes, x="arrivals_tonnes")

fig.update_layout(
    title   = 'Histogram with No Outlier',
    title_x = 0.5
)

fig.show()

***
### - minimum_price

In [84]:
print_limit_outlier(determine_lower_upper_outliers(raw_final_df['minimum_price']))

Lower bound outlier: 0
Upper bound outlier: 2050.0


In [85]:
print('Number of row with no outlier in feature minimum_price:', raw_final_df[raw_final_df['minimum_price'] <= 2050.0].shape[0])

Number of row with no outlier in feature minimum_price: 10338


### Histogram

In [88]:
fig = px.histogram(raw_final_df, x="minimum_price")

fig.update_layout(
    title   = 'Histogram with Outlier',
    title_x = 0.5
)

fig.show()

In [89]:
arrivals_tonnes = raw_final_df[raw_final_df['minimum_price'] <= 2050.0]
fig = px.histogram(arrivals_tonnes, x="minimum_price")

fig.update_layout(
    title   = 'Histogram with No Outlier',
    title_x = 0.5
)

fig.show()

***
### - maximum_price 

In [90]:
print_limit_outlier(determine_lower_upper_outliers(raw_final_df['maximum_price']))

Lower bound outlier: 0
Upper bound outlier: 4435.0


In [91]:
print('Number of row with no outlier in feature minimum_price:', raw_final_df[raw_final_df['maximum_price'] <= 4435.0].shape[0])

Number of row with no outlier in feature minimum_price: 10694


### Histogram

In [92]:
fig = px.histogram(raw_final_df, x="maximum_price")

fig.update_layout(
    title   = 'Histogram with Outlier',
    title_x = 0.5
)

fig.show()

In [93]:
arrivals_tonnes = raw_final_df[raw_final_df['maximum_price'] <= 4435.0]
fig = px.histogram(arrivals_tonnes, x="maximum_price")

fig.update_layout(
    title   = 'Histogram with No Outlier',
    title_x = 0.5
)

fig.show()

***
### - modal_price

In [94]:
print_limit_outlier(determine_lower_upper_outliers(raw_final_df['modal_price']))

Lower bound outlier: 0
Upper bound outlier: 3585.0


In [95]:
print('Number of row with no outlier in feature modal_price:', raw_final_df[raw_final_df['minimum_price'] <= 3585.0].shape[0])

Number of row with no outlier in feature modal_price: 11004


### Histogram

In [96]:
fig = px.histogram(raw_final_df, x="minimum_price")

fig.update_layout(
    title   = 'Histogram with Outlier',
    title_x = 0.5
)

fig.show()

In [97]:
arrivals_tonnes = raw_final_df[raw_final_df['modal_price'] <= 3585.0]
fig = px.histogram(arrivals_tonnes, x="modal_price")

fig.update_layout(
    title   = 'Histogram with No Outlier',
    title_x = 0.5
)

fig.show()

***
# Scatterplot all feature with outlier

In [193]:
raw_final_df.describe()

Unnamed: 0,arrivals_tonnes,minimum_price,maximum_price,modal_price
count,11321.0,11321.0,11321.0,11321.0
mean,80.269146,865.678385,1614.712481,1277.489356
std,154.967086,924.192722,1383.453997,1123.668165
min,1.0,22.0,100.0,70.0
25%,4.0,300.0,600.0,500.0
50%,19.0,500.0,1000.0,900.0
75%,70.0,1000.0,2134.0,1734.0
max,1931.0,9000.0,10000.0,9000.0


In [189]:
fig = px.scatter_matrix(raw_final_df,
                        dimensions = ["arrivals_tonnes", "minimum_price", "maximum_price", "modal_price"],
                        color      = "variety",
                        opacity    =0.4)

fig.update_layout(
    height = 1000
)

fig.show()

***
# Scatterplot with no outlier

### - with operator OR

In [180]:
raw_no_outlier_operator_or_df = raw_final_df[(raw_final_df['arrivals_tonnes'] <= 169.0) |
                                 (raw_final_df['minimum_price'] <= 2050.0) |
                                 (raw_final_df['maximum_price'] <= 4435.0) |
                                 (raw_final_df['modal_price'] <= 3585.0)]

In [177]:
raw_no_outlier_operator_or_df.shape[0]

11287

In [192]:
raw_no_outlier_operator_or_df.describe()

Unnamed: 0,arrivals_tonnes,minimum_price,maximum_price,modal_price
count,11287.0,11287.0,11287.0,11287.0
mean,79.790998,854.233632,1603.318862,1266.010189
std,154.927848,900.234418,1368.834692,1104.679283
min,1.0,22.0,100.0,70.0
25%,4.0,300.0,600.0,500.0
50%,19.0,500.0,1000.0,900.0
75%,68.0,1000.0,2100.0,1700.0
max,1931.0,6885.0,9200.0,7000.0


Save dataframe in format .csv

In [182]:
# raw_no_outlier_operator_or_df.to_csv('../dataset/raw_no_outlier_operator_or_dataset.csv', index=False)

In [188]:
fig = px.scatter_matrix(raw_no_outlier_operator_or_df,
                        dimensions = ["arrivals_tonnes", "minimum_price", "maximum_price", "modal_price"],
                        color      = "variety",
                        opacity    = 0.4)

fig.update_layout(
    height = 1000
)

fig.show()

### - With operator AND

In [184]:
raw_no_outlier_operator_and_df = raw_final_df[(raw_final_df['arrivals_tonnes'] <= 169.0) &
                                 (raw_final_df['minimum_price'] <= 2050.0) &
                                 (raw_final_df['maximum_price'] <= 4435.0) &
                                 (raw_final_df['modal_price'] <= 3585.0)]

In [185]:
raw_no_outlier_operator_and_df.shape[0]

8398

In [191]:
raw_no_outlier_operator_and_df.describe()

Unnamed: 0,arrivals_tonnes,minimum_price,maximum_price,modal_price
count,8398.0,8398.0,8398.0,8398.0
mean,27.313289,628.509526,1233.861515,949.960705
std,34.888411,461.638557,892.569233,655.709839
min,1.0,22.0,100.0,70.0
25%,3.0,300.0,600.0,450.0
50%,13.0,466.0,1000.0,750.0
75%,38.0,858.75,1600.0,1240.75
max,169.0,2042.0,4400.0,3534.0


Save dataframe in format .csv

In [186]:
# raw_no_outlier_operator_and_df.to_csv('../dataset/raw_no_outlier_operator_and_dataset.csv', index=False)

In [187]:
fig = px.scatter_matrix(raw_no_outlier_operator_and_df,
                        dimensions = ["arrivals_tonnes", "minimum_price", "maximum_price", "modal_price"],
                        color      = "variety",
                        opacity    = 0.4)

fig.update_layout(
    height = 1000
)

fig.show()