In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

### Load data

In [4]:
# Read the raw pumpkin market records; the CSV path is relative to the
# notebook's working directory.
dataframe = pd.read_csv(filepath_or_buffer='US-pumpkins.csv')
# Show the first five rows as a quick sanity check of the load.
dataframe.head(n=5)

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Grade,Date,Low Price,High Price,Mostly Low,...,Unit of Sale,Quality,Condition,Appearance,Storage,Crop,Repack,Trans Mode,Unnamed: 24,Unnamed: 25
0,BALTIMORE,,24 inch bins,,,,4/29/17,270.0,280.0,270.0,...,,,,,,,E,,,
1,BALTIMORE,,24 inch bins,,,,5/6/17,270.0,280.0,270.0,...,,,,,,,E,,,
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,11/5/16,90.0,100.0,90.0,...,,,,,,,N,,,


### Know your data

In [5]:
# Dimensions of the raw frame as (rows, columns).
dataframe.shape

(1757, 26)

In [7]:
# Count the missing values in each column of the raw frame.
dataframe.isna().sum()

City Name             0
Type               1712
Package               0
Variety               5
Sub Variety        1461
Grade              1757
Date                  0
Low Price             0
High Price            0
Mostly Low          103
Mostly High         103
Origin                3
Origin District    1626
Item Size           279
Color               616
Environment        1757
Unit of Sale       1595
Quality            1757
Condition          1757
Appearance         1757
Storage            1757
Crop               1757
Repack                0
Trans Mode         1757
Unnamed: 24        1757
Unnamed: 25        1654
dtype: int64

### Drop columns

In [8]:
# Columns retained for the analysis; every other column is dropped.
columns = [
    'Package',
    'Low Price',
    'High Price',
    'Repack',
    'Date',
    'Variety',
    'City Name',
]
# Names of all columns in the raw frame that are not in the keep-list,
# in the frame's own column order.
dropcols = [name for name in dataframe.columns if name not in columns]

In [9]:
# Build the working frame by removing the unwanted columns; the raw
# `dataframe` itself is left unchanged because drop() returns a new frame.
pumpkins_data = dataframe.drop(columns=dropcols)

In [11]:
# Preview the first five rows of the trimmed frame.
pumpkins_data.head()

Unnamed: 0,City Name,Package,Variety,Date,Low Price,High Price,Repack
0,BALTIMORE,24 inch bins,,4/29/17,270.0,280.0,E
1,BALTIMORE,24 inch bins,,5/6/17,270.0,280.0,E
2,BALTIMORE,24 inch bins,HOWDEN TYPE,9/24/16,160.0,160.0,N
3,BALTIMORE,24 inch bins,HOWDEN TYPE,9/24/16,160.0,160.0,N
4,BALTIMORE,24 inch bins,HOWDEN TYPE,11/5/16,90.0,100.0,N


#### Extract the month number from the date string

In [13]:
# Parse 'Date' with an explicit month/day/two-digit-year format instead of
# relying on implicit format inference, so an unexpected date layout raises
# immediately rather than being guessed at. Wrapping the parsed Series in a
# DatetimeIndex keeps `month` an integer Index named 'Date', as before.
month = pd.DatetimeIndex(
    pd.to_datetime(pumpkins_data['Date'], format='%m/%d/%y')
).month
month

Int64Index([ 4,  5,  9,  9, 11, 11,  9,  9, 10, 10,
            ...
             9,  9,  9,  9,  9,  9,  9,  9,  9,  9],
           dtype='int64', name='Date', length=1757)

#### Calculate price by averaging low and high price

In [14]:
# Midpoint of the quoted low and high prices for each row.
price = pumpkins_data['Low Price'].add(pumpkins_data['High Price']).div(2)

In [15]:
# Attach the derived columns to the working frame in place: the month number
# taken from 'Date' and the low/high midpoint price.
pumpkins_data['Month'] = month
pumpkins_data['Price'] = price

#### Adjust price based on package type

In [16]:
# Normalise prices to a per-bushel basis. Both updates start from the
# unadjusted `price` series, so re-running this cell gives the same result.

# Packages holding 1 1/9 bushels: divide by 1 1/9 to get the price of one bushel.
pumpkins_data.loc[
    pumpkins_data['Package'].str.contains('1 1/9'),
    'Price',
] = price.div(1 + 1/9)

# Packages holding 1/2 bushel: divide by 1/2 (i.e. double) to get the price of one bushel.
pumpkins_data.loc[
    pumpkins_data['Package'].str.contains('1/2'),
    'Price',
] = price.div(1/2)

In [18]:
# Preview the first five rows, now including the Month and Price columns.
pumpkins_data.head()

Unnamed: 0,City Name,Package,Variety,Date,Low Price,High Price,Repack,Month,Price
0,BALTIMORE,24 inch bins,,4/29/17,270.0,280.0,E,4,275.0
1,BALTIMORE,24 inch bins,,5/6/17,270.0,280.0,E,5,275.0
2,BALTIMORE,24 inch bins,HOWDEN TYPE,9/24/16,160.0,160.0,N,9,160.0
3,BALTIMORE,24 inch bins,HOWDEN TYPE,9/24/16,160.0,160.0,N,9,160.0
4,BALTIMORE,24 inch bins,HOWDEN TYPE,11/5/16,90.0,100.0,N,11,95.0


#### List the distinct package types in the data

In [20]:
# Distinct package descriptions, in order of first appearance.
pumpkins_data.loc[:, 'Package'].unique()

array(['24 inch bins', '36 inch bins', '50 lb sacks',
       '1 1/9 bushel cartons', '1/2 bushel cartons',
       '1 1/9 bushel crates', 'bushel cartons', 'bins', '35 lb cartons',
       'each', '20 lb cartons', '50 lb cartons', '40 lb cartons',
       'bushel baskets', '22 lb cartons'], dtype=object)

#### Keep only bushel package data

In [21]:
# Keep only rows whose package description contains 'bushel' (case-sensitive
# substring match). The explicit .copy() detaches the result from the frame
# it was sliced from, so later column assignments on pumpkins_data do not
# raise SettingWithCopyWarning.
pumpkins_data = pumpkins_data[
    pumpkins_data['Package'].str.contains('bushel', case=True)
].copy()

In [22]:
# Display the bushel-only frame; the rendered output elides the middle rows.
pumpkins_data

Unnamed: 0,City Name,Package,Variety,Date,Low Price,High Price,Repack,Month,Price
70,BALTIMORE,1 1/9 bushel cartons,PIE TYPE,9/24/16,15.00,15.0,N,9,13.50
71,BALTIMORE,1 1/9 bushel cartons,PIE TYPE,9/24/16,18.00,18.0,N,9,16.20
72,BALTIMORE,1 1/9 bushel cartons,PIE TYPE,10/1/16,18.00,18.0,N,10,16.20
73,BALTIMORE,1 1/9 bushel cartons,PIE TYPE,10/1/16,17.00,17.0,N,10,15.30
74,BALTIMORE,1 1/9 bushel cartons,PIE TYPE,10/8/16,15.00,15.0,N,10,13.50
...,...,...,...,...,...,...,...,...,...
1738,ST. LOUIS,1/2 bushel cartons,MINIATURE,9/30/16,15.00,15.0,N,9,30.00
1739,ST. LOUIS,1/2 bushel cartons,MINIATURE,9/30/16,13.75,15.0,N,9,28.75
1740,ST. LOUIS,1/2 bushel cartons,MINIATURE,9/30/16,10.75,15.0,N,9,25.75
1741,ST. LOUIS,1/2 bushel cartons,MINIATURE,9/30/16,12.00,12.0,N,9,24.00
