# Example: North American pumpkin prices

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from datetime import datetime

## Visualize and clean data in preparation for ML

In [None]:
# Load the raw pumpkin price records from a path relative to the working directory.
pumpkins = pd.read_csv('data/US-pumpkins.csv')
# Preview the first rows of the loaded frame.
pumpkins.head()

In [None]:
# Summarize the columns: names, non-null counts and dtypes.
pumpkins.info()

In [None]:
# Parse the Date column into datetimes so date components can be extracted from it.
pumpkins["Date"] = pd.to_datetime(pumpkins["Date"])

In [None]:
# Calendar month number taken from the Date column via the .dt accessor.
pumpkins["Month"] = pumpkins["Date"].dt.month

In [None]:
# Zero-based day of the year (Jan 1 -> 0), i.e. the number of whole days elapsed
# since January 1st of the same year. `.dt.dayofyear` is one-based, hence the -1.
# Vectorized replacement for a per-row `apply`; missing dates yield NaN instead of raising.
pumpkins["DayOfYear"] = pd.to_datetime(pumpkins['Date']).dt.dayofyear - 1

In [None]:
# Count missing values per column.
pumpkins.isnull().sum()

In [None]:
# Columns to keep for the regression part of the notebook; everything else is dropped.
new_columns = ['Package', 'Variety', 'City Name', 'Month', 'Low Price', 'High Price', 'DayOfYear']
pumpkins = pumpkins.drop(
    columns=[col for col in pumpkins.columns if col not in new_columns]
)
pumpkins.head()

In [None]:
# Price is the midpoint of the reported low and high prices.
pumpkins['Price'] = pumpkins['Low Price'].add(pumpkins['High Price']).div(2)
pumpkins.head()

In [None]:
# List the distinct package descriptions before filtering on them.
pumpkins["Package"].unique()

In [None]:
# Keep only rows whose package description mentions 'bushel' (case-sensitive).
# .copy() makes new_pumpkins an independent frame, so the later `.loc` assignments
# to its Price column do not trigger pandas' SettingWithCopyWarning.
new_pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)].copy()

In [None]:
# (rows, columns) remaining after the bushel filter.
new_pumpkins.shape

In [None]:
# Distinct package descriptions that survived the bushel filter.
new_pumpkins["Package"].unique()

In [None]:
# Preview the filtered frame before prices are normalized.
new_pumpkins.head()

In [None]:
# Convert prices to a per-bushel basis: divide by the package size in bushels.
# The base value is recomputed from the Low/High columns (the same midpoint used to
# build Price) rather than read from Price itself, so re-running this cell does not
# rescale already-normalized prices a second time.
base_price = (new_pumpkins['Low Price'] + new_pumpkins['High Price']) / 2
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = base_price/(1 + 1/9)
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = base_price/(1/2)

In [None]:
# Preview the frame again after the Price adjustment.
new_pumpkins.head()

**Visualization Strategies**

In [None]:
# Bar chart of the average price for each month.
monthly_price = new_pumpkins.groupby(['Month'])['Price'].mean()
ax = monthly_price.plot.bar()
ax.set_ylabel("Pumpkin Price")
plt.show()

**Looking for Correlation**

In [None]:
# Scatter of price against day of year for all bushel-packaged pumpkins.
new_pumpkins.plot(kind='scatter', x='DayOfYear', y='Price')
plt.show()

In [None]:
# Correlation of each date-derived feature with Price (Month first, then DayOfYear).
for feature in ('Month', 'DayOfYear'):
    print(new_pumpkins[feature].corr(new_pumpkins['Price']))

In [None]:
# Overlay one scatter series per variety on shared axes: the first plot call creates
# the axes (ax=None) and every later call draws onto the axes it returned.
ax = None
colors = ['red','blue','green','yellow']
for i, var in enumerate(new_pumpkins['Variety'].unique()):
    # Cycle through the palette so having more varieties than colors cannot raise IndexError.
    ax = new_pumpkins[new_pumpkins['Variety']==var].plot.scatter(
        'DayOfYear', 'Price', ax=ax, c=colors[i % len(colors)], label=var
    )
plt.show()

In [None]:
# Bar chart of the average price for each variety.
variety_price = new_pumpkins.groupby('Variety')['Price'].mean()
variety_price.plot.bar()
plt.show()

In [None]:
# Work on an independent copy holding only the 'PIE TYPE' variety.
pie_pumpkins = new_pumpkins.loc[new_pumpkins['Variety'] == 'PIE TYPE'].copy()
pie_pumpkins.plot(kind='scatter', x='DayOfYear', y='Price')
plt.show()

In [None]:
# Correlation between day of year and price within the pie-type subset only.
print(pie_pumpkins['DayOfYear'].corr(pie_pumpkins['Price']))

----

## Build a regression model using Scikit-learn

In [None]:
# Discard rows with any missing value before modelling, then confirm the result.
pie_pumpkins = pie_pumpkins.dropna()
pie_pumpkins.info()

**Simple Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Selecting with a one-element list keeps the feature two-dimensional (n_samples, 1).
X = pie_pumpkins[['DayOfYear']].to_numpy()
y = pie_pumpkins['Price']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Fit a linear regression on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict prices for the held-out test inputs.
pred = lin_reg.predict(X_test)

# NOTE: despite the name `mse`, this is the square root of the mean squared error (RMSE).
mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3}')

In [None]:
# NOTE(review): the score is computed on the training split, not the held-out test
# split — confirm this is intended.
score = lin_reg.score(X_train,y_train)
print('Model determination: ', score)

In [None]:
# Test points as a scatter with the fitted line's predictions drawn over them.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test)
ax.plot(X_test, pred)
plt.show()

**Polynomial Regression**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial expansion followed by a linear regression; the bias column is
# left out of the expansion (include_bias=False).
pipeline = make_pipeline(PolynomialFeatures(2, include_bias=False), LinearRegression())

# Train on the same training split used for the simple linear model.
pipeline.fit(X_train,y_train)

# Predictions for the held-out test inputs.
pred = pipeline.predict(X_test)

In [None]:
# Sort the test inputs once (row-wise on the single feature column) so the fitted
# curve is drawn left to right instead of zig-zagging between unordered points.
X_sorted = np.sort(X_test, axis=0)
plt.scatter(X_test,y_test)
plt.plot(X_sorted,pipeline.predict(X_sorted))
plt.show()

In [None]:
# NOTE: despite the name `mse`, this is the square root of the mean squared error (RMSE).
mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3}')

# NOTE(review): the score is computed on the training split, not the test split —
# confirm this is intended.
score = pipeline.score(X_train,y_train)
print('Model determination: ', score)

**Categorical Features**

In [None]:
# One-hot encode Variety: one indicator column per distinct value.
pd.get_dummies(new_pumpkins['Variety'])

**Linear Regression on Variety**

In [None]:
# Features: one-hot encoded Variety only. Target: price, over all bushel-packaged
# rows rather than just the pie-type subset.
X = pd.get_dummies(new_pumpkins['Variety'])
y = new_pumpkins['Price']

In [None]:
def run_linear_regression(X,y):
    """Fit a linear regression on an 80/20 split of (X, y) and print two metrics.

    The data is split with ``test_size=0.2`` and ``random_state=0``. Two lines are
    printed: the square root of the mean squared error on the test split
    (labelled 'Mean error') and the model's ``score`` on the training split
    (labelled 'Model determination'). Nothing is returned.

    Args:
        X: Feature matrix accepted by ``train_test_split`` / ``LinearRegression``.
        y: Target values aligned with ``X``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )

    model = LinearRegression()
    model.fit(features_train, target_train)

    # Error is measured on the held-out rows.
    predicted = model.predict(features_test)
    rmse = np.sqrt(mean_squared_error(target_test, predicted))
    print(f'Mean error: {rmse:3.3}')

    # The score is taken on the rows the model was fitted on.
    train_score = model.score(features_train, target_train)
    print('Model determination: ', train_score)

# Fit and report on the current X and y.
run_linear_regression(X,y)

In [None]:
# Widen the feature matrix: one-hot Variety, numeric Month, one-hot City Name and
# one-hot Package, joined column-wise on the shared row index.
X = (
    pd.get_dummies(new_pumpkins['Variety'])
    .join(new_pumpkins['Month'])
    .join(pd.get_dummies(new_pumpkins['City Name']))
    .join(pd.get_dummies(new_pumpkins['Package']))
)

y = new_pumpkins['Price']

run_linear_regression(X,y)

**Polynomial Regression on Multiple Features**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial expansion followed by a linear regression.
pipeline = make_pipeline(PolynomialFeatures(2), LinearRegression())

# Same 80/20 split parameters as the earlier models, applied to the current X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train,y_train)

# Predictions for the held-out test rows.
pred = pipeline.predict(X_test)

# NOTE: despite the name `mse`, this is the square root of the mean squared error (RMSE).
mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3}')

# NOTE(review): the score is computed on the training split, not the test split —
# confirm this is intended.
score = pipeline.score(X_train,y_train)
print('Model determination: ', score)

---

## Logistic regression to predict categories

**Tidy the data**

In [None]:
# Reload the raw CSV; this rebinds `pumpkins`, replacing the frame that earlier cells
# had reduced to the regression columns.
pumpkins = pd.read_csv('./data/US-pumpkins.csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Keep the categorical columns of interest, discard incomplete rows, then replace
# each column's values with integer codes (the encoder is applied column by column).
new_columns = ['Color','Origin','Item Size','Variety','City Name','Package']
new_pumpkins = (
    pumpkins
    .drop(columns=[col for col in pumpkins.columns if col not in new_columns])
    .dropna()
    .apply(LabelEncoder().fit_transform)
)

new_pumpkins.head()

In [None]:
# Check the column dtypes and non-null counts of the encoded frame.
new_pumpkins.info()

**Build your model**

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns; every kept column except the target, Color.
Selected_features = ['Origin','Item Size','Variety','City Name','Package']

X = new_pumpkins[Selected_features]
y = new_pumpkins['Color']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings, trained on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)
# Predicted Color codes for the held-out test rows.
predictions = model.predict(X_test)

# Report the classification report, the raw predictions and the overall accuracy,
# all evaluated against the test split.
print(classification_report(y_test, predictions))
print('Predicted labels: ', predictions)
print('Accuracy: ', accuracy_score(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels (first argument) against predictions (second).
confusion_matrix(y_test, predictions)