In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Source: https://download.bls.gov/pub/time.series/ap/
# Save ap.data.3.food locally as a tab-separated .txt file before running.
apfood = pd.read_csv('ap.data.3.food.txt', sep='\t')
# NOTE(review): BLS flat files may pad header names with spaces -- confirm
# against the downloaded file. Stripping is a no-op when the headers are
# already clean and lets the column lookups in later cells resolve.
apfood.columns = apfood.columns.str.strip()
# Bracket assignment always targets the column; attribute-style assignment
# would silently create a plain attribute if no such column existed.
# errors='coerce' turns unparseable entries into NaN instead of raising.
apfood['value'] = pd.to_numeric(apfood['value'], errors='coerce')

In [None]:
# Trim surrounding whitespace from every series_id so that the
# fixed-position slices taken from it later line up.
apfood['series_id'] = apfood['series_id'].str.strip()

In [None]:
# See section 4 of https://download.bls.gov/pub/time.series/ap/ap.txt for the
# series_id layout. This notebook reads the seasonal code at position 2 and the
# area code at positions 3-6, so the item code is whatever follows that
# 7-character prefix. Slicing from the front (rather than taking the last six
# characters) keeps the result correct for item codes that are not exactly six
# characters long, and is identical for the usual 13-character ids.
apfood['item_code'] = apfood['series_id'].str[7:]

In [None]:
# Show column names, dtypes and non-null counts for the raw price table.
apfood.info()

In [None]:
# Preview the first rows, including the derived item_code column.
print(apfood.head())

In [None]:
# Item lookup table (item_code -> description).
# Source: https://download.bls.gov/pub/time.series/ap/
# Save ap.item locally as a tab-separated .txt file before running.
apitem = pd.read_csv('ap.item.txt', delimiter='\t')

In [None]:
# Preview the item lookup table.
print(apitem.head())

In [None]:
# Attach the item lookup columns to every price row. The join is an inner join
# on item_code, so rows without a matching item are dropped.
food = apfood.merge(apitem, how='inner', on='item_code')

In [None]:
# Show column names, dtypes and non-null counts for the merged table.
food.info()

In [None]:
# Preview the merged price/item table.
print(food.head())

# Start here with your answer to question 1:

## Part 1 – Extract Variables from Series ID

1.	Extract the area code from the Series ID. It is four digits. Store it as a string. Insert the series as a new column called ‘area’ in the DataFrame.

In [None]:
# Characters 3-6 of series_id hold the four-digit area code. Slice with the
# vectorised .str accessor (the same approach used for item_code) instead of a
# Python-level loop; the values stay strings, so codes such as '0000' keep
# their leading zeros.
area_code = food['series_id'].str[3:7]
food['area'] = area_code
food.head()

2.	Print out the unique area code values that appear. [5 pts]

In [None]:
# Distinct area codes present in the merged data.
food['area'].unique()

3.	Extract the seasonal code from the Series ID. It is one character. Store as string. Insert the series as a new column called ‘season’ in the DataFrame.

In [None]:
# The seasonal code is the single character at position 2 of series_id.
# Use the vectorised .str accessor (the same approach used for item_code)
# rather than a Python-level loop; the result is a column of one-character strings.
food['season'] = food['series_id'].str[2]
food.head()

4.	Print out the unique season code values that appear. [5 pts]

In [None]:
# Distinct seasonal codes present in the merged data.
food['season'].unique()

## Part 2 – Fruit Price comparison: Apples, Bananas, and Oranges

### Guiding question: How do the prices of apples, bananas, and oranges compare to each other? Have the prices changed over the years? Do the prices change within each year?

1.	Filter the data:

    - Use item codes: Apples 711111, Bananas 711211, Oranges 711311
    - Use area code = ‘0000’ (US City avg only)
    - Filter to data from January 2000 and later
    - Print the shape of the resulting filtered data [10 pts]

In [None]:
# One boolean mask per filter criterion; they are combined in the next cell.
a = food['item_code'].isin(['711111', '711211', '711311'])  # apples, bananas, oranges
b = food['area'].eq('0000')                                 # U.S. city average only
c = food['year'].ge(2000)                                   # 2000 and later

In [None]:
# Keep only the rows that satisfy all three masks, then report (rows, columns).
df2 = food[a & b & c]
df2.shape

2.	Change the index to be dates of the first of each month

    - For example, the index for 2000, M01 should be 2000-01-01
    - The index for 2019, M05, should be 2019-05-01
    - Print the head of the resulting DataFrame [10 pts]

In [None]:
# Build a first-of-month timestamp for every row from `year` and the numeric
# part of `period` (e.g. 'M05' -> 5). pd.to_datetime assembles dates from
# year/month/day columns in a single vectorised call. The earlier row-wise
# apply relied on the `pd.datetime` alias, which current pandas releases no
# longer provide, and on positional x[0]/x[1] access into a labelled row.
month_start = pd.to_datetime(pd.DataFrame({
    'year': df2['year'].astype(int),
    'month': df2['period'].str[1:].astype(int),
    'day': 1,
}))
# Pass the raw values so the new index is unnamed; reset_index() then yields
# an 'index' column, exactly as it did with the previous implementation.
df2 = df2.set_index(pd.DatetimeIndex(month_start.values))

In [None]:
# Preview the filtered frame with its new index.
df2.head()

3.	Create a plot showing three lines: one for each fruit versus the month (with appropriate labels, tick marks, and legends) [10 pts]


In [None]:
# One line per fruit, drawn from the date-indexed frame. Looping over a
# code -> label mapping replaces three copy-pasted plot calls, and plotting
# against the index directly avoids depending on the column name that
# reset_index() happens to generate.
fruit_labels = {'711111': 'Apple', '711211': 'Banana', '711311': 'Orange'}
fig, ax = plt.subplots()
for code, label in fruit_labels.items():
    # sort_index keeps each line chronological regardless of row order.
    prices = df2.loc[df2['item_code'] == code, 'value'].sort_index()
    ax.plot(prices.index, prices.values, label=label)
ax.legend()
ax.set_title('Average price')
ax.set_xlabel('Month')  # the x axis was previously unlabeled
ax.set_ylabel('Cost per Pound $')
plt.show()

4.	Let’s say we want to just look at overall trends by looking at the average price over the year.

    - Calculate the average price of the fruit for each year. For example, find the mean price of apples during the year 2000, during the year 2001, etc.
    - Print your results. Should be a DataFrame with 20 rows (one for each year) and 3 columns (one for each fruit). Partial points for separate series or DataFrames. [10 pts]

In [None]:
# Mean price per (year, fruit), reshaped so that years are rows and fruits are
# columns; item codes are replaced by readable fruit names.
fruit_names = {'711111': 'Apple', '711211': 'Banana', '711311': 'Orange'}
yearly_prices = df2.groupby(['year', 'item_code'])['value'].mean()
df_mean = (
    yearly_prices
    .unstack()
    .rename(columns=fruit_names)
    .rename_axis('item', axis='columns')
)
df_mean

- Create a plot showing the annual cost of the fruit (with appropriate labels, tick marks set via `plt.xticks`, and legends) [10 pts]


In [None]:
# Annual mean price per fruit. Tick positions come from the years actually
# present in df_mean (every second year) instead of a hard-coded 2000-2018
# range, so the axis stays correct if the data covers different years.
# Plotting from df_mean's index avoids calling reset_index() once per line.
fig, ax = plt.subplots()
for fruit in ['Apple', 'Banana', 'Orange']:
    ax.plot(df_mean.index, df_mean[fruit].values, label=fruit)
ax.margins(x=0)  # no horizontal padding around the first/last year
ax.set_xticks(list(df_mean.index[::2]))
ax.legend()
ax.set_xlabel('year')
ax.set_title('Annual Average Price')
ax.set_ylabel('Cost per Pound $')
plt.show()

5.	When we look at the monthly prices, we see that oranges seem to exhibit seasonal prices. Let’s explore this further

    - Group the data by month and calculate the average price for each fruit for that month
i.	For example, find the average price of oranges in the month of January (this value is calculated by averaging the January prices across all of the years), and the average price of oranges in the month of February, etc.
    - Print your results (should be a 12 x 3 DataFrame). Partial points for separate series or DataFrames [10 pts]

In [None]:
# Mean price per (period, fruit) across all years, reshaped so that periods
# are rows and fruits are columns; item codes become readable fruit names.
month_fruit_names = {'711111': 'Apple', '711211': 'Banana', '711311': 'Orange'}
period_prices = df2.groupby(['period', 'item_code'])['value'].mean()
df_mean_month = (
    period_prices
    .unstack()
    .rename(columns=month_fruit_names)
    .rename_axis('item', axis='columns')
)
df_mean_month

- Create a plot (with appropriate labels, tick marks, and legends). [10 pts]


In [None]:
# Average price by month of year. The month number is parsed from each period
# code ('M01' -> 1), so tick positions and labels always match the periods
# actually present, instead of assuming exactly twelve rows at positions 0-11.
month_numbers = [int(period[1:]) for period in df_mean_month.index]
# Take the year range for the title from the filtered data; the filter has no
# upper bound, so a hard-coded end year can be wrong.
first_year, last_year = int(df2['year'].min()), int(df2['year'].max())

fig, ax = plt.subplots()
for fruit in ['Apple', 'Banana', 'Orange']:
    ax.plot(month_numbers, df_mean_month[fruit].values, label=fruit)
ax.margins(x=0)  # no horizontal padding around the first/last month
ax.set_xticks(month_numbers)
ax.legend()
ax.set_xlabel('Month')
ax.set_title('Average monthly cost for years {}-{}'.format(first_year, last_year))
ax.set_ylabel('Cost per Pound $')
plt.show()

6.	Let’s find months where there was an unusual spike in the cost of a fruit.

    - Read the help on the function pd.DataFrame.diff()
    - Find and print the mean price change between months [5 pts]


In [None]:
# Wide table of prices: rows follow df2's current index, one column per fruit.
fruit_columns = {'711111': 'Apple', '711211': 'Banana', '711311': 'Orange'}
df3 = df2.pivot(columns='item_code', values='value')
df3 = df3.rename(columns=fruit_columns)
df3.head(20)

In [None]:
# Row-to-row change for each fruit (each row minus the one before it),
# followed by the per-fruit mean of those changes.
month_price_change = df3.diff(periods=1)
print(month_price_change.head())
means = month_price_change.mean(axis='index')
means

- Find and print the standard deviation of the price changes between months [5 pts]

In [None]:
# Per-fruit standard deviation of the row-to-row price changes.
stds = month_price_change.std(axis='index')
stds

- Identify unusual price spikes: ones that are more than 2 SD above the mean monthly price change
- For example, there’s a month where the cost of bananas jumped 5.7 cents, which was unusual. Identify the other ones. [10 pts]

In [None]:
# Rows where the apple price change exceeds its mean by more than 2 standard deviations.
month_price_change[month_price_change['Apple'].gt(means['Apple'] + 2 * stds['Apple'])]

In [None]:
# Rows where the banana price change exceeds its mean by more than 2 standard deviations.
month_price_change[month_price_change['Banana'].gt(means['Banana'] + 2 * stds['Banana'])]

In [None]:
# Rows where the orange price change exceeds its mean by more than 2 standard deviations.
month_price_change[month_price_change['Orange'].gt(means['Orange'] + 2 * stds['Orange'])]

7.	0 to 5 points bonus for answers:

    - Comment on the cost relationship between apples, bananas, and oranges.
    - Just saying bananas are cheaper than apples or oranges will not earn you any bonus points.

8.	0 to 5 points bonus for answers:

    - Explore differences in cost of fruit between regions.

In [None]:
# Re-index the full merged table by first-of-month timestamps built from `year`
# and the numeric part of `period` (e.g. 'M05' -> 5). pd.to_datetime assembles
# the dates from year/month/day columns in one vectorised call. The earlier
# row-wise apply relied on the `pd.datetime` alias, which current pandas
# releases no longer provide, and on positional x[0]/x[1] access into a
# labelled row.
food_month_start = pd.to_datetime(pd.DataFrame({
    'year': food['year'].astype(int),
    'month': food['period'].str[1:].astype(int),
    'day': 1,
}))
# Pass the raw values so the resulting index is unnamed, as before.
food = food.set_index(pd.DatetimeIndex(food_month_start.values))