In [None]:
# 0. Run this code to import the libraries we will need.
import pandas as pd #data analysis and manipulation tool
import numpy as np #mathematical functions
import matplotlib.pyplot as plt #collection of functions creates a figure, creates a plotting area
import seaborn as sns #data visualization library based on matplotlib
import os # provides functions for interacting with the operating system
import geopy #locate the coordinates of addresses, cities, countries, and landmarks across the globe
import io

"""
In this section, we will review:
- selecting columns
- filtering rows by characteristic
- creating a new variable
- creating a boxplot, line plot, and heat map in seaborn

And we will learn something new about:
- syntax for converting from wide to long format
- dropping columns
- combining multiple commands on a DataFrame
- what a pandas Index is
- how to set a DataFrame index

Be sure that you have downloaded ice_cream_by_shop.csv before starting.
"""

# Load the ice cream sales file into a DataFrame and display it.
# The path is relative to the notebook's working directory.
ic_wide = pd.read_csv('./sample_data/ice_cream_by_shop.csv')
print(ic_wide)

# Build a table holding only the ID number, address, Wednesday, and Thursday
# columns. Indexing a DataFrame with a list of names selects those columns.
subset_columns = ["ID", "address", "Wednesday", "Thursday"]
ic_subset = ic_wide[subset_columns]
print(ic_subset)

In [None]:
## Converting to long format and viewing distributions

# Reshape from wide to long: 'ID' and 'address' stay as identifier columns,
# the name of every other column goes into 'day', and its value into 'count'.
ic_long = pd.melt(
    ic_wide,
    id_vars=['ID', 'address'],
    var_name='day',
    value_name='count',
)
print(ic_long)

In [None]:
# Boxplot of the count of ice creams purchased on each day of the week,
# drawn from the long data frame.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=ic_long, x="day", y="count", ax=ax)
ax.set(
    xlabel="Day of week",
    ylabel="Count",
    title="Count of ice cream purchased by day of the week",
)

plt.show()

In [None]:
# Using the ic_long dataframe, create a time-series plot that shows the amount
# of ice cream sold each day of the week, with one line per store.

# Group the long table by store ID once and reuse the grouping below.
sales_by_store = ic_long.groupby('ID')
print(type(sales_by_store))

# Iterating a groupby yields (key, sub-frame) pairs, so each group can be
# printed directly without looking it up again with get_group().
for store_id, store_sales in sales_by_store:
    print(store_sales)

# Draw every store on one explicitly created Axes, so the labelling code below
# does not depend on a variable that only exists if the loop ran at least once.
fig, ax = plt.subplots(figsize=(15, 8))
for store_id, store_sales in sales_by_store:
    sns.lineplot(x='day', y='count', data=store_sales, marker='o', label=store_id, ax=ax)

ax.set_xlabel("Day of week")
ax.set_ylabel("Count")
ax.set_title("Count of ice cream purchased by day of the week")
ax.legend(title='Store ID', ncol=2, loc='best')
plt.show()

In [None]:
# Same per-store line plot as before, with thicker lines and the legend moved
# outside the plotting area.
# The Axes is created explicitly and handed to seaborn, so the labelling code
# does not rely on a variable assigned inside the loop body.
fig, ax = plt.subplots(figsize=(15, 8))
for store_id, store_sales in ic_long.groupby('ID'):
    sns.lineplot(
        x='day',
        y='count',
        data=store_sales,
        marker='o',
        label=store_id,
        linewidth=3,
        ax=ax,
    )

ax.set_xlabel("Day of week")
ax.set_ylabel("Count")
ax.set_title("Count of ice cream purchased by day of the week")
# bbox_to_anchor=(1, 1) anchors the legend at the upper-right corner of the
# Axes, which places it outside of the plot.
ax.legend(title='Store ID', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
#plot a heat map of ice cream sales for these stores.
#This won't work as written: a heat map expects numeric values, not string values.
#plt.figure(figsize = (15,8))
#sns.heatmap(data=ic_wide)
#plt.show()

In [None]:
# drop() returns a new DataFrame and leaves ic_wide untouched, so the result
# has to be stored (or used directly) to have any effect. Compute it once and
# reuse it for both the printout and the heat map.
ic_no_address = ic_wide.drop(columns='address')
print(ic_no_address)

# ic_wide itself still contains the 'address' column.
print(ic_wide)

# Heat map of every remaining column. Note that 'ID' is still an ordinary
# column here, so it is drawn as if it were data.
plt.figure(figsize=(15, 8))
sns.heatmap(data=ic_no_address)
plt.show()

In [None]:

# set_index() returns a new DataFrame whose row labels are the 'ID' values;
# ic_wide is not modified. Store the result once instead of computing it,
# discarding it, and computing it again for the printout.
ic_by_id = ic_wide.set_index('ID')
print(ic_by_id)

# Chain both steps: remove the text column, then use 'ID' as the index so that
# only the remaining columns are left as data for the heat map.
ic_reformatted = ic_wide.drop(columns='address').set_index('ID')
print(ic_reformatted)

plt.figure(figsize=(15, 8))

# With 'ID' as the index, each heat map row is labelled with its store ID.
ax = sns.heatmap(data=ic_reformatted)
ax.set_xlabel("Day of week")
ax.set_ylabel("Store ID")
ax.set_title("Count of ice cream purchased by day of the week, WNG")
plt.show()

In [None]:
# Show the full table so that its column names are visible.
print(ic_wide)
# This is an example of a list comprehension: it walks over the column names of
# ic_wide and keeps only the names that end in 'day'.
days_columns = [col for col in ic_wide.columns if col.endswith('day')]
print(days_columns)


In [None]:
# explain: col for col in ic_wide.columns if col.endswith('day')
# The list comprehension is shorthand for the explicit loop below: visit every
# column name, test it, and collect the names that pass the test.
print(ic_wide.columns)
days_columns2 = []
for col in ic_wide.columns:
    # Use the same test as the comprehension so both versions select the same
    # columns.
    if col.endswith('day'):
        print(f'{col} - yes')
        days_columns2.append(col)
    else:
        print(f'{col} - no')

# Holds the same names the list comprehension produces.
print(days_columns2)

In [None]:

# Run this code to get the data frame subset that is only these days.
ic_days = ic_wide[days_columns]
print(ic_days)

# With no axis argument, sum() adds down each column: one total per day,
# combined over all rows.
print(ic_days.sum())

# axis=1 adds across the columns instead: one total per row, labelled by the
# row index.
print(ic_days.sum(axis=1))



In [None]:
# Add up the day columns across each row and store the result as a new
# 'total_per_shop' column on ic_wide.
shop_totals = ic_wide[days_columns].sum(axis=1)
ic_wide["total_per_shop"] = shop_totals
print(ic_wide)

In [None]:
# Let's try out the geopy library!
# Run this code to look up the given address and see details.
first_query = "201 1st St 94022"

# Nominatim is the geocoder used here; it is created with the user agent
# string "example".
geolocator = geopy.Nominatim(user_agent="example")
location = geolocator.geocode(first_query)

# Leaving the variable as the last expression displays the lookup result.
location

In [None]:
# Run this to see even more details: .raw holds the lookup result's raw data.
raw_details = location.raw
print(raw_details)

# The latitude is available as an attribute of the result.
found_latitude = location.latitude
print(found_latitude)

# The longitude is available in the same way.
found_longitude = location.longitude
print(found_longitude)

In [None]:
# Look up a second address and print the raw details of the result.
# (A bare `location` expression in the middle of a cell displays nothing, so
# only the explicit print is kept.)
location = geolocator.geocode("4301 University Way NE 98105")

print(location.raw)

In [None]:
#Run this to get the latitude for each address. It will store this information in
#the data frame. (ignore the SettingWithCopyWarning, if it appears)

def getLatitude(address, geolocator, timeout=10000):
    """Return the latitude the geocoder reports for an address.

    Parameters
    ----------
    address : str
        Address text passed to ``geolocator.geocode``.
    geolocator : object
        Geocoder exposing a ``geocode(address, timeout=...)`` method.
    timeout : int or float, optional
        Value forwarded to ``geocode`` as its ``timeout`` argument. geopy
        documents this value as a number of seconds. The default keeps the
        value this notebook has always used.

    Returns
    -------
    float
        The ``latitude`` attribute of the lookup result, or NaN when the
        geocoder returns ``None`` for the address.
    """
    location = geolocator.geocode(address, timeout=timeout)
    # No match: return NaN rather than failing on the attribute lookup, so one
    # unresolvable address does not abort a whole column of lookups.
    if location is None:
        return float("nan")
    return location.latitude

def getLongitude(address, geolocator, timeout=10000):
    """Return the longitude the geocoder reports for an address.

    Parameters
    ----------
    address : str
        Address text passed to ``geolocator.geocode``.
    geolocator : object
        Geocoder exposing a ``geocode(address, timeout=...)`` method.
    timeout : int or float, optional
        Value forwarded to ``geocode`` as its ``timeout`` argument. geopy
        documents this value as a number of seconds. The default keeps the
        value this notebook has always used.

    Returns
    -------
    float
        The ``longitude`` attribute of the lookup result, or NaN when the
        geocoder returns ``None`` for the address.
    """
    location = geolocator.geocode(address, timeout=timeout)
    # No match: return NaN rather than failing on the attribute lookup, so one
    # unresolvable address does not abort a whole column of lookups.
    if location is None:
        return float("nan")
    return location.longitude

# Geocode every address exactly once and reuse the result for both columns.
# Applying getLatitude and getLongitude separately sends each address to the
# geocoding service twice, doubling the number of network requests.
locations = [
    geolocator.geocode(address, timeout=10000)
    for address in ic_wide['address']
]

# geocode() returns None for an address it cannot resolve; store NaN for those
# rows instead of failing on the attribute lookup.
ic_wide['latitudes'] = [
    loc.latitude if loc is not None else float("nan") for loc in locations
]
ic_wide['longitude'] = [
    loc.longitude if loc is not None else float("nan") for loc in locations
]
ic_wide

In [None]:
# Using Seaborn, create a scatterplot for the total_per_shop vs the latitude,
# with descriptive axis labels and a title.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=ic_wide, x="total_per_shop", y="latitudes", ax=ax)
ax.set(
    xlabel="Number of ice cream",
    ylabel="Store latitudes ",
    title="Weekly total ice cream sold per store location",
)
plt.show()