In [None]:
# 0. Run this code to import the libraries we will need.
import pandas as pd #data analysis and manipulation tool
import numpy as np #mathematical functions
import matplotlib.pyplot as plt #collection of functions creates a figure, creates a plotting area
import seaborn as sns #data visualization library based on matplotlib
import os # provides functions for interacting with the operating system
import geopy #locate the coordinates of addresses, cities, countries, and landmarks across the globe
import io

In [None]:

"""
In this section, we will review:
- selecting columns
- filtering rows by characteristic
- creating a new variable
- creating a boxplot, line plot, and heat map in seaborn

And we will learn something new about:
- syntax for converting from wide to long format
- dropping columns
- combining multiple commands on a DataFrame
- what a pandas Index is
- how to set a DataFrame index

Be sure that you have downloaded ice_cream_by_shop.csv before starting.
"""

# Load ice_cream_by_shop.csv (expected in the notebook's working directory)
# into a DataFrame and print it so we can see what we are working with.
ic_wide = pd.read_csv("./ice_cream_by_shop.csv")
print(ic_wide)

# Keep every row but only four columns: the shop ID, its address, and the
# Wednesday and Thursday sales. Passing a list of names to .loc selects
# exactly those columns, in the order given.
ic_subset = ic_wide.loc[:, ["ID", "address", "Wednesday", "Thursday"]]
print(ic_subset)

In [None]:
## Converting from wide to long format
## melt() "unpivots" a DataFrame: every column that is NOT listed in id_vars
## is collapsed into two columns -- one holding the former column name and
## one holding the value that was stored under it.

### id_vars:    column(s) repeated on every output row as identifiers
### var_name:   name of the column that receives the former column names
###             (if None, frame.columns.name or 'variable' is used)
### value_name: name of the column that receives the values

ic_long = pd.melt(
    ic_wide,
    id_vars=["ID", "address"],
    var_name="day",
    value_name="count",
)
print(ic_long)

In [None]:
# Boxplot built from the long data frame: one box per value of "day",
# summarising the distribution of "count" for that day.
ax = plt.figure(figsize=(15, 8)).add_subplot()
sns.boxplot(data=ic_long, x="day", y="count", ax=ax)
ax.set(
    xlabel="Day of week",
    ylabel="Count",
    title="Count of ice cream purchased by day of the week",
)

plt.show()

In [None]:
# Total count per day as a two-column frame (day, count), ordered from the
# smallest total to the largest. The original row labels are kept after sorting.
df_day_count = (
    ic_long.groupby("day")["count"]
    .sum()
    .reset_index(name="count")
    .sort_values(by="count", ascending=True)
)
df_day_count


In [None]:
# Bar chart of the per-day totals held in df_day_count: one bar per day.
ax = sns.barplot(data=df_day_count, x="day", y="count")
ax.set(ylabel="Count", xlabel="Day", title="Bar plot of Day Count")
plt.show()

In [None]:
# Using the ic_long dataframe, create a time-series plot that shows the amount
# of ice cream sold each day of the week, drawing one line per store ID.

# groupby('ID') returns a GroupBy object; printing it only shows the object
# itself. Iterating over it yields (key, sub-frame) pairs.
temp = ic_long.groupby('ID')
print(temp)

print("############### - print data groupby 'ID' as Key and the row for that ID")
for key, item in temp:
    # `item` already holds the rows for this ID, so no extra get_group() lookup
    # is needed.
    print(item)

# Create the axes up front so `ax` exists even when ic_long has no rows, and
# so every line is drawn onto the same, explicitly named axes.
fig, ax = plt.subplots(figsize=(15, 8))
for label, grp in temp:
    # label = the store ID (group key), grp = the rows belonging to that store
    sns.lineplot(x='day', y='count', data=grp, marker='o', label=label, ax=ax)

ax.set_xlabel("Day of week")
ax.set_ylabel("Count")
ax.set_title("Count of ice cream purchased by day of the week")
ax.legend(title='Store ID', ncol=2, loc='best')
plt.show()

In [None]:

# set_index() returns a NEW DataFrame that uses "ID" as its row index; ic_wide
# itself is not modified, so the result has to be printed or assigned.
print("######## Display the dataframe use ID as index")
print(ic_wide.set_index('ID'))

# Two commands chained on one DataFrame: index the rows by ID, then drop the
# address column. What is left becomes the grid of the heatmap.
ic_reformatted = ic_wide.set_index('ID').drop(columns='address')
print(">>>>>>>>>> new dataframe use ID as index and drop address ")
print(ic_reformatted)

# Heatmap: one row per index label (store ID), one column per remaining column.
ax = plt.figure(figsize=(15, 8)).add_subplot()
sns.heatmap(data=ic_reformatted, ax=ax)
ax.set(
    xlabel="Day of week",
    ylabel="Store ID",
    title="Count of ice cream purchased by day of the week",
)
plt.show()

In [None]:
# Print the full frame again so its column names are visible.
print(ic_wide)
# List comprehension: walk over every column name of ic_wide and keep only the
# names that end with 'day' (e.g. 'Wednesday', 'Thursday'), collecting them
# into a new list.
days_columns = [col for col in ic_wide.columns if col.endswith('day')]
print(days_columns)





In [None]:
# Explains the comprehension
#     [col for col in ic_wide.columns if col.endswith('day')]
# by writing it out as an explicit loop and printing the decision made for
# each column name.
print(ic_wide.columns) # i.e Header
days_columns2 = []
for col in ic_wide.columns:
    # Use the same test as the comprehension. A substring test such as
    # col.find('es') != -1 would also accept 'address' and would reject every
    # day name that does not contain 'es'.
    if col.endswith('day'):
        print(f'{col} - yes')
        days_columns2.append(col)
    else:
        print(f'{col} - no')

print(days_columns2)

In [None]:

# Show the subset of ic_wide that contains only the columns in days_columns.
print(ic_wide.loc[:, days_columns])

# Summing along the index (the default) adds down each column: the result has
# one total per day column.
print("##### Sum each column \n")
print(ic_wide.loc[:, days_columns].sum(axis="index"))

# axis tells pandas which direction to reduce along. axis="columns" (the same
# as axis=1) adds across each row: the result has one total per row, i.e. per
# shop.
print(">>>>>>>> Sum each row \n")
print(ic_wide.loc[:, days_columns].sum(axis="columns"))



In [None]:
# Add a new column holding, for every row, the sum across the day columns.
ic_wide["total_per_shop"] = ic_wide.loc[:, days_columns].sum(axis="columns")
print(ic_wide)

In [None]:
# Let's try out the geopy library!
# Build a Nominatim geocoder client, look up one address, and display the
# result as the cell's output.
geolocator = geopy.geocoders.Nominatim(user_agent="example")
location = geolocator.geocode(query="201 1st St 94022")
location

In [None]:
# Print three attributes of the geocoding result, one per line:
#   location.raw        - the full record with even more details
#   location.latitude   - the latitude of the match
#   location.longitude  - the longitude of the match
print(location.raw, location.latitude, location.longitude, sep="\n")

In [None]:
# Geocode a second address (this rebinds `location`) and print its raw record.
location = geolocator.geocode(query="4301 University Way NE 98105")
print(location.raw)

In [None]:
# Run this to get the latitude and longitude for each address and store them in
# the data frame (ignore the SettingWithCopyWarning, if it appears).
## The apply() function in pandas applies a function along an axis of a DataFrame
## or to each element of a Series.
## This is extremely useful for performing custom operations on your data.

def getLatitude(address, g, timeout=10000):
    """Return the latitude of ``address``, or NaN when it cannot be geocoded.

    Args:
        address: Address string to look up.
        g: Geocoder object exposing ``geocode(address, timeout=...)``.
        timeout: Value forwarded to ``geocode`` as its ``timeout`` argument.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        The ``latitude`` attribute of the geocoding result, or ``float("nan")``
        when the geocoder returns ``None`` (no match for the address).
    """
    location = g.geocode(address, timeout=timeout)
    if location is None:
        # No match: return NaN so that one unresolvable address leaves a gap
        # in the column instead of aborting the whole apply() with an
        # AttributeError.
        return float("nan")
    return location.latitude

def getLongitude(address, geolocator, timeout=10000):
    """Return the longitude of ``address``, or NaN when it cannot be geocoded.

    Args:
        address: Address string to look up.
        geolocator: Geocoder object exposing ``geocode(address, timeout=...)``.
        timeout: Value forwarded to ``geocode`` as its ``timeout`` argument.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        The ``longitude`` attribute of the geocoding result, or
        ``float("nan")`` when the geocoder returns ``None`` (no match for the
        address).
    """
    location = geolocator.geocode(address, timeout=timeout)
    if location is None:
        # No match: return NaN so that one unresolvable address leaves a gap
        # in the column instead of aborting the whole apply() with an
        # AttributeError.
        return float("nan")
    return location.longitude

# Geocode every address and store the coordinates as two new columns.
# apply() calls the given function once per address; the lambda forwards the
# shared geocoder as the second argument.
ic_wide['latitudes'] = ic_wide['address'].apply(
    lambda addr: getLatitude(addr, geolocator)
)
ic_wide['longitude'] = ic_wide['address'].apply(
    lambda addr: getLongitude(addr, geolocator)
)
ic_wide

In [None]:
# Seaborn scatterplot with total_per_shop on the x-axis and the geocoded
# latitude on the y-axis, with descriptive axis labels and a title.
ax = plt.figure(figsize=(15, 8)).add_subplot()
sns.scatterplot(data=ic_wide, x="total_per_shop", y="latitudes", ax=ax)
ax.set(
    ylabel="Store latitudes ",
    xlabel="Number of ice cream",
    title="Weekly total ice cream sold per store location",
)
plt.show()