In [None]:


import pandas as pd #data analysis and manipulation tool
import numpy as np #mathematical functions
import matplotlib.pyplot as plt #collection of functions creates a figure, creates a plotting area
import seaborn as sns #data visualization library based on matplotlib

# 1b. Datasets
# The possum measurements are read straight from the Rdatasets CSV mirror.
possum_csv_url = "https://vincentarelbundock.github.io/Rdatasets/csv/DAAG/possum.csv"
possum = pd.read_csv(possum_csv_url)

# 2. The possum data frame consists of nine morphometric measurements on each of
# 104 mountain brushtail possums, trapped at seven sites from Southern Victoria
# to central Queensland (and measured, then released).
# Preview the first rows to see what the possum dataset looks like
# (as the last expression of the cell, the frame is displayed by the notebook):
possum.head()



In [None]:
# Total number of cells in the frame (rows x columns).
possum.size

In [None]:
# DataFrame.info is a method: without the call parentheses the cell only
# echoes the bound-method object instead of printing the concise summary
# (column names, non-null counts and dtypes).
possum.info()

In [None]:
# 3. Here is the code for a scatterplot in seaborn:
#      sns.scatterplot(x="hdlngth", y="skullw", data=possum)
# Notice that the syntax includes the names of the columns "in quotes",
# but the name of the dataframe, possum, is not in quotes.
# Run this code to see what a scatterplot looks like in seaborn.
# The call is made exactly once: a second call before plt.show() would draw
# the same points again on the same (current) Axes.
sns.scatterplot(x="hdlngth", y="skullw", data=possum)
plt.show()

In [None]:
# 4*. Keep the Axes returned by scatterplot in a variable so the plot can be
# modified afterwards.
ax = sns.scatterplot(data=possum, x="hdlngth", y="skullw")
# The plot shows skull width vs head length of 104 mountain brushtail possums.
# Title and axis labels (with units, mm) are applied in one Axes.set call.
ax.set(
    title="Plot is of skull width vs head length of 104 mountain brushtail possums",
    xlabel="head length (mm)",
    ylabel="skull width (mm)",
)
plt.show()

In [None]:
# 5*. Add a third dimension to encode age using the size of the dots.
# Starting point from the exercise (shown for reference only, not executed):
#      ax = sns.scatterplot(x="hdlngth", y="skullw", data=possum)
# Include size="age" in the list of options given to scatterplot.
# Only the sized version is drawn; running the unsized call first would put a
# second, uniform-size layer of the same points on the same Axes.
# Label the title and axes appropriately; age is in years.
ax = sns.scatterplot(x="hdlngth", y="skullw", size="age", data=possum)
ax.set_title("Plot is of skull width vs head length of 104 mountain brushtail possums")
ax.set_xlabel("head length (mm)")
ax.set_ylabel("skull width (mm)")
# Re-create the legend so that it carries a title with the unit of age.
ax.legend(title='Age (Year)')
plt.show()

In [None]:
# 6*. Encode the age of each possum as colour instead of size
# (hint from the exercise: use the "hue" option).
ax = sns.scatterplot(data=possum, x="hdlngth", y="skullw", hue="age")
ax.set(
    title="Plot is of skull width vs head length of 104 mountain brushtail possums",
    xlabel="head length (mm)",
    ylabel="skull width (mm)",
)
# Rebuild the legend with a title naming the encoded variable and its unit.
ax.legend(title='Age (Year)')
plt.show()

In [None]:
# 7*. Encode the sex of each possum as hue, instead of the age.
ax = sns.scatterplot(data=possum, x="hdlngth", y="skullw", hue="sex")
ax.set(
    title="Plot is of skull width vs head length of 104 mountain brushtail possums",
    xlabel="head length (mm)",
    ylabel="skull width (mm)",
)
# Rebuild the legend with a title naming the encoded variable.
ax.legend(title='Sex')
plt.show()

In [None]:
# 8*. Encode the sex of each possum as both the hue and the marker shape
# (hint from the exercise: the shape is encoded using the "style" option).
ax = sns.scatterplot(data=possum, x="hdlngth", y="skullw", hue="sex", style="sex")
ax.set(
    title="Plot is of skull width vs head length of 104 mountain brushtail possums",
    xlabel="head length (mm)",
    ylabel="skull width (mm)",
)
# Rebuild the legend with a title naming the encoded variable.
ax.legend(title='Sex')
plt.show()

In [None]:
# 9*. Finally, see how skull width and head length are correlated by plotting
# the points together with a best-fit line.
# Question: is the correlation positive or negative?
sns.lmplot(data=possum, x="hdlngth", y="skullw")
plt.show()
# Answer: positive correlation

In [None]:
"""## Distributions"""

# 10*. Histogram of the ages of the possums, with the x axis, y axis and
# title labelled.
ax = sns.histplot(data=possum, x="age")
ax.set(
    xlabel='age',
    ylabel='frequency',
    title="Histogram of the ages of the possums",
)
plt.show()

In [None]:
# 11*. The previous histogram is misleading because the ages are whole numbers:
# it is unclear whether a bar's count belongs to the age on its left or on its
# right edge. Centering each bar on its integer value removes that ambiguity
# and also avoids two ages being merged into one bar at the end of the range.
#
# Same plot as before, with the "discrete" option switched on.
# Reference page: https://seaborn.pydata.org/generated/seaborn.histplot.html

ax = sns.histplot(data=possum, x="age", discrete=True)
ax.set(
    xlabel='age',
    ylabel='frequency',
    title="Histogram of the ages of the possums",
)
plt.show()

In [None]:
# 13*. Violin plot of the length of the possums at each site:
# site goes on the x axis, totlngth on the y axis.
ax = sns.violinplot(data=possum, x='site', y='totlngth')
ax.set(
    xlabel='Site',
    ylabel='Length',
    title="Length of the possums at each site",
)
plt.show()

In [None]:
# 15. Load the vocabulary dataset from the Rdatasets CSV mirror and print its
# first rows to see what it looks like.
vocab_csv_url = "https://vincentarelbundock.github.io/Rdatasets/csv/carData/Vocab.csv"
vocab = pd.read_csv(vocab_csv_url)
print(vocab.head())


In [None]:
# DataFrame.info is a method: it has to be called to print the concise summary
# (column names, non-null counts and dtypes); the bare attribute only echoes
# the bound-method object.
vocab.info()

In [None]:
# 16. Run this code to summarize the vocab by year and see the results:
# Mean education and mean vocabulary score for every survey year.
# reset_index() returns a NEW frame whose index is the default 0, 1, 2, ...
# and in which "year" is an ordinary column again; chaining it here means the
# grouped result is never mutated in place (no inplace=True needed).
vocab_summary_by_year = (
    vocab.groupby("year")[["education", "vocabulary"]]
    .mean()
    .reset_index()
)
print(vocab_summary_by_year.head())

In [None]:

# 17*. Line plots showing how education and vocabulary have changed over the
# years. These use vocab_summary_by_year (the per-year aggregate of the
# original vocab dataset), not vocab itself.
# The x axis carries the survey year; labels include units.
# Dataset documentation (for the units):
# https://vincentarelbundock.github.io/Rdatasets/doc/carData/Vocab.html

# 17a*. Education line plot:
ax = sns.lineplot(data=vocab_summary_by_year, x='year', y='education')
ax.set(
    xlabel='Year of the survey',
    ylabel='Education (years)',
    title="Education change over years",
)
plt.show()

In [None]:
# 17b*. Vocabulary line plot (mean test score per survey year):
ax = sns.lineplot(data=vocab_summary_by_year, x='year', y='vocabulary')
ax.set(
    xlabel='Year of the survey',
    ylabel='Vocabulary test score: number correct on a 10-word test',
    title="Vocabulary change over years",
)
plt.show()

In [None]:
# 17c*. Have education and vocabulary both changed? Does the change in
# education at this point seem to relate to vocabulary scores?
# Answer: both have changed, but the two do not seem to be related.

# 18*. To see in more detail how education and vocabulary relate, try a
# scatterplot of the raw (un-aggregated) vocab data.
sns.scatterplot(data=vocab, x="education", y="vocabulary")
# Observation: this plot alone cannot reveal the relationship between
# education and vocabulary; it only shows how the points are distributed.
plt.show()

In [None]:
# Hmm, it does not give us much information. Why not?
# What is it about this plot that makes it impossible for us to see the
# underlying information?
#

# 19*. Here are two more ways of visualizing the relationship between education
# and vocabulary. What information do each of these plots give in terms of
# understanding the relationship between these two variables?
# plot (a) - Note: this takes a moment to run, due to the data size and particular
# plot type
#
# A joint plot shows the relationship between two variables together with the
# distribution of each variable on its own. It consists of three panels: the
# central one shows how x and y relate, and the two marginal ones show the
# individual distributions.
# kind="kde" draws the panels with kernel density estimation, i.e. the central
# panel visualizes the bivariate distribution as a smooth density.
sns.jointplot(x="education", y="vocabulary", data=vocab, kind="kde")
# Render the figure here, like every other plotting cell; without this the
# cell ends on the jointplot call and only echoes the returned grid object.
plt.show()

In [None]:
# plot (b) - scatter of education vs vocabulary with a fitted regression line;
# it can tell us which test scores go with which number of education years.
# Note: lmplot returns a seaborn FacetGrid (not a matplotlib Axes); the result
# is still stored under the name `ax`, as before.
ax = sns.lmplot(data=vocab, x="education", y="vocabulary")
plt.show()

In [None]:
# Ice-cream data: one row per shop, one column per flavour.
ice_cream_dict = {
    "Shop": ['Cold Stone', 'Baskin Robbins', 'Tin Pot Creamery', 'Cream'],
    "Chocolate": [2, 4, 6, 1],
    "Vanilla": [4, 2, 1, 4],
    "Mango": [2, 3, 4, 0],
    "Mint": [4, 1, 1, 0],
    "Coconut": [1, 2, 1, 3],
}
# Build the frame and use the shop name as the row index in one expression.
ice_cream = pd.DataFrame(ice_cream_dict).set_index("Shop")
# 20*. Run this code to see the ice cream dataset. Is this long or wide data?
print(ice_cream)


In [None]:
# 21. This data is already in the correct form for plotting in a heat map. Run this
# code to see what the default heat map of this data looks like.
ax = sns.heatmap(ice_cream, center=0)
ax.set_ylabel('Shop')
# The columns are ice-cream flavours, so the axis label and the title say
# "Flavor" (the previous text read "Favor").
ax.set_xlabel('Flavor')
ax.set_title("Heat map between shop and flavor")
plt.show()

In [None]:

# 22*. Customize this heat map by doing both of these changes:
# - Turn on annotations (annot=True)
# - Specify a color map (cmap=) with an appropriate color map option.
# Here are some example color maps:
# https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/
# NOTE(review): every value in ice_cream is >= 0, so with center=0 only one
# half of the diverging "PiYG" map is actually used - confirm whether a
# sequential color map was intended.
ax = sns.heatmap(ice_cream, center=0, annot=True, cmap="PiYG")
ax.set_ylabel('Shop')
# The columns are ice-cream flavours, so the axis label and the title say
# "Flavor" (the previous text read "Favor").
ax.set_xlabel('Flavor')
ax.set_title("Heat map between shop and flavor")
plt.show()