In [None]:
#import libraries we will need throughout the file
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st

In [None]:
# Load the survey file from the local working directory into a pandas data frame.
# Each row is one observation and each column is one variable.
vacationdata = pd.read_csv("Vacation Data.csv")

In [None]:
# Keep a handle on the original column labels; leaving the name as the last
# expression of the cell displays every variable in the dataset.
origcolumnnames = vacationdata.columns
origcolumnnames

In [None]:
# Use a regular expression to remove the leading question tag from variable names:
# the letter Q, one or more digits, a colon and the single character that follows it.
# - The pattern is a raw string so "\d" reaches the regex engine instead of being read
#   as an (invalid) string escape.
# - regex=True is passed explicitly because pandas 2.0 changed the default of
#   str.replace to a literal match, which would silently leave the names unchanged.
newcolumnnames = origcolumnnames.str.replace(r"Q\d+:.", "", regex=True)

# Rename the dataset columns to the new, more descriptive names
# (this doesn't change the file stored on your computer, just how we look at it in python)
vacationdata.columns = newcolumnnames

vacationdata.columns

In [None]:
# Count observations for every combination of two characteristics of interest.
# This is called a cross-tabulation (the margins parameter adds the subtotal row and column).
# What does this table tell you about the data?
citycolumn = vacationdata["Vacation City"]
seasoncolumn = vacationdata["Season"]
vacationbyseason = pd.crosstab(
    citycolumn,
    seasoncolumn,
    margins=True,
    margins_name="Subtotal",
)
vacationbyseason

In [None]:
# Create the cross-tab without margins: the subtotal row/column must not be part of the test input
vacationbyseasonwithoutsubtotal = pd.crosstab(vacationdata["Vacation City"], vacationdata["Season"])

# Chi^2 test of independence on the contingency table - used for categorical data to see
# whether the observed count in each cell matches the count we would expect if the two
# variables (city and season) were unrelated.
#
# c is the aggregate test statistic (short for chi^2).
#     If it's sufficiently large (based on the degrees of freedom),
#     then this data is very unlikely to be seen in the wild given our expectation.
#
# p is the p-value derived from c and dof (it is not simply the inverse of c):
#     the probability [0-1] of seeing a statistic at least this large if our initial
#     hypothesis (the data matches our expectation) is true. The smaller the value, the
#     less likely that is, and the more statistically confident we can be that something
#     is different about this data.
#
# dof is the degrees of freedom. Each statistical test (t, z, chi^2) has a different
#     formula for it; for a contingency table it is (rows - 1) * (columns - 1).
#
# expected is the matrix of expected counts for each (city, season) cell.
#     This is used in calculating the chi^2 value.
c, p, dof, expected = st.chi2_contingency(vacationbyseasonwithoutsubtotal)
print('chi^2=%s p=%s degrees of freedom=%s' % (c,p,dof))

In [None]:
# Cross-tab on 3 values: (city, season) pairs label the rows, the cost variable labels the columns.
# NOTE(review): the cost variable is picked by position (index 30) - confirm which column that is.
rowvariables = [vacationdata["Vacation City"], vacationdata["Season"]]
costvariable = vacationdata.iloc[:, 30]
vacationseasoncost = pd.crosstab(rowvariables, costvariable, colnames=["cost"], margins=True)
vacationseasoncost

In [None]:
# Same chi^2 calculation as for the city/season table, now on the (city, season) x cost table.
# The table is rebuilt without margins so the totals are not fed into the test.
cityseasonkeys = [vacationdata["Vacation City"], vacationdata["Season"]]
vacationseasoncostwithoutsubtotal = pd.crosstab(
    cityseasonkeys,
    vacationdata.iloc[:, 30],
    colnames=["cost"],
)

c, p, dof, expected = st.chi2_contingency(vacationseasoncostwithoutsubtotal)
print('chi^2=%s p=%s degrees of freedom=%s' % (c,p,dof))

In [None]:
# scipy's describe function reports several descriptive statistics for a column in one call
tripdurationcolumn = vacationdata["Trip Duration in Days"]
st.describe(tripdurationcolumn)

In [None]:
# Each descriptive statistic is also available as a method on the series itself
# (note: standard deviation is the square root of variance)
tripdays = vacationdata["Trip Duration in Days"]
median = tripdays.median()
stddev = tripdays.std()

print("median = %s standard deviation=%s" %(median, stddev))

In [None]:
# Box plot data: the minimum, the three quartiles and the maximum.
# (Original author's reading of the output: 6 is greater than 25% of the data and 14 is the
# max value - this depends on the dataset, so confirm against the displayed result.)
boxplotquantiles = [0, 0.25, 0.5, 0.75, 1]
vacationdata["Trip Duration in Days"].quantile(boxplotquantiles)

In [None]:
# Box plots of several columns drawn side by side. What does this tell you?
# pandas exposes this directly on the data frame and calls matplotlib under the hood.
boxplotcolumns = ["Trip Duration in Days", "Age of Traveler"]
vacationdata.boxplot(column=boxplotcolumns)

In [None]:
# Histogram with custom bins: edges every 2 units starting at 10 and stopping before 100
agebins = range(10, 100, 2)
vacationdata.hist(column=["Age of Traveler"], bins=agebins)

In [None]:
# Histograms with automatically chosen bins, one plot per listed column
histogramcolumns = ["Trip Duration in Days", "Number of Travelers"]
vacationdata.hist(column=histogramcolumns)

In [None]:
# Correlation matrix (how much does data trend in the same direction for the variables
# of the associated row and col).
#
# Correlation only works on numbers, so restrict the frame to numeric/boolean columns first.
# Older pandas dropped the other columns silently; pandas 2.0+ raises an error instead, so
# doing the selection explicitly keeps this cell working on every version.
#
# Why are all the diagonal values 1?
vacationdata.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Let's look at the correlation between these two values.
# Only numeric columns can be correlated, so keep just those before calling corr().
# If neither column is stored as a number the result is an empty table. Older pandas
# produced that empty table on its own, but pandas 2.0+ raises an error on text
# columns, which would stop a top-to-bottom run of the notebook.
costpair = vacationdata[["Airfare Cost", "Total Trip Cost (includes air)"]]
costpair.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# That's weird, why didn't a correlation show up?
# Correlation only works on numeric columns, so inspect the type pandas assigned to each column.
vacationdata.dtypes

In [None]:
# Ok, so they're not numbers. Let's tell pandas to treat them as numbers using the
# to_numeric function.
# The conversion is attempted inside try/except: if the text cannot be parsed, the error
# message is printed (it shows why the conversion fails) instead of stopping the notebook,
# and the columns are left untouched.
costcolumns = ["Airfare Cost", "Total Trip Cost (includes air)"]
try:
    vacationdata[costcolumns] = vacationdata[costcolumns].apply(pd.to_numeric)
except (ValueError, TypeError) as conversionerror:
    print("Could not convert the cost columns to numbers: %s" % conversionerror)

In [None]:
# No wonder we can't compute a correlation, the values have extra symbols in them!
# Python isn't smart enough to remove them on its own, so strip the bad symbols ourselves:
# - astype(str) lets the cell be re-run after the columns have already been converted
#   (the .str accessor only works on text);
# - regex=False makes "$" and "," literal characters ("$" is an end-of-string anchor
#   when the pattern is treated as a regular expression);
# - apply() runs the clean-up on each of the two columns, and astype(np.int64) turns the
#   cleaned text into integers.
costcolumns = ["Airfare Cost", "Total Trip Cost (includes air)"]
vacationdata[costcolumns] = (
    vacationdata[costcolumns]
    .astype(str)
    .apply(lambda column: column.str.replace("$", "", regex=False).str.replace(",", "", regex=False))
    .astype(np.int64)
)

In [None]:
# With both cost columns numeric, the Pearson correlation can now be computed
costpaircolumns = ["Airfare Cost", "Total Trip Cost (includes air)"]
vacationdata[costpaircolumns].corr(method="pearson")

In [None]:
# Scatterplot of airfare against total trip cost; alpha=0.5 makes overlapping points visible
fig, ax = plt.subplots()
ax.scatter(
    vacationdata["Airfare Cost"],
    vacationdata["Total Trip Cost (includes air)"],
    alpha=0.5,
)
ax.set_title('Cost relationship')
ax.set_xlabel('Airfare cost')
ax.set_ylabel('Total cost')
plt.show()

In [None]:
# 1-d bar chart: one bar per season, bar height = number of observations in that season
charts = vacationdata["Season"].value_counts()
charts.plot(kind="bar")

In [None]:
# Stacked bar chart, step 1: count observations per (season, city) pair.
# The result should look similar to the first cross-tab, just laid out as one row
# per pair with the count in a 'counts' column.
groupkeys = ["Season", "Vacation City"]
paircounts = vacationdata[groupkeys].groupby(groupkeys).size()
groupedcitydata = paircounts.reset_index(name='counts')
groupedcitydata



In [None]:
# Make an array with one position per city, taking the number of cities from the
# grouped data instead of hard-coding it
ind = np.arange(groupedcitydata["Vacation City"].nunique())

# For each row where the season has the value Fall, what is the count?
fallcounts = groupedcitydata.loc[groupedcitydata["Season"] == "Fall", "counts"]
fallcounts

In [None]:
# Stacked bar chart: one bar per city, one stacked segment per season.
#
# Reshape the grouped counts into a city x season table first. A (season, city) pair with
# no observations has no row in groupedcitydata, so filtering season by season can give
# arrays of different lengths, or arrays whose positions refer to different cities.
# Pivoting and filling the gaps with 0 guarantees every season has exactly one count per
# city, in the same city order.
seasonorder = ["Fall", "Winter", "Spring", "Summer"]
citybyseasoncounts = (
    groupedcitydata
    .pivot(index="Vacation City", columns="Season", values="counts")
    .reindex(columns=seasonorder)
    .fillna(0)
    .astype(int)
)

barpositions = np.arange(len(citybyseasoncounts))

# Draw the seasons bottom-to-top; stackbottom holds the running height of each city's bar.
# Plain numpy arrays (.values) are used so the additions are positional.
stackbottom = np.zeros(len(citybyseasoncounts), dtype=int)
for season in seasonorder:
    seasonheights = citybyseasoncounts[season].values
    plt.bar(barpositions, seasonheights, bottom=stackbottom, label=season)
    stackbottom = stackbottom + seasonheights

plt.ylabel('Freq')
plt.title('Travel Destination by season')
# Label the bars with the city names from the table itself so labels always match the data
plt.xticks(barpositions, citybyseasoncounts.index)
# A tick every 10 observations, up to the tallest stacked bar
plt.yticks(np.arange(0, stackbottom.max() + 10, 10))
plt.legend()
plt.show()
