In [None]:
# The previous section covered maps, which transform a DataFrame or Series one value at a time.
# This section introduces groupby(), which splits the data into groups and then
# runs an operation on each group.

# First, reproduce the counts that value_counts() gives, using groupby():
# group the rows by the 'points' column, then count the 'points' entries in each group.
reviews.groupby('points')['points'].count()
# reviews.points.value_counts() reports the same counts; by default it orders them
# by frequency, whereas the groupby result is ordered by the group key.

# groupby() can reproduce any of the summary functions on a per-group basis.
# Group the rows by the 'points' column as before, then take the minimum
# price within each group.
reviews.groupby('points')['price'].min()

# apply() makes groupby considerably more powerful: it runs a function on each group.
# Here the rows are grouped by winery and the first title of each group is returned.
# The lambda receives the sub-DataFrame of one group, selects its 'title' column
# and takes the entry at position 0 (the first entry).
reviews.groupby('winery').apply(lambda winery_rows: winery_rows['title'].iloc[0])

# Two-level grouping: group by country and province together, then use apply()
# to pick, for each group, the row with the highest points.
# idxmax() returns the index label of the maximum value; if the maximum occurs
# more than once, it returns the label of its first occurrence.
# group.loc[label] then selects the whole row carrying that label, and apply()
# assembles one full row per group into a DataFrame.
reviews.groupby(['country', 'province']).apply(
    lambda group: group.loc[group['points'].idxmax()]
)
# The earlier example selected title.iloc[0], so its output held only titles.
# Here no column is selected, so every column of the chosen row is kept.

# Also note the format of the output: the DataFrame is indexed by two values
# (country and province). pandas calls this a MultiIndex, or hierarchical
# indexing; this topic is revisited later in this section.

# agg() runs one or more aggregation operations on each group in a single call.
# Group by the 'country' field, then compute the number of rows (len), the
# minimum price and the maximum price within each group.
# 'min' and 'max' are passed by name rather than as the Python builtins:
# recent pandas versions emit a FutureWarning when builtin min/max are given
# to agg(). The resulting column labels are still 'len', 'min' and 'max'.
reviews.groupby(['country']).price.agg([len, 'min', 'max'])
# Other aggregations commonly used with agg() include sum, mean, median, prod and std.

In [None]:
# Multi-indexes
# As seen in an earlier example, grouping by more than one field produces a
# result with more than one index level. This is called a MultiIndex, or
# hierarchical indexing.

# Group the reviews by country and province, then apply len to the
# 'description' Series of each group, which counts the rows in that group.
countries_reviewed = reviews.groupby(['country', 'province'])['description'].agg([len])
countries_reviewed


mi = countries_reviewed.index
type(mi)
# The index of this DataFrame is a pandas MultiIndex.
# A DataFrame with a MultiIndex needs one label per index level (two here) to retrieve a value.
# Full documentation: the "MultiIndex / advanced indexing" chapter of the pandas
# user guide, https://pandas.pydata.org/docs/user_guide/advanced.html

# A MultiIndex can be confusing to work with; reset_index() converts it back to a plain index.
# drop=False (the default) keeps the former index levels in the DataFrame as ordinary columns.
countries_reviewed.reset_index(drop=False)

In [None]:
# Sorting
# countries_reviewed holds the number of elements in each (country, province) group.
# Convert its index levels back into ordinary columns first.
# NOTE: this reassigns countries_reviewed, so re-running the cell calls
# reset_index() again on the already-reset frame.
countries_reviewed = countries_reviewed.reset_index()
# Then order the rows by the 'len' column; sort_values() sorts in ascending order by default.
countries_reviewed.sort_values('len')

# Pass ascending=False to sort in descending order instead.
countries_reviewed.sort_values('len', ascending=False)

# To sort the DataFrame by its index rather than by column values, use sort_index().
# (The index was reset above to the default integer index, so this returns the
# rows in their current order.)
countries_reviewed.sort_index()

# Finally, sort the DataFrame on more than one column at a time.
# When several columns are given, the first one has the highest priority and
# each later column is only used to order rows that tie on the earlier ones.
countries_reviewed.sort_values(['country', 'len'])