# Indexing DataFrame

## Setting and removing indexes

In [None]:
# Show the raw table before touching its index
print(temperatures)

# Promote the city column to the row index
temperatures_ind = temperatures.set_index(keys="city")
print(temperatures_ind)

# Undo the index, turning city back into an ordinary column
city_restored = temperatures_ind.reset_index(drop=False)
print(city_restored)

# Undo the index again, this time discarding the city values entirely
city_discarded = temperatures_ind.reset_index(drop=True)
print(city_discarded)

## Subsetting with .loc[]

In [None]:
# Cities of interest
cities = ["Moscow", "Saint Petersburg"]

# Column-based route: build a Boolean mask on the city column, then filter
is_wanted_city = temperatures["city"].isin(cities)
print(temperatures[is_wanted_city])

# Index-based route: look the same labels up directly with .loc[]
wanted_rows = temperatures_ind.loc[cities]
print(wanted_rows)

## Setting multi-level indexes

In [None]:
# Build a two-level row index: country on the outside, city on the inside
index_levels = ["country", "city"]
temperatures_ind = temperatures.set_index(index_levels)

# Each tuple is one (country, city) label to select
rows_to_keep = [
    ("Brazil", "Rio De Janeiro"),
    ("Pakistan", "Lahore"),
]

# Pull out exactly those rows
kept_rows = temperatures_ind.loc[rows_to_keep]
print(kept_rows)

## Sorting by index values

In [None]:
# Order the rows by their index values, using every index level
sorted_by_all_levels = temperatures_ind.sort_index()
print(sorted_by_all_levels)

# Order the rows by the city level of the index only
sorted_by_city = temperatures_ind.sort_index(level="city")
print(sorted_by_city)

# Order by country ascending, then by city descending within each country
sort_levels = ["country", "city"]
sort_ascending = [True, False]
sorted_by_country_then_city_desc = temperatures_ind.sort_index(
    level=sort_levels,
    ascending=sort_ascending,
)
print(sorted_by_country_then_city_desc)

# Slicing

## Slicing index values
DataFrame.loc["start": "end"]
    - Label-based slicing includes both "start" and "end"

In [None]:
# Label slicing needs a sorted index, so sort first
temperatures_srt = temperatures_ind.sort_index()

# Outer-level slice: every row from Pakistan through Russia
pakistan_to_russia = slice("Pakistan", "Russia")
print(temperatures_srt.loc[pakistan_to_russia])

# Attempt the same thing with city names only; a bare label slice is applied
# to the outer (country) level, so this is not a Lahore-to-Moscow subset
lahore_to_moscow = slice("Lahore", "Moscow")
print(temperatures_srt.loc[lahore_to_moscow])

# Full (country, city) tuples as bounds: Pakistan/Lahore through Russia/Moscow
first_row = ("Pakistan", "Lahore")
last_row = ("Russia", "Moscow")
print(temperatures_srt.loc[slice(first_row, last_row)])

## Slicing in both (row and column) directions

In [None]:
# Row bounds: India/Hyderabad through Iraq/Baghdad
row_span = slice(("India", "Hyderabad"), ("Iraq", "Baghdad"))
# Column bounds: date through avg_temp_c
column_span = slice("date", "avg_temp_c")

# Rows only
print(temperatures_srt.loc[row_span])

# Columns only (all rows)
print(temperatures_srt.loc[:, column_span])

# Rows and columns in a single .loc[] call
print(temperatures_srt.loc[row_span, column_span])

## Slicing time series

In [None]:
# Boolean route: keep rows dated from 1 Jan 2010 through 31 Dec 2011
dates = temperatures["date"]
on_or_after_start = dates >= "2010-01-01"
on_or_before_end = dates <= "2011-12-31"
temperatures_bool = temperatures[on_or_after_start & on_or_before_end]
print(temperatures_bool)

# Index route: make date the row index, then sort it so it can be sliced
date_indexed = temperatures.set_index("date")
temperatures_ind = date_indexed.sort_index()

# Whole years 2010 and 2011, given as partial date strings
rows_2010_2011 = temperatures_ind.loc["2010": "2011"]
print(rows_2010_2011)

# August 2010 through February 2011, given as year-month strings
rows_aug_to_feb = temperatures_ind.loc["2010-08": "2011-02"]
print(rows_aug_to_feb)

## Subsetting by row/column number
DataFrame.iloc[start_idx: end_idx]
    * Includes start_idx but excludes end_idx

In [None]:
# Single cell: 23rd row and 2nd column, i.e. zero-based positions 22 and 1
row_pos, col_pos = 22, 1
print(temperatures.iloc[row_pos, col_pos])

# Positional spans reused below
first_five_rows = slice(None, 5)
third_and_fourth_columns = slice(2, 4)

# First 5 rows, every column
print(temperatures.iloc[first_five_rows, :])

# Every row, columns 3 to 4 (positions 2 and 3)
print(temperatures.iloc[:, third_and_fourth_columns])

# Both restrictions at once
print(temperatures.iloc[first_five_rows, third_and_fourth_columns])

## Working with pivot_table

In [None]:
# Derive the calendar year from each date and store it as a new column
year_of_date = temperatures["date"].dt.year
temperatures["year"] = year_of_date
display(temperatures)

# Pivot avg_temp_c: one row per (country, city), one column per year
temp_by_country_city_vs_year = temperatures.pivot_table(
    values="avg_temp_c",
    index=["country", "city"],
    columns="year",
)

# Inspect the pivoted table
print(temp_by_country_city_vs_year)

In [None]:
# Show the full pivot table for reference
display(temp_by_country_city_vs_year)

# Subset for Egypt to India (slice on the outer, country level; both ends included)
display(temp_by_country_city_vs_year.loc["Egypt": "India"])

# Subset for Egypt, Cairo to India, Delhi (full (country, city) tuples as bounds)
display(temp_by_country_city_vs_year.loc[("Egypt", "Cairo"): ("India", "Delhi")])

# Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010.
# The column labels are the integer years produced by .dt.year, so the column
# slice must use integers; string bounds such as "2005" raise a TypeError on an
# integer index in current pandas. .loc[] label slices include both ends.
display(temp_by_country_city_vs_year.loc[("Egypt", "Cairo"): ("India", "Delhi"), 2005: 2010])

In [None]:
# Show the pivot table; the string below is the output recorded from a run
display(temp_by_country_city_vs_year)
"""
 year                              2000    2001    2002    2003    2004  ...    2009    2010    2011    2012    2013
    country       city                                                      ...
    Afghanistan   Kabul             15.823  15.848  15.715  15.133  16.128  ...  15.093  15.676  15.812  14.510  16.206
    Angola        Luanda            24.410  24.427  24.791  24.867  24.216  ...  24.325  24.440  24.151  24.240  24.554
    Australia     Melbourne         14.320  14.180  14.076  13.986  13.742  ...  14.647  14.232  14.191  14.269  14.742
                  Sydney            17.567  17.854  17.734  17.592  17.870  ...  18.176  17.999  17.713  17.474  18.090
    Bangladesh    Dhaka             25.905  25.931  26.095  25.927  26.136  ...  26.536  26.648  25.803  26.284  26.587
    ...                                ...     ...     ...     ...     ...  ...     ...     ...     ...     ...     ...
    United States Chicago           11.090  11.703  11.532  10.482  10.943  ...  10.298  11.816  11.214  12.821  11.587
                  Los Angeles       16.643  16.466  16.430  16.945  16.553  ...  16.677  15.887  15.875  17.090  18.121
                  New York           9.969  10.931  11.252   9.836  10.389  ...  10.142  11.358  11.272  11.971  12.164
    Vietnam       Ho Chi Minh City  27.589  27.832  28.065  27.828  27.687  ...  27.853  28.282  27.675  28.249  28.455
    Zimbabwe      Harare            20.284  20.861  21.079  20.889  20.308  ...  20.524  21.166  20.782  20.523  19.756

"""

# Average down the rows (axis 0) to get one mean per year column
mean_temp_by_year = temp_by_country_city_vs_year.mean(axis=0)
print(mean_temp_by_year)
"""
year
    2000    19.506
    2001    19.679
    2002    19.856
    2003    19.630
    2004    19.672
    2005    19.607
    2006    19.794
    2007    19.854
    2008    19.609
    2009    19.834
    2010    19.912
    2011    19.549
    2012    19.668
    2013    20.312
    dtype: float64
"""

# Keep only the year(s) whose mean equals the overall maximum
highest_yearly_mean = mean_temp_by_year.max()
is_hottest_year = mean_temp_by_year == highest_yearly_mean
print(mean_temp_by_year[is_hottest_year])
"""
 year
    2013    20.312
    dtype: float64
"""



# Average across the year columns (axis 1) to get one mean per (country, city) row
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis=1)
print(mean_temp_by_city)
"""
    country        city
    Afghanistan    Kabul               15.542
    Angola         Luanda              24.392
    Australia      Melbourne           14.275
                   Sydney              17.799
    Bangladesh     Dhaka               26.174
                                        ...
    United States  Chicago             11.331
                   Los Angeles         16.675
                   New York            10.911
    Vietnam        Ho Chi Minh City    27.923
    Zimbabwe       Harare              20.699
    Length: 100, dtype: float64
"""

# Keep only the city (or cities) whose mean equals the overall minimum
lowest_city_mean = mean_temp_by_city.min()
is_coldest_city = mean_temp_by_city == lowest_city_mean
print(mean_temp_by_city[is_coldest_city])
"""
  country  city
    China    Harbin    4.877
    dtype: float64
"""