# Data Manipulation with Pandas

## 0 - Setup Environment

In [1]:
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load every dataset used by the notebook from the local datasets/ folder.
# NOTE: no parse_dates argument is passed, so date columns are read as plain strings.
avocados = pd.read_csv(r'datasets/avocado.csv')
# STR - had to extract data from the exercise to build avocado_2016.csv
avocados_2016 = pd.read_csv(r'datasets/avocado_2016.csv')
homelessness = pd.read_csv(r'datasets/homelessness.csv')
temperatures = pd.read_csv(r'datasets/temperatures.csv')
# the course loads datasets/walmart.csv into the sales variable; going w/ sales (not walmart; see below)
sales = pd.read_csv(r'datasets/walmart.csv')
# this workspace was initialized by loading datasets/walmart.csv into the walmart variable
# walmart = pd.read_csv(r'datasets/walmart.csv')

## 1 - Transforming DataFrames

### Inspecting a DataFrame

##### Print the head of the homelessness data

In [None]:
# Preview the first rows of the homelessness data (.head() returns 5 rows by default)
homelessness_head = homelessness.head()
print(homelessness_head)

##### Print information about homelessness

In [None]:
# .info() writes its summary (dtypes, non-null counts, memory usage) directly to
# stdout and returns None, so it must not be wrapped in print(): doing so appends
# a stray "None" line after the summary.
homelessness.info()

##### Print the shape of homelessness

In [None]:
# .shape is an attribute (no parentheses) holding a (row_count, column_count) tuple
homelessness_shape = homelessness.shape
print(homelessness_shape)

##### Print a description of homelessness

In [None]:
# .describe() computes summary statistics (count, mean, std, quartiles, ...) for the numeric columns
homelessness_summary = homelessness.describe()
print(homelessness_summary)

### Parts of a DataFrame

##### Print the VALUES of homelessness

In [None]:
# .values exposes the cell data as a 2-D NumPy array, without row or column labels
homelessness_values = homelessness.values
print(homelessness_values)

##### Print the COLUMNS INDEX of homelessness

In [None]:
# .columns is the Index object that holds the column labels
column_index = homelessness.columns
print(column_index)

##### Print the ROW INDEX of homelessness

In [None]:
# .index is the Index object that holds the row labels
row_index = homelessness.index
print(row_index)

### Sorting Rows

##### Sort homelessness by region, then descending family members

In [None]:
# pass lists to sort on several columns; each entry of `ascending` pairs with the column at the same position
# (for a single column a plain value is enough, e.g., .sort_values('region'))
sort_columns = ['region', 'family_members']
sort_ascending = [True, False]
homelessness_reg_fam = homelessness.sort_values(by=sort_columns, ascending=sort_ascending)
print(homelessness_reg_fam.head())

### Subsetting Columns

##### Subsetting a Single Column -> Return Type: Series

In [None]:
# if a single column label (a plain string, NOT wrapped in a list) is provided in [...], then the return type is a Series
individuals = homelessness['individuals']
# show the concrete type to confirm that a Series came back
print(f'type(individuals): {type(individuals)}')
print(individuals.head())

##### Subsetting a Single Column -> Return Type: DataFrame

In [None]:
# if a LIST of column labels is provided in [...] (even a list with one label), then the return type is a DataFrame
# NOTE: unlike the previous example, which used 'individuals' as the key, this example uses ['individuals'] as the key
individuals_df = homelessness[['individuals']]
# show the concrete type to confirm that a DataFrame came back
print(f'type(individuals_df): {type(individuals_df)}')
print(individuals_df.head())

##### Subsetting Multiple Columns -> Return Type: DataFrame

In [None]:
# a list of several column labels in [...] returns a DataFrame holding those columns, in the order listed
state_fam = homelessness[['state', 'family_members']]
# show the concrete type to confirm that a DataFrame came back
print(f'type(state_fam): {type(state_fam)}')
print(state_fam.head())

### Subsetting Rows

##### Subsetting Rows using Compound Predicates

In [None]:
# combine boolean masks with the element-wise operators (&, |, ~) rather than and/or/not
# when written inline, each comparison MUST be enclosed in (...) because & binds tighter than < and ==
has_under_1k_family_members = homelessness.family_members < 1000
is_pacific_region = homelessness.region == 'Pacific'
fam_lt_1k_pac = homelessness[has_under_1k_family_members & is_pacific_region]
print(fam_lt_1k_pac)

##### Subsetting Rows by Categorical Variables

In [None]:
# The Mojave Desert states
mojave_states = ['Arizona', 'California', 'Nevada', 'Utah']
# .isin(...) builds a boolean mask that is True where state matches any listed value
is_mojave_state = homelessness.state.isin(mojave_states)
mojave_homelessness = homelessness[is_mojave_state]
print(mojave_homelessness)

### Adding and/or Dropping Columns

##### Append a New Column to the End of a DataFrame

In [None]:
# assigning to a column label that does not exist yet appends it as the last column
total_homeless = homelessness["individuals"] + homelessness["family_members"]
homelessness["total"] = total_homeless
homelessness.info()

##### Drop a Column from a DataFrame

In [None]:
# axis defaults to 0, i.e., drop row(s); axis = 1 is needed to drop a column by its label
# inplace defaults to False, which returns a modified copy; inplace = True changes homelessness itself and returns None
homelessness.drop('total', axis = 1, inplace = True)
homelessness.info()

##### Insert a New Column at the Specified Index

In [None]:
# .insert(...) is an alternative that lets the user choose the position, i.e., index, of the new column
# the position used below is one past the last column (an append); any int b/t 0 and len(homelessness.columns) is valid
insert_position = len(homelessness.columns)
total_homeless = homelessness.individuals + homelessness.family_members
homelessness.insert(insert_position, 'total', total_homeless)
homelessness.info()
# return homelessness to its default state, in case it's used later
homelessness.drop('total', axis = 1, inplace = True)

## 2 - Aggregating DataFrames

### Mean and Median

In [None]:
# .mean() and .median() each reduce the weekly_sales column to a single number
print(f'========== Mean Weekly Sales: {sales.weekly_sales.mean()} ==========')
print(f'========== Median Weekly Sales: {sales.weekly_sales.median()} ==========')

### Summarizing Dates

In [None]:
# .min() / .max() on the date column give the earliest and latest dates
# NOTE(review): sales is loaded without parse_dates, so date is presumably a string column and the
# comparison is lexicographic -- that matches chronological order only for ISO 'YYYY-MM-DD' text; confirm the format
print(f'========== Minimum Sales Date: {sales.date.min()} ==========')
print(f'========== Maximum Sales Date: {sales.date.max()} ==========')

### Efficient Summaries

In [None]:
# Interquartile range (IQR) helper
# if a lambda is used instead, '<lambda>' is used as the row label:
# iqr = lambda column: column.quantile(0.75) - column.quantile(0.25)

# with a named function, the function name ('iqr') is used as the row label
def iqr(column):
    """Return the interquartile range of ``column``: the 0.75 quantile minus the 0.25 quantile."""
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

# the .agg(...) method allows you to apply your own custom functions to a Series or DataFrame
print(f'========== IQR for Temperature (C): {sales.temperature_c.agg(iqr)} ==========')

# apply multiple functions to multiple columns
# the built-in median is requested by name ('median') instead of passing np.median:
# recent pandas releases emit a FutureWarning when a NumPy reduction callable is passed to .agg(...)
print('========== IQR and Median for Temperature (C), Fuel Price and Unemployment ==========')
print(sales[['temperature_c', 'fuel_price_usd_per_l', 'unemployment']].agg([iqr, 'median']))

### Cumulative Statistics

In [None]:
# work on a small subset of sales to keep the example simple:
# rows 1-12 of sales, i.e., positions 0-11, ordered by date
first_12_rows = sales.iloc[:12]
sales_rows_1_12 = first_12_rows.sort_values(by='date')

# running total and running maximum of weekly_sales, in date order
weekly_sales_by_date = sales_rows_1_12['weekly_sales']
sales_rows_1_12['cum_weekly_sales'] = weekly_sales_by_date.cumsum()
sales_rows_1_12['cum_max_sales'] = weekly_sales_by_date.cummax()

columns_to_show = ['date', 'weekly_sales', 'cum_weekly_sales', 'cum_max_sales']
print(sales_rows_1_12[columns_to_show])

### Dropping Duplicates

In [None]:
# Drop duplicate store/type combinations
# this does NOT aggregate data; for each store/type combo it keeps one row (the 1st here) and deletes the other dup rows
# in this EX, 10,762 rows from sales are dropped; the remaining 12 rows are stored in store_types (inplace = F by default)
dedupe_columns = ['store', 'type']
store_types = sales.drop_duplicates(subset=dedupe_columns, keep='first')
first_12_store_types = store_types.head(12)
print(first_12_store_types)

### Counting Categorical Variables

##### Count the Number of Stores for Each Type

In [None]:
# store_types is defined in the previous step (one row per store/type combination)
# .value_counts() tallies the rows per distinct type, most frequent first
store_counts = store_types['type'].value_counts()
print(store_counts)

##### Row Counts for Each Column -> Return Type: Series

In [None]:
# .count() returns a Series of non-missing value counts, labelled by column name
non_null_counts = sales.count()

print('========== Row Counts for All Columns ==========')
print(non_null_counts)

print('========== Row Count for the [store] Column ==========')
# pick a single column's count out of the Series by its label
print(non_null_counts['store'])

##### Get the Proportion of Stores of Each Type

In [None]:
# normalize = True turns the counts into proportions of the total
store_props = store_types['type'].value_counts(normalize=True)
print(store_props)

##### Count the Number of Stores of Each Type and Sort (Ascending)

In [None]:
# sorting works the same for proportions, i.e., .value_counts(normalize = True)
# Notice: ascending = True puts the least frequent type first, so the row order differs from
# the 'Count the Number of Stores for Each Type' EX above
type_column = store_types['type']
store_type_counts_sorted = type_column.value_counts(sort=True, ascending=True)
print(store_type_counts_sorted)

### Grouping

##### Calculate Total Weekly Sales

In [None]:
# grand total of weekly_sales across every row of sales
sales_all = sales.weekly_sales.sum()
print(sales_all)

##### Subset for Type A Stores and Calculate Total Weekly Sales

In [None]:
# select the weekly_sales values of the type "A" rows in one .loc[...] step, then total them
is_type_a = sales["type"] == "A"
sales_A = sales.loc[is_type_a, "weekly_sales"].sum()
print(sales_A)

##### Group by Type and Calculate Total Weekly Sales -> Return Type: Series

In [None]:
# calling a single aggregation f(x), i.e., .sum(), on one grouped column returns a Series indexed by the group key
sales_grouped_by_type = sales.groupby('type')
sales_by_type = sales_grouped_by_type['weekly_sales'].sum()
print(sales_by_type)

##### For Each Store Type, Aggregate weekly_sales: get min, max, mean, and median -> Return Type: DataFrame

In [None]:
# passing multiple aggregation f(x)s to .agg(...) returns a DataFrame with one column per aggregation
# the aggregations are named as strings instead of passing np.min, np.max, np.mean, np.median:
#   - recent pandas releases emit a FutureWarning for NumPy reduction callables passed to .agg(...)
#   - string names always label the columns min/max/mean/median, whereas the label derived from a
#     NumPy callable depends on the installed versions (e.g., amin/amax)
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])
print(sales_stats)

##### Group By Multiple Columns

In [None]:
# grouping by multiple columns returns a Series or DataFrame, depending on the number of f(x)s passed to .agg(...)
# the primary difference from grouping on a single column is that grouping on multiple columns returns a MultiIndex
# the aggregation is named as a string ('mean') rather than passing np.mean, which recent pandas
# releases flag with a FutureWarning
store_type_stat = sales.groupby(["store", "type"])["weekly_sales"].agg("mean")
print(store_type_stat)
print(f'========== Index Data Type: {type(store_type_stat.index)} ==========')

### Pivot Tables - Part 1

##### Pivot for Mean and Median weekly_sales by Store Type and Holiday

In [None]:
# multiple values passed to index arg results in a MultiIndex on the y-axis, i.e., rows
# multiple values passed to columns OR aggfunc args results in a MultiIndex on the x-axis, i.e., columns
# the aggregations are named as strings ('mean', 'median') rather than passing np.mean / np.median,
# which recent pandas releases flag with a FutureWarning
mean_median_sales_by_type_holiday = sales.pivot_table(
    values='weekly_sales',
    index='type',
    columns='is_holiday',
    aggfunc=['mean', 'median'],
)
print(f'========== Index Type (single value passed to index arg): {type(mean_median_sales_by_type_holiday.index)} ==========')
print(f'========== Columns Type (multiple values passed to aggfunc arg): {type(mean_median_sales_by_type_holiday.columns)} ==========')
print(mean_median_sales_by_type_holiday)

##### Print Mean weekly_sales by Department and Type; Fill Missing Values with 0

In [3]:
# aggfunc is not given, so pivot_table uses its default aggregation (the mean)
# fill_value = 0 replaces the missing department/type combinations with 0
mean_sales_by_department_type = sales.pivot_table(
    values='weekly_sales',
    index='department',
    columns='type',
    fill_value=0,
)
print(mean_sales_by_department_type)
# also show the first rows of the underlying sales data
print(sales.head())

type                    A              B
department                              
1            30961.725379   44050.626667
2            67600.158788  112958.526667
3            17160.002955   30580.655000
4            44285.399091   51219.654167
5            34821.011364   63236.875000
...                   ...            ...
95          123933.787121   77082.102500
96           21367.042857    9528.538333
97           28471.266970    5828.873333
98           12875.423182     217.428333
99             379.123659       0.000000

[80 rows x 2 columns]
   store type  department  ... temperature_c  fuel_price_usd_per_l  unemployment
0      1    A           1  ...      5.727778              0.679451         8.106
1      1    A           1  ...      8.055556              0.693452         8.106
2      1    A           1  ...     16.816667              0.718284         7.808
3      1    A           1  ...     22.527778              0.748928         7.808
4      1    A           1  ...     27.0

## 3 - Slicing and Indexing DataFrames

### Setting and Removing Indexes

##### Index temperatures by city

In [None]:
# with inplace = False (default) .set_index(...) returns a new object; with inplace = True it returns None
# by default, column(s) promoted to the index are removed from the DF's columns; they reside in Index/MultiIndex objects
temperatures_ind = temperatures.set_index(keys='city')
print(temperatures_ind)

##### Reset, i.e., remove, Index

In [None]:
# inplace arg behaves the same as w/ .set_index(...): inplace = True changes temperatures_ind itself and returns None
# resetting an index moves the index back into regular column(s)
temperatures_ind.reset_index(inplace = True)
print(temperatures_ind)

### Subsetting with .loc[]

##### Subset temperatures_ind using .loc[]

In [None]:
# to use .loc[cities], city MUST be in the index
temperatures_ind = temperatures.set_index('city')

# a list of index labels selects every row whose label appears in the list
cities = ['Moscow', 'Saint Petersburg']
rows_for_cities = temperatures_ind.loc[cities]
print(rows_for_cities)

### Setting Multi-Level Indexes

In [None]:
# a list of columns passed to .set_index(...) builds a multi-level index (outer level first)
index_levels = ['country', 'city']
temperatures_ind = temperatures.set_index(index_levels)

# with a multi-level index, each row label is a tuple: (outer level, inner level)
rows_to_keep = [('Brazil', 'Rio De Janeiro'), ('Pakistan', 'Lahore')]
selected_rows = temperatures_ind.loc[rows_to_keep]
print(selected_rows)

### Sorting by Index Values

##### Sort Entire Index

In [None]:
# with no arguments, .sort_index() sorts on every index level, outer to inner, ascending
sorted_by_all_levels = temperatures_ind.sort_index()
print(sorted_by_all_levels)

##### Sort a Single Level in a Multi-Level Index

In [None]:
# level= names the index level to sort on
sorted_by_city = temperatures_ind.sort_index(level='city')
print(sorted_by_city)

##### Sort Multiple Levels of a Multi-Level Index

In [None]:
# each entry of `ascending` pairs with the level at the same position:
# country ascending, then city descending within each country
levels_to_sort = ['country', 'city']
level_ascending = [True, False]
sorted_by_country_city = temperatures_ind.sort_index(level=levels_to_sort, ascending=level_ascending)
print(sorted_by_country_city)

### Slicing Index Values

##### Slicing a Single Level of a Multi-Level Index

In [None]:
# sort the index first, then slice by label on the outer (country) level
# label slices with .loc[] include BOTH endpoints
temperatures_srt = temperatures_ind.sort_index()
pakistan_to_russia = temperatures_srt.loc['Pakistan':'Russia']
print(pakistan_to_russia)

##### Slicing Multiple Levels of a Multi-Level Index

In [None]:
# to slice on the inner level too, give each endpoint as a (country, city) tuple
first_row_label = ('Pakistan', 'Lahore')
last_row_label = ('Russia', 'Moscow')
print(temperatures_srt.loc[first_row_label:last_row_label])

##### Slice Columns date:avg_temp_c for All Rows

In [None]:
# ':' keeps all rows; the second argument slices columns by label, endpoints included
date_to_avg_temp = temperatures_srt.loc[:, 'date':'avg_temp_c']
print(date_to_avg_temp)

##### Slice Columns and Rows

In [None]:
# slice rows (by (country, city) tuples) and columns (by label) in a single .loc[] call
first_row_label = ('India', 'Hyderabad')
last_row_label = ('Iraq', 'Baghdad')
print(temperatures_srt.loc[first_row_label:last_row_label, 'date':'avg_temp_c'])

##### Slicing Time Series

In [None]:
# The date column is read from CSV as plain strings. Slicing a string index with '2010':'2011'
# compares text, and every '2011-..' value sorts AFTER '2011', so all of 2011 would be silently
# left out. Converting to datetime first gives a DatetimeIndex, where a partial date string such
# as '2011' stands for the whole period.
# .assign(...) returns a new DataFrame, so temperatures itself is left unchanged.
temperatures_ind = (
    temperatures
    .assign(date=pd.to_datetime(temperatures['date']))
    .set_index('date')
    .sort_index()
)

# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc['2010':'2011'])

# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc['2010-08-01':'2011-02-28'])

### Subsetting by Row/Column Number

##### Get 23rd row, 2nd column (index 22, 1)

In [None]:
# .iloc[] is position-based and zero-indexed: row position 22, column position 1
row_position, column_position = 22, 1
print(temperatures.iloc[row_position, column_position])

##### Use slicing to get the first 5 rows

In [None]:
# positional slices exclude the end position: [:5] selects row positions 0-4
first_five_rows = temperatures.iloc[:5]
print(first_five_rows)

##### Use slicing to get columns 3 to 5 (indexes 2-4)

In [None]:
# ':' keeps all rows; 2:5 selects column positions 2, 3 and 4 (the end position is excluded)
columns_at_positions_2_to_4 = temperatures.iloc[:, 2:5]
print(columns_at_positions_2_to_4)

##### Use slicing in both directions at once

In [None]:
# row positions 0-4 and column positions 2-4 in one call (end positions are excluded)
top_left_block = temperatures.iloc[:5, 2:5]
print(top_left_block)

### Pivot Tables - Part 2

##### Multi-Level Row Index Pivot

In [None]:
# convert date from a string to a datetime type, then derive an integer year column from it
# NOTE: both assignments modify temperatures itself
temperatures['date'] = pd.to_datetime(temperatures['date'], yearfirst=True)
temperatures['year'] = temperatures['date'].dt.year

# Pivot avg_temp_c by country and city (rows) vs year (columns)
temp_by_country_city_vs_year = temperatures.pivot_table(
    values='avg_temp_c',
    index=['country', 'city'],
    columns='year',
)

# See the result
print(temp_by_country_city_vs_year)

##### Subsetting Pivot Tables

In [None]:
# row index: (country, city); column index: year
# the year labels are integers (they come from .dt.year), so the column slice uses the
# integers 2005:2010 rather than the strings '2005':'2010', which do not match the label type
# .loc[] slices by label, so both endpoints (2005 and 2010) are included
print(temp_by_country_city_vs_year.loc[('Egypt', 'Cairo'):('India', 'Delhi'), 2005:2010])

##### Calculating on a Pivot Table

In [None]:
# Get the worldwide mean temp by year; axis = 'rows' by default, which aggregates down the rows
# and leaves one value per column, i.e., per year
mean_temp_by_year = temp_by_country_city_vs_year.mean()

print('========== Year with the Highest Mean Temp ==========')
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])

# Get the mean temp by city; axis = 'columns' aggregates across the year columns
# and leaves one value per (country, city) row
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis = 'columns')

# banner corrected: the value reported below is a city, not a year
print('========== City with the Lowest Mean Temp ==========')
# Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])

## 4 - Creating and Visualizing DataFrames

### Simple Bar Plot

In [None]:
# Total number of avocados sold for each size
avocados_by_size = avocados.groupby('size')
nb_sold_by_size = avocados_by_size['nb_sold'].sum()

# one bar per size
nb_sold_by_size.plot(kind='bar')
plt.show()

### Simple Time Series Line Plot

In [None]:
# Total number of avocados sold on each date
avocados_by_date = avocados.groupby('date')
nb_sold_by_date = avocados_by_date['nb_sold'].sum()

# the dates (the group keys) run along the x-axis
nb_sold_by_date.plot(kind='line')
plt.show()

### Scatter Plot - Relational Comparison

In [None]:
# .plot.scatter(...) is the accessor form of .plot(kind='scatter', ...)
avocados.plot.scatter(
    x='nb_sold',
    y='avg_price',
    title='Number of avocados sold vs. average price',
)
plt.show()

### Histograms - Multiple Plots on the Same Chart

In [None]:
# Overlay the avg_price histograms of both avocado types, each with 20 bins
# alpha = 0.5 makes the bars semi-transparent so the overlap stays visible
is_conventional = avocados['type'] == 'conventional'
is_organic = avocados['type'] == 'organic'

avocados.loc[is_conventional, 'avg_price'].hist(alpha=0.5, bins=20)
avocados.loc[is_organic, 'avg_price'].hist(alpha=0.5, bins=20)

# legend entries follow the order in which the histograms were drawn
plt.legend(['conventional', 'organic'])
plt.show()

### Finding Missing Values

In [None]:
# boolean DataFrame of the same shape: True wherever a value is missing
is_missing = avocados_2016.isna()

print('========== Check Individual Values for Missing Values ==========')
print(is_missing)

print('========== Check Each Column for Missing Values ==========')
# .any() collapses each column to a single True/False
print(is_missing.any())

print('========== Bar Plot of Missing Values by Variable ==========')
# summing booleans counts the True values, i.e., the missing values per column
missing_per_column = is_missing.sum()
missing_per_column.plot(kind='bar')
plt.show()

### Remove ALL Rows with 1 or More Missing Values

In [None]:
# .dropna() returns a copy without the rows that hold at least one missing value
avocados_complete = avocados_2016.dropna()
# confirm that no column has missing values any more
any_missing_by_column = avocados_complete.isna().any()
print(any_missing_by_column)

### Replacing Missing Values

In [None]:
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]

# distributions BEFORE filling
avocados_2016.loc[:, cols_with_missing].hist()
plt.show()

# replace every missing value with 0, then plot the same columns again for comparison
avocados_filled = avocados_2016.fillna(0)
avocados_filled.loc[:, cols_with_missing].hist()
plt.show()

### Creating DataFrames

##### List of Dictionaries - Builds DataFrame Row by Row - Each Dictionary Maps to a Row

In [None]:
# Create a list of dictionaries with new data; each dictionary becomes one row
# (each KVP maps the K to the Column Name & maps the V to the Field Value)
row_2019_11_03 = {'date': '2019-11-03', 'small_sold': 10376832, 'large_sold': 7835071}
row_2019_11_10 = {'date': '2019-11-10', 'small_sold': 10717154, 'large_sold': 8561348}
avocados_list = [row_2019_11_03, row_2019_11_10]

# Convert the list of rows into a DataFrame
avocados_2019 = pd.DataFrame(avocados_list)
print(avocados_2019)

##### Dictionary of Lists - Builds DataFrame Column by Column - Each KVP Maps to a Column

In [None]:
# Create a dictionary of lists with new data; each list becomes one column
# (each KVP maps the K to the Column Name & maps the V to the Values for the entire column)
avocados_dict = dict(
    date=['2019-11-17', '2019-12-01'],
    small_sold=[10859987, 9291631],
    large_sold=[7674135, 6238096],
)

# Convert the dictionary into a DataFrame and show it
avocados_2019 = pd.DataFrame(avocados_dict)
print(avocados_2019)

In [1]:
from zip_util import compress_folder, decompress_folder
import os, sys
from pathlib import Path

# Archive the datasets folder of the current working directory into dm_datasets.zip
cwd = Path.cwd()
datasets = cwd / 'datasets'
archive = cwd / 'dm_datasets.zip'
# NOTE(review): the meaning of the third positional argument (True) is defined by zip_util -- confirm
compress_folder(datasets, archive, True)

Compressing /work/files/workspace/datasets to /work/files/workspace/dm_datasets.zip
Compressed /work/files/workspace/datasets to /work/files/workspace/dm_datasets.zip
