In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load every dataset used in this lesson up front, grouped by the section that uses it.

# Simple Data Cleansing (comma-separated files)
simple1_df, simple2_df = (
    pd.read_csv(csv_name)
    for csv_name in ('simple_data_cleansing_1.csv', 'simple_data_cleansing_2.csv')
)

# Single Table Manipulations (tab-separated, hence sep='\t')
solo_df = pd.read_csv('solo_table.tsv', sep='\t')

# Combining Tables: concat examples
concat_a_df, concat_b_df, concat_c_df = (
    pd.read_csv(tsv_name, sep='\t')
    for tsv_name in ('concat_table_a.tsv', 'concat_table_b.tsv', 'concat_table_c.tsv')
)

# Combining Tables: merge examples (tx_df is reused for the pivot tables)
tx_df, shipment_df = (
    pd.read_csv(tsv_name, sep='\t')
    for tsv_name in ('tx_table.tsv', 'shipment_table.tsv')
)

# Excel to Python Lesson 2: Pandas in the Mist

Lesson 1 focused on getting your data loaded into Pandas. 
Now, we'll start manipulating that data into something useful.

This lesson is meant to be introductory level and will act as a survey of things that are possible.
Future lessons will get into the digital weeds.

Lesson Topics:
* Simple Data Cleansing
    * Handling Nulls
    * Handling Duplicates
    * Handling Duplicates
* Single Table Manipulations
    * Splitting Columns
    * Dropping Columns
    * Aggregations
* Combining Tables
    * Concat - Combining Data
    * Merge - Database Style Joins
* Excel Lyfe
    * VLOOKUP
    * Pivot Tables
    
    
    

## Simple Data Cleansing

### Handling Nulls
`df.dropna() # drops if anything in the row is null`

`df.dropna(how='all') # drops if the entire row is null`

`df.dropna(axis=1, how='all') # drops columns where all data is null`

`df.fillna(0, inplace=True) # replaces every null with 0`

`df.fillna({1: 'X', 2: 0}, inplace=True) # replaces nulls per column: 'X' in column 1, 0 in column 2`



In [None]:
# Let's look at the data we have
# (a bare expression on the last line of a cell is rendered as the cell's output)
simple1_df

In [None]:
# Default behaviour: a row is removed as soon as it contains at least one null.
# Only the returned copy is displayed; simple1_df itself is not modified.
simple1_df.dropna(how='any')

In [None]:
# Keep partially-filled rows; discard only the rows in which every value is null.
simple1_df.dropna(axis='index', how='all')

In [None]:
# what if a column was all nulls?
# Preview the first rows of the second sample table.
simple2_df.head()

In [None]:
# favorite_color holds nothing but nulls, so remove it.
# axis='columns' makes dropna test whole columns instead of rows.
new_simple_df = simple2_df.dropna(axis='columns', how='all')
new_simple_df

In [None]:
# Next, discard the row that is null in every remaining column.
new_simple_df = new_simple_df.dropna(axis='index', how='all')
new_simple_df

In [None]:
# First attempt at replacing the null bird values with wren.
# No column is specified, so every null in the frame receives the value.
new_simple_df.fillna(value='wren')

In [None]:
# Let's try again by specifying the column to replace.
# The dict maps column name -> replacement value, so only favorite_bird is filled.
# Rebinding the result has the same effect as `inplace=True` here, but avoids
# mutating the frame in place.
new_simple_df = new_simple_df.fillna({'favorite_bird': 'wren'})
new_simple_df

In [None]:
# And do the same for the name: only nulls in the name column are replaced.
# Rebinding the result has the same effect as `inplace=True` here, but avoids
# mutating the frame in place.
new_simple_df = new_simple_df.fillna({'name': 'Anonymous User'})
new_simple_df

### Handling Duplicates

`df.drop_duplicates()  # drops the pure duplicates`

`df.drop_duplicates(['col1', 'col2'], keep='last')`

In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence.
new_simple_df.drop_duplicates(keep='first')

In [None]:
# Rows sharing the same name count as duplicates; only the last one of each is kept.
new_simple_df.drop_duplicates(subset=['name'], keep='last')

## Single Table Manipulations


In [None]:
# Preview the first rows of the table used throughout this section.
solo_df.head()

#### Splitting Columns

In [None]:
# Let's start by splitting the customer_name field into a first_name and last_name.
# n=1 splits on the first space only, so the result never has more than two
# columns; without it, a name containing a second space (e.g. a middle name)
# would expand to three columns and the two-column assignment would fail.
solo_df[['first_name', 'last_name']] = solo_df['customer_name'].str.split(' ', n=1, expand=True)
solo_df.head()

#### Dropping Columns

In [None]:
# Let's get rid of customer_name and transit_code.
# errors='ignore' keeps this cell re-runnable: running it a second time no
# longer raises KeyError for columns that are already gone. The result is
# rebound to solo_df instead of using inplace=True.
solo_df = solo_df.drop(columns=['customer_name', 'transit_code'], errors='ignore')
solo_df.head()

#### Aggregations

##### Order Total Amount by Quarter

If this were SQL, we could do it like this:

```
SELECT quarter, SUM(order_total_amt)
FROM solo_df
GROUP BY quarter;
```


In [None]:
# Sum every numeric column per quarter.
# numeric_only=True keeps text columns (such as first_name / last_name) out of
# the aggregation; pandas >= 2.0 would otherwise try to "sum" them as well.
solo_df.groupby('quarter').sum(numeric_only=True)

In [None]:
# Select the one column of interest after grouping, then sum it:
# one total of order_total_amt per quarter.
solo_df.groupby('quarter')['order_total_amt'].sum()

In [None]:
# Several statistics at once: each keyword names an output column and its
# value names the aggregation applied to order_total_amt.
solo_df.groupby('quarter')['order_total_amt'].agg(
    min_order='min',
    max_order='max',
    avg_order='mean',
    total_order='sum',
)

In [None]:
# Total order amount per last_name (the column created by the name split).
solo_df.groupby(['last_name'])['order_total_amt'].sum()

In [None]:
# cumsum() accumulates order_total_amt row by row, in the frame's current order.
solo_df['running_total'] = solo_df['order_total_amt'].cumsum()
solo_df.head(20)

## Combining Tables

### Concat - Combining Data

#### Add more of the same data to the end of a dataframe

In [None]:
# Preview the first table to be combined.
concat_a_df.head()

In [None]:
# Preview the second table to be combined.
concat_b_df.head()

In [None]:
# Stack concat_b_df underneath concat_a_df. ignore_index=True gives the result
# a fresh 0..n-1 row index instead of keeping each frame's original labels.
combined = pd.concat([concat_a_df, concat_b_df], axis='index', ignore_index=True)
combined

In [None]:
# Display concat_c_df before combining it with concat_a_df in the next cells.
concat_c_df

In [None]:
# What happens here?
# (row-wise concat of concat_a_df and concat_c_df; original row labels are kept)
pd.concat([concat_a_df, concat_c_df], axis='index')

In [None]:
# How about now?
# (same two frames, but placed side by side and aligned on their row labels)
pd.concat([concat_a_df, concat_c_df], axis='columns')

### Merge - Database Style Joins

In [None]:
# Preview the transaction table (left side of the joins below).
tx_df.head()

In [None]:
# Preview the shipment table (right side of the joins below).
shipment_df.head()

In [None]:
# Let's join the tables together!
# how='inner' is merge's default, spelled out here: only transaction_id values
# that appear in both tables make it into the result.
tx_shipment_df = pd.merge(
    tx_df,
    shipment_df,
    how='inner',
    on='transaction_id',
)
tx_shipment_df

In [None]:
# do we have all the data?
# Row counts, one per line: transactions, shipments, joined result.
print(len(tx_df), len(shipment_df), len(tx_shipment_df), sep='\n')

In [None]:
# Why does this matter? Let's sum up the order totals
# before and after the join and compare them.
print('Transaction Total:', tx_df['order_total_amt'].sum())
print('Transaction Shipment Total:', tx_shipment_df['order_total_amt'].sum())

In [None]:
# An inner join drops transactions that have no matching shipment, so we need
# a left (outer) join instead: how='left' keeps every row of tx_df and leaves
# the shipment columns null where no shipment matches.
tx_shipment_outer_df = pd.merge(tx_df, shipment_df, on='transaction_id', how='left')
print(len(tx_shipment_outer_df))
print('Transaction Shipment Outer Total:', tx_shipment_outer_df['order_total_amt'].sum())

In [None]:
# Display the left-joined result.
tx_shipment_outer_df

## Excel Lyfe

### VLOOKUP

In [None]:
# Read both sheets in a single pass over the workbook: passing a list of sheet
# names makes read_excel return a dict of {sheet name: DataFrame}.
vlookup_sheets = pd.read_excel(
    'vlookup-example_raw.xlsx',
    sheet_name=['transaction', 'product_lookup'],
)
vlookup_tx_df = vlookup_sheets['transaction']
vlookup_prod_df = vlookup_sheets['product_lookup']

In [None]:
# Display the 'transaction' sheet of the workbook.
vlookup_tx_df

In [None]:
# Display the 'product_lookup' sheet of the workbook.
vlookup_prod_df

In [None]:
# VLOOKUP equivalent: bring the product_lookup columns onto each transaction,
# matching rows on product_id.
# how='left' keeps every transaction row, as VLOOKUP does, leaving the
# looked-up columns null (Excel would show #N/A) when a product_id has no
# match; an inner join would silently drop those transactions.
vlookup_df = pd.merge(vlookup_tx_df, vlookup_prod_df, how='left', on='product_id')

In [None]:
# Display the merged (looked-up) result.
vlookup_df

### Pivot Tables

In [None]:
# Preview the transaction table that the pivot tables below summarise.
tx_df.head()

In [None]:
# One row per quarter, one column per customer_name; each cell is the summed
# order_total_amt for that combination.
# aggfunc='sum' (the string) selects pandas' built-in sum; passing np.sum
# emits a FutureWarning on pandas >= 2.1.
pd.pivot_table(tx_df,
               values='order_total_amt',
               index=['quarter'],
               columns=['customer_name'],
               aggfunc='sum')

In [None]:
# Same totals in long form: quarter and customer_name together form the row
# index, with a single order_total_amt column.
# aggfunc='sum' (the string) selects pandas' built-in sum; passing np.sum
# emits a FutureWarning on pandas >= 2.1.
pd.pivot_table(tx_df,
               values='order_total_amt',
               index=['quarter', 'customer_name'],
               aggfunc='sum')