# Pandas notes

In [None]:
from IPython.display import Image, SVG, display

# By default a notebook cell only renders the value of its last expression,
# so each figure is handed to display() explicitly to make both of them show.
display(Image(filename='Images/multtimeseries.png'))
display(SVG(filename='Images/multtimeseriesslice1.svg'))

# 1. Create untidy data with - pd.melt()
- melt = turn columns into rows

In [None]:
"""
params:
id_vars - columns to keep fixed, NOT melt
value_vars - columns to melt, default to melt all columns
var_name and value_name - rename columns
"""
# Method form of pd.melt(): the frame is the DataFrame the method is called on.
df.melt(
    id_vars='name',
    value_vars=['treatment a', 'treatment b'],
    var_name='treatment',
    value_name='result',
)

In [None]:
# Example: reshape the tidy airquality table into long (melted) form.
# The Ozone, Solar.R, Wind and Temp columns collapse into a single
# 'variable'/'value' pair of columns; Month and Day stay as they are.

# Wide table before melting
print(airquality.head())
#    Ozone  Solar.R  Wind  Temp  Month  Day
# 0   41.0    190.0   7.4    67      5    1
# 1   36.0    118.0   8.0    72      5    2
# 2   12.0    149.0  12.6    74      5    3
# 3   18.0    313.0  11.5    62      5    4
# 4    NaN      NaN  14.3    56      5    5

# Every column not listed in id_vars is melted
airquality_melt = airquality.melt(id_vars=['Month', 'Day'])

# Long table after melting
print(airquality_melt.head())
#    Month  Day variable  value
# 0      5    1    Ozone   41.0
# 1      5    2    Ozone   36.0
# 2      5    3    Ozone   12.0
# 3      5    4    Ozone   18.0
# 4      5    5    Ozone    NaN

In [None]:
# var_name / value_name replace the default 'variable' / 'value' column names
airquality_melt = airquality.melt(
    id_vars=['Month', 'Day'],
    var_name='measurement',
    value_name='reading',
)

# 2. Pivot - un-melt data with pivot()
- opposite of melting
- pivot = turn unique values into separate columns
- args:
    - index - columns you don't want to pivot
    - columns - columns you want to pivot
    - values - values to use

In [None]:
# Function form of DataFrame.pivot(): one output column per unique 'element'
weather_tidy = pd.pivot(
    weather,
    index='date',
    columns='element',
    values='value',
)

## 2.1 Using pivot_table

In [None]:
# Long (melted) table that is about to be pivoted
print(airquality_melt.head())
#        Month  Day measurement  reading
#     0      5    1       Ozone     41.0
#     1      5    2       Ozone     36.0
#     2      5    3       Ozone     12.0
#     3      5    4       Ozone     18.0
#     4      5    5       Ozone      NaN
# Un-melt: each distinct 'measurement' becomes a column of 'reading' values,
# with one row per (Month, Day) pair
airquality_pivot = pd.pivot_table(
    airquality_melt,
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
)

# Wide table after pivoting
print(airquality_pivot.head())
#     measurement  Ozone  Solar.R  Temp  Wind
#     Month Day                              
#     5     1       41.0    190.0  67.0   7.4
#           2       36.0    118.0  72.0   8.0
#           3       12.0    149.0  74.0  12.6
#           4       18.0    313.0  62.0  11.5
#           5        NaN      NaN  56.0  14.3

## 2.2 Resetting index of a dataframe (get rid of multiindex)

In [None]:
# Index before the reset (built from Month and Day by the pivot)
print(airquality_pivot.index)

# reset_index() moves the index levels back into ordinary columns;
# drop=False (the default) keeps them instead of discarding them
airquality_pivot_reset = airquality_pivot.reset_index(drop=False)

# Index after the reset, followed by the first rows
print(airquality_pivot_reset.index)
print(airquality_pivot_reset.head())

## 2.3 Using pivot when you have duplicate entries - pivot_table()
- has parameter to deal with duplicate values
- example: can aggregate the duplicate values by taking their average

In [None]:
# pivot_table with aggfunc: duplicate (date, element) entries are combined
# by averaging their values.
# The aggregation is named by the string 'mean'; recent pandas versions
# deprecate passing the np.mean callable for this.
weather2_tidy = weather.pivot_table(
    values='value',
    index='date',
    columns='element',
    aggfunc='mean',
)

In [None]:
# Pivot airquality_dup: duplicate (Month, Day, measurement) readings are
# averaged. The aggregation is named by the string 'mean'; recent pandas
# versions deprecate passing the np.mean callable for this.
airquality_pivot = airquality_dup.pivot_table(
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
    aggfunc='mean',
)

# Move Month and Day out of the index and back into regular columns
airquality_pivot = airquality_pivot.reset_index()

# Print the head of airquality_pivot
print(airquality_pivot.head())

# Print the head of the original airquality for comparison
print(airquality.head())


# 3. Melting and Parsing
- split one column whose values encode two pieces of information into two separate columns

In [None]:
# After melting, the 'variable' column holds sex and age together, e.g. m014.
# pd.melt() returns a new DataFrame, so the result must be kept in tb_melt
# for the lines below to have something to work on.
tb_melt = pd.melt(frame=tb, id_vars=['country', 'year'])
# create 'gender' column from the first character
tb_melt['gender'] = tb_melt['variable'].str[0]
# create 'age_group' column from the remaining characters
tb_melt['age_group'] = tb_melt['variable'].str[1:]

# 4. Splitting a column with .split() and .get()
- use when multiple variables are stored in columns with a delimiter, ie. '_'
    - ie. column name Deaths_Guinea

In [None]:
# Long form: one row per (Date, Day, original column name)
ebola_melt = ebola.melt(
    id_vars=['Date', 'Day'],
    var_name='type_country',
    value_name='counts',
)

# Split names such as 'Deaths_Guinea' on the underscore into a list
parts = ebola_melt['type_country'].str.split('_')
ebola_melt['str_split'] = parts

# First list element is the type, second is the country
ebola_melt['type'] = parts.str.get(0)
ebola_melt['country'] = parts.str.get(1)

# Inspect the result
print(ebola_melt.head())


# 5. Concatenating many files - glob() to find files based on pattern
Globbing
- pattern matching for file names
- Wildcards: `*` and `?`
    - `*` matches any string
        - example: any csv file: `*.csv`
    - `?` matches exactly one character
        - example: any single character: `file_?.csv`
- Returns a list of file names
- Use this list to load into separate DataFrames

Plan
- load files from globbing into pandas
- add dataframes into a list
- concatenate multiple datasets at once

In [None]:
import glob

# Collect every CSV file name in the current working directory
csv_files = glob.glob('*.csv')
print(csv_files)

# Read each file into its own DataFrame, then stack them into one
list_data = [pd.read_csv(filename) for filename in csv_files]
pd.concat(list_data)

In [None]:
# example: locate files by pattern and load one of them

import glob
import pandas as pd

# '*.csv' matches every CSV file in the current working directory
pattern = '*.csv'
csv_files = glob.glob(pattern)

# Show what was found
print(csv_files)

# Index 1 is the second match in the list
second_path = csv_files[1]
csv2 = pd.read_csv(second_path)

# Inspect the first rows of the loaded file
print(csv2.head())

In [None]:
# example: read every matched file and concatenate the results
# A comprehension builds the list without leaving loop temporaries behind:
# the original loop rebound the notebook-wide name `df` and used `csv`, which
# is also the name of a standard-library module.
frames = [pd.read_csv(path) for path in csv_files]

# Concatenate frames into a single DataFrame: uber
uber = pd.concat(frames)

# Print the shape of uber
print(uber.shape)

# Print the head of uber
print(uber.head())

# 6. Merge
3 types
- one to one
- many to one / one to many
- many to many

Use same function pd.merge()

## 6.1 merge: one to one

In [None]:
# Method form of pd.merge(): the key columns have different names on each
# side, so they are given with left_on / right_on rather than `on`
state_populations.merge(state_codes, left_on='state', right_on='name')

In [None]:
# o2o: join site to visited where site.name equals visited.site
o2o = site.merge(visited, left_on='name', right_on='site')

## 6.2 merge: many to one / one to many
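- one of the two DataFrames has duplicate values in the key column; the matching row from the other DataFrame is repeated for each duplicate
- contrast with many to many (6.3): there both DataFrames have non-unique keys, and for each duplicated key every pairwise combination is created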

In [None]:
# m2o: same call as the one-to-one case; the kind of merge is decided by
# the key values in the data, not by the arguments
m2o = site.merge(visited, left_on='name', right_on='site')

## 6.3 merge: many to many

In [None]:
# m2m: merge site with visited, then merge that result with survey
m2m = (
    site.merge(visited, left_on='name', right_on='site')
        .merge(survey, left_on='ident', right_on='taken')
)

# 7. Data types

In [None]:
# Inspect the dtype of every column
df.dtypes
# Convert a column to strings
df['column'].astype(str)
# Convert a column to the categorical dtype; pandas names it with the string
# 'category' (the bare name `categorical` is not defined anywhere)
df['column'].astype('category')

In [None]:
# Convert an object (string) column to a numeric dtype; errors='coerce'
# turns values that cannot be parsed into NaN instead of raising
pd.to_numeric(df['column a'], errors='coerce', downcast=None)

# 8. String manipulation with regex
- import re

Examples
- 17   \d*
- $17  \$\d*
- $17.00 \$\d*\.\d*
- Specify 2 digit decimal: $17.89
    - \$\d*\.\d{2}
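- Anchor the pattern with ^ (start of string) and $ (end of string) so the whole value must match
- $17.895 is rejected by ^\$\d*\.\d{2}$ (it has three decimals), while $17.89 matches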

## 8.1 Using regular expressions (aka regex)
- compile the pattern
- use the pattern to match values


In [None]:
# Example: regex
import re
# Raw string so the backslashes in \$, \d and \. reach the regex engine
# unchanged instead of being read as (invalid) string escapes
pattern = re.compile(r'\$\d*\.\d{2}')
# match() returns a match object on success and None otherwise
result = pattern.match('$17.89')
bool(result)

In [None]:
# Example: regex for phone number matching
# Import the regular expression module
import re

# Compile the pattern: prog
# Raw string so \d reaches the regex engine unchanged
prog = re.compile(r'\d{3}-\d{3}-\d{4}')

# match() anchors at the start of the string: three digits, then a dash
result = prog.match('123-456-7890')
print(bool(result))

# Four leading digits: the dash is not where the pattern expects it
result2 = prog.match('1123-456-7890')
print(bool(result2))

# output
# True
# False

In [None]:
# Example: extract numbers from strings
# Find numbers in string: 'the recipe calls for 10 strawberries and 1 banana'
# Import the regular expression module
import re

# Find the numeric values: matches
# \d+ is one or more digits; the raw string keeps the backslash intact.
# findall() returns every non-overlapping match as a list of strings.
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')

# Print the matches
print(matches)

In [None]:
# Examples: mix of regex patterns (raw strings keep the backslashes intact)
# First pattern: a phone number in the form ddd-ddd-dddd
pattern1 = bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890'))
print(pattern1)

# Second pattern: a dollar sign followed by three digits; match() only
# needs the start of the string to fit, so the trailing '.45' is ignored
pattern2 = bool(re.match(pattern=r'\$\d{3}', string='$123.45'))
print(pattern2)

# Third pattern: \d* also matches zero digits, so it succeeds with an empty
# match even though 'Australia' contains no digits
pattern3 = bool(re.match(pattern=r'\d*', string='Australia'))
print(pattern3)

## 8.2 Custom functions to clean data using regex

In [None]:
# Example: use a custom function to apply regex to create a new column
import re
# NumPy 2.0 removed the `NaN` alias; import `nan` under the old name so code
# that refers to `NaN` keeps working
from numpy import nan as NaN
# Raw string so \$, \d and \. reach the regex engine unchanged; ^ and $
# require the whole value to be a dollar amount with exactly two decimals
pattern = re.compile(r'^\$\d*\.\d{2}$')

# function
def diff_money(row, pattern):
    """Return 'Initial Cost' minus 'Total Est. Fee' for one row.

    Args:
        row: Mapping-like row (for example the row that
            ``DataFrame.apply(..., axis=1)`` passes in) providing the keys
            'Initial Cost' and 'Total Est. Fee'.
        pattern: Compiled regular expression that a valid money string
            must match.

    Returns:
        The difference as a float when both values are strings that match
        ``pattern``; NaN otherwise.
    """
    icost = row['Initial Cost']
    tef = row['Total Est. Fee']

    # Non-string cells (e.g. missing values stored as float NaN) would make
    # pattern.match() raise TypeError, so they count as invalid.
    values_ok = all(
        isinstance(value, str) and pattern.match(value)
        for value in (icost, tef)
    )
    if not values_ok:
        # Built with float() so the function does not depend on a NaN name
        # being imported elsewhere.
        return float('nan')

    # Strip the dollar sign, convert to float, and subtract.
    return float(icost.replace('$', '')) - float(tef.replace('$', ''))
    
# Run diff_money on every row; the compiled regex is forwarded as the
# function's second positional argument through `args`
df_subset['diff'] = df_subset.apply(diff_money, axis='columns', args=(pattern,))

### 8.2.1 Custom functions to clean data
You'll now practice writing functions to clean data.

The tips dataset has been pre-loaded into a DataFrame called tips. It has a 'sex' column that contains the values 'Male' or 'Female'. Your job is to write a function that will recode 'Male' to 1, 'Female' to 0, and return np.nan for all entries of 'sex' that are neither 'Male' nor 'Female'.

Recoding variables like this is a common data cleaning task. Functions provide a mechanism for you to abstract away complex bits of code as well as reuse code. This makes your code more readable and less error prone.

As Dan showed you in the videos, you can use the .apply() method to apply a function across entire rows or columns of DataFrames. However, note that each column of a DataFrame is a pandas Series. Functions can also be applied across Series. Here, you will apply your function over the 'sex' column.

In [None]:
# Define recode_sex()
def recode_sex(sex_value):
    """Recode a 'sex' entry as a number.

    'Male' becomes 1 and 'Female' becomes 0; any other value yields np.nan.
    """
    # Checked in order: 'Male' first, then 'Female'
    for label, code in (('Male', 1), ('Female', 0)):
        if sex_value == label:
            return code

    # Neither label matched
    return np.nan

# Recode each entry of the 'sex' column and store the result in a new column
tips['sex_recode'] = tips['sex'].apply(recode_sex)

# Show the first five rows, including the new column
print(tips.head())

### 8.2.2 Lambda functions

In [None]:
# Example: remove the $ sign from a column in two ways
# Method 1: use .replace(), Method 2: use regex

# Lambda using str.replace: delete every '$' character
tips['total_dollar_replace'] = tips.total_dollar.apply(
    lambda x: x.replace('$', ''))

# Lambda using a regular expression: keep the first "digits.digits" run.
# Raw string so \d and \. reach the regex engine unchanged.
tips['total_dollar_re'] = tips.total_dollar.apply(
    lambda x: re.findall(r'\d+\.\d+', x)[0])

# Print the head of tips
print(tips.head())


# 9. Duplicate data
- .drop_duplicates()

In [None]:
# Remove fully duplicated rows; keep='first' (the default) retains the
# first occurrence of each duplicated row
df = df.drop_duplicates(keep='first')

# 10. Missing data
Handling options:
- leave as is
- drop them
- fill missing value

Question
- is it random
- or is it a systemic problem

In [None]:
# Drop every row that contains at least one missing value
tips_dropped = tips_nan.dropna()
# Summarise the remaining rows (the original line was missing the '.'
# before info(), which is a syntax error)
tips_dropped.info()

In [None]:
# Replace missing values with .fillna(); the fill can be a provided value
# or a summary statistic
tips_nan['sex'] = tips_nan['sex'].fillna(value='missing')
# Several columns can be filled in one step
fill_cols = ['total_bill', 'size']
tips_nan[fill_cols] = tips_nan[fill_cols].fillna(value=0)

In [None]:
# Fill missing tips with a summary statistic: the mean of the column
mean_value = tips_nan['tip'].mean()
print(mean_value)
tips_nan['tip'] = tips_nan['tip'].fillna(value=mean_value)
# Summarise the frame after filling
tips_nan.info()

In [None]:
from IPython.display import display

# By default a notebook cell only renders the value of its last expression,
# so each figure is handed to display() explicitly to make both of them show.
display(Image(filename='Images/multtimeseries.png'))
display(SVG(filename='Images/multtimeseriesslice1.svg'))