# Pandas notes

In [None]:
from IPython.display import Image, SVG, display

# A notebook cell only auto-renders its final expression, so each figure is
# passed to display() explicitly; otherwise the PNG would never be shown.
display(Image(filename='Images/multtimeseries.png'))
display(SVG(filename='Images/multtimeseriesslice1.svg'))

# 1. Create untidy data with pd.melt()
- melt = turn columns into rows

In [None]:
"""
params:
id_vars - columns to keep fixed, NOT melt
value_vars - columns to melt, default to melt all columns
var_name and value_name - rename columns
"""
pd.melt(frame=df, id_vars='name', value_vars=['treatment a', 'treatment b'],
       var_name='treatment', value_name='result')

In [None]:
# Example: melting tidy data into untidy form
#
# Month and Day are kept as identifier columns; the Ozone, Solar.R, Wind and
# Temp columns are collapsed into a single variable/value pair of columns.

# Input: first rows of airquality
print(airquality.head())
#    Ozone  Solar.R  Wind  Temp  Month  Day
# 0   41.0    190.0   7.4    67      5    1
# 1   36.0    118.0   8.0    72      5    2
# 2   12.0    149.0  12.6    74      5    3
# 3   18.0    313.0  11.5    62      5    4
# 4    NaN      NaN  14.3    56      5    5

# Melt airquality: airquality_melt
airquality_melt = pd.melt(
    frame=airquality,
    id_vars=['Month', 'Day'],
)

# Output: first rows of airquality_melt
print(airquality_melt.head())
#    Month  Day variable  value
# 0      5    1    Ozone   41.0
# 1      5    2    Ozone   36.0
# 2      5    3    Ozone   12.0
# 3      5    4    Ozone   18.0
# 4      5    5    Ozone    NaN

In [None]:
# Name the two melted columns explicitly with var_name and value_name
# (instead of the default 'variable' / 'value' headers)
# Melt airquality: airquality_melt
airquality_melt = pd.melt(
    frame=airquality,
    id_vars=['Month', 'Day'],
    var_name='measurement',
    value_name='reading',
)

# 2. Pivot - un-melt data with pivot()
- opposite of melting
- pivot = turn unique values into separate columns
- args:
    - index - columns you don't want to pivot
    - columns - columns you want to pivot
    - values - values to use

In [None]:
# One row per 'date'; each distinct 'element' becomes its own column,
# filled from the 'value' column.
weather_tidy = weather.pivot(
    index='date',
    columns='element',
    values='value',
)

## 2.1 Using pivot_table

In [None]:
# Input: first rows of the melted frame
print(airquality_melt.head())
#        Month  Day measurement  reading
#     0      5    1       Ozone     41.0
#     1      5    2       Ozone     36.0
#     2      5    3       Ozone     12.0
#     3      5    4       Ozone     18.0
#     4      5    5       Ozone      NaN

# Pivot airquality_melt: airquality_pivot
# Month/Day form the row index; each distinct 'measurement' becomes a column
# populated from 'reading'.
airquality_pivot = airquality_melt.pivot_table(
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
)

# Output: first rows of the pivoted frame
print(airquality_pivot.head())
#     measurement  Ozone  Solar.R  Temp  Wind
#     Month Day
#     5     1       41.0    190.0  67.0   7.4
#           2       36.0    118.0  72.0   8.0
#           3       12.0    149.0  74.0  12.6
#           4       18.0    313.0  62.0  11.5
#           5        NaN      NaN  56.0  14.3

## 2.2 Resetting index of a dataframe (get rid of multiindex)

In [None]:
# reset_index() returns a new frame and leaves airquality_pivot untouched,
# so the copy can be built up front and the two indexes compared afterwards.
airquality_pivot_reset = airquality_pivot.reset_index()

# Index before the reset
print(airquality_pivot.index)

# Index after the reset
print(airquality_pivot_reset.index)

# First rows of the reset frame
print(airquality_pivot_reset.head())

## 2.3 Using pivot when you have duplicate entries - pivot_table()
- has parameter to deal with duplicate values
- example: can aggregate the duplicate values by taking their average

In [None]:
# pivot_table with aggfunc: duplicate (date, element) entries are combined
# by averaging their 'value'.
# The 'mean' string alias is used instead of np.mean: it needs no NumPy
# import (none is visible in these notes) and avoids the FutureWarning that
# recent pandas versions emit when a NumPy callable is passed as aggfunc.
weather2_tidy = weather.pivot_table(
    values='value',
    index='date',
    columns='element',
    aggfunc='mean',
)

In [None]:
# Pivot airquality_dup: airquality_pivot
# Duplicate (Month, Day, measurement) rows are collapsed to their mean.
# The 'mean' string alias is used instead of np.mean: it needs no NumPy
# import (none is visible in these notes) and avoids the FutureWarning that
# recent pandas versions emit when a NumPy callable is passed as aggfunc.
airquality_pivot = airquality_dup.pivot_table(
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
    aggfunc='mean',
)

# Reset the index of airquality_pivot so Month and Day become ordinary columns
airquality_pivot = airquality_pivot.reset_index()

# Print the head of airquality_pivot
print(airquality_pivot.head())

# Print the head of airquality for comparison
print(airquality.head())


# 3. Melting and Parsing
- split one column whose values encode two pieces of information (e.g. sex and age group) into two separate columns

In [None]:
# After melting, the 'variable' column holds codes that combine sex and age
# group, e.g. 'm014'.
# The result must be assigned: the lines below read tb_melt, which would
# otherwise be undefined.
tb_melt = pd.melt(frame=tb, id_vars=['country', 'year'])
# create 'gender' column from the first character of the code
tb_melt['gender'] = tb_melt['variable'].str[0]
# create 'age_group' column from the remaining characters
tb_melt['age_group'] = tb_melt['variable'].str[1:]

# 4. Splitting a column with .split() and .get()
- use when multiple variables are stored in columns with a delimiter, ie. '_'
    - ie. column name Deaths_Guinea

In [None]:
# Melt ebola: ebola_melt
# Date and Day stay fixed; every other column name goes into 'type_country'
# and its value into 'counts'.
ebola_melt = pd.melt(
    ebola,
    id_vars=['Date', 'Day'],
    var_name='type_country',
    value_name='counts',
)

# 'str_split': list of the pieces of type_country around '_'
ebola_melt['str_split'] = ebola_melt['type_country'].str.split('_')

# 'type': first piece of the split
ebola_melt['type'] = ebola_melt['str_split'].str.get(0)

# 'country': second piece of the split
ebola_melt['country'] = ebola_melt['str_split'].str.get(1)

# Show the first rows of the result
print(ebola_melt.head())


# 5. Concatenating many files - glob() to find files based on pattern
Globbing
- pattern matching for file names
- Wildcards: `*` and `?`
    - `*` matches any string
        - example: any csv file: `*.csv`
    - `?` matches exactly one character
        - example: any single character: `file_?.csv`
- Returns a list of file names
- Use this list to load into separate DataFrames

Plan
- load files from globbing into pandas
- add dataframes into a list
- concatenate multiple datasets at once

In [None]:
import glob

# Find every CSV in the current directory.
# glob.glob() returns matches in arbitrary order, so sort them to make the
# row order of the concatenated result reproducible.
csv_files = sorted(glob.glob('*.csv'))
print(csv_files)

# Load each matching file into its own DataFrame
list_data = [pd.read_csv(filename) for filename in csv_files]

# Stack all frames row-wise (pd.concat raises ValueError if nothing matched)
pd.concat(list_data)

In [None]:
# example: Find files that match a pattern

# Import necessary modules
import glob
import pandas as pd

# Write the pattern: pattern
pattern = '*.csv'

# Save all file matches: csv_files
# glob.glob() returns matches in arbitrary order, so sort them so that
# "the second file" below is the same file on every run.
csv_files = sorted(glob.glob(pattern))

# Print the file names
print(csv_files)

# Load the second file into a DataFrame: csv2
# (raises IndexError when fewer than two files match the pattern)
csv2 = pd.read_csv(csv_files[1])

# Print the head of csv2
print(csv2.head())

In [None]:
# example: iterate and concatenate df for all matches

# Read every file in csv_files into a DataFrame: frames
# (a comprehension replaces the manual append loop, and the loop variable no
# longer reuses the name of the standard-library csv module)
frames = [pd.read_csv(path) for path in csv_files]

# Concatenate frames into a single DataFrame: uber
uber = pd.concat(frames)

# Print the shape of uber
print(uber.shape)

# Print the head of uber
print(uber.head())

# 6. Merge
3 types
- one to one
- many to one / one to many
- many to many

Use same function pd.merge()

## 6.1 merge: one to one

In [None]:
# The key column is named differently on each side ('state' vs 'name'),
# so left_on / right_on are given and `on` is left at its default of None.
pd.merge(
    left=state_populations,
    right=state_codes,
    left_on='state',
    right_on='name',
)

In [None]:
# Merge the DataFrames: o2o
# Join site to visited, matching site['name'] against visited['site']
o2o = pd.merge(
    left=site,
    right=visited,
    left_on='name',
    right_on='site',
)

## 6.2 merge: many to one / one to many
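- one DataFrame has duplicate values in the merge key, while the other has unique keys
- each row on the unique-key side is repeated for every matching row on the duplicated side
- if both DataFrames have duplicate keys (the many-to-many case), every pairwise combination of the matching rows is created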

In [None]:
# Merge the DataFrames: m2o
# Same call as a one-to-one merge: site['name'] is matched against
# visited['site']
m2o = pd.merge(
    left=site,
    right=visited,
    left_on='name',
    right_on='site',
)

## 6.3 merge: many to many

In [None]:
# Merge site and visited: m2m
m2m = pd.merge(left=site, right=visited, left_on='name', right_on='site')

# Merge m2m and survey: m2m
m2m = pd.merge(left=m2m, right=survey, left_on='ident', right_on='taken')

In [None]:
# Imported here so this cell runs on its own.
from IPython.display import Image, SVG, display

# A notebook cell only auto-renders its final expression, so each figure is
# passed to display() explicitly; otherwise the PNG would never be shown.
display(Image(filename='Images/multtimeseries.png'))
display(SVG(filename='Images/multtimeseriesslice1.svg'))

### 

### 

### 

### 

### 

### 

### 

### 

# 2. 

### 

### 

### 

### 

### 

### 

### 

### 

### 

### 

# 3. 

### 

### 

### 

### 

### 

### 

### 

### 

### 

### 

# 4. 

### 

### 

### 

### 

### 

### 

### 

### 

### 

### 