# Kevin's Pandas' Crib Sheet

This is a consolidation notes and examples from:
> Coreys MSchafer's Pandas videos [here](https://www.youtube.com/playlist?list=PL-osiE80TeTsWmV9i9c58mdDCSskIFdDS) 

and 
> Hands on Data Analysis by xxx

Version 2.0W

## 0. Set-up 

In [39]:
import pandas as pd
import numpy as np
import datetime as dt
import pprint

In [40]:
people = {
    'first': ['Corey', 'Jane', 'Janey', 'John', 'Jimmy'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Doe', 'Doe'], 
    'email': ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JaneyDoe@email.com','JohnDoe@email.com', 'JimmyDoe@email.com']
}
print(f'{people=}')
# print(people)

people={'first': ['Corey', 'Jane', 'Janey', 'John', 'Jimmy'], 'last': ['Schafer', 'Doe', 'Doe', 'Doe', 'Doe'], 'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JaneyDoe@email.com', 'JohnDoe@email.com', 'JimmyDoe@email.com']}


## 1. Quick Overview of the Data 

In [41]:
df = pd.DataFrame(people)
# df.info()             # Overview of the dataframe
# df.columns            # List column names
# df.describe()           # Quick summart of the frame, best for wide format.


## 2. Indexes


In [None]:
# Set a new index and keep it set with inplace.  
# Indexes don't have to be unique (?!)
df.set_index('email', inplace=True)     # Set a column to be an index
df.reset_index(inplace=True)            # Reset row indexes to (hand to 'save'a column used a an index)
df.index

RangeIndex(start=0, stop=5, step=1)

## 3. Accessing Data 

In [44]:
# df                    # Simple access
# df['email']           # Access single column
# df[['last', 'email']]   # Access multiple columns by using a list (a list within the list)i
# df.iloc[[0, 1], 2]      # Access by integer reference / index by using .iloc.  .loc and iloc takes row index first
# df.loc[[0, 1], ['email', 'last']] # As above 


## X. Working 

In [49]:
# Access by text index name and column name using .loc
df.loc['CoreyMSchafer@gmail.com', 'last']

'Schafer'

In [51]:
# Lesson #4 Filtering

# 2 part process to filtering.  Set filter, then use filter
# But can't use 'filter' as a variable name it's reserved 
# Then apply with using .loc
filt = (df['last'] == 'Schafer') | (df['first'] == 'John') #exmple of an 'or' '|' filter
df.loc[filt, 'email']

0    CoreyMSchafer@gmail.com
3          JohnDoe@email.com
Name: email, dtype: object

In [52]:
# Or inverse of filter with .loc
df.loc[~filt, 'email']

1     JaneDoe@email.com
2    JaneyDoe@email.com
4    JimmyDoe@email.com
Name: email, dtype: object

In [53]:
# Lesson #05 Updating 

# Rename all columns 
df.columns = ['email', 'first_name', 'last_name']
df

Unnamed: 0,email,first_name,last_name
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JaneyDoe@email.com,Janey,Doe
3,JohnDoe@email.com,John,Doe
4,JimmyDoe@email.com,Jimmy,Doe


In [54]:
# Rename specific columns using .rename
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JaneyDoe@email.com,Janey,Doe
3,JohnDoe@email.com,John,Doe
4,JimmyDoe@email.com,Jimmy,Doe


In [55]:
# Rename all columns by an inline comprehension .columns
df.columns = [x.upper() for x in df.columns]
df
df.columns = [x.lower() for x in df.columns] #reset for later


In [56]:
# Update whole row with .loc
df.loc[3] = ['John2Smith@email.com', 'John2', 'Smith'] 
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JaneyDoe@email.com,Janey,Doe
3,John2Smith@email.com,John2,Smith
4,JimmyDoe@email.com,Jimmy,Doe


In [57]:
# update specific columns of a row with .loc
df.loc[2, ['last', 'email']] = ['Smith', 'janeysmith@email.com']
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,janeysmith@email.com,Janey,Smith
3,John2Smith@email.com,John2,Smith
4,JimmyDoe@email.com,Jimmy,Doe


In [58]:
# Update cells based on a filter with .loc
filt = (df['email'] == 'John2Smith@email.com')
# df[filt]['last'] = 'Smith'    # DON'T do this, it won't work
df.loc[filt, 'first'] = 'Johnny'  # THIS will
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,janeysmith@email.com,Janey,Smith
3,John2Smith@email.com,Johnny,Smith
4,JimmyDoe@email.com,Jimmy,Doe


In [59]:
# Update whole column with string object method with.str.x
df['email'] = df['email'].str.lower()
df

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,janeysmith@email.com,Janey,Smith
3,john2smith@email.com,Johnny,Smith
4,jimmydoe@email.com,Jimmy,Doe


In [60]:
# 4 Functions: apply, map, applymap & replace  
# 'Apply' a function to an object and get a series as a result
# Object can be a series (by defauly a column) 
# Object can be a dataframe in which case it's applied to each series (column) for a single result for each
# Use df['email'] = to apply to actual data frame 
df['email'].apply(len)

0    23
1    17
2    20
3    20
4    18
Name: email, dtype: int64

In [61]:
# 'Apply' your own function
# Use df['email'] = to apply to actual data frame 
def update_email(email):
    return email.upper()
df['email'].apply(update_email) 

0    COREYMSCHAFER@GMAIL.COM
1          JANEDOE@EMAIL.COM
2       JANEYSMITH@EMAIL.COM
3       JOHN2SMITH@EMAIL.COM
4         JIMMYDOE@EMAIL.COM
Name: email, dtype: object

In [62]:
# 'Apply' a your own inline (LAMBDA) function to a whole column and get a series as a result
# Use df['email'] = to apply to actual data frame f['email'].apply(lambda x: x.lower())
df['email'].apply(lambda x: x.lower())

0    coreymschafer@gmail.com
1          janedoe@email.com
2       janeysmith@email.com
3       john2smith@email.com
4         jimmydoe@email.com
Name: email, dtype: object

In [63]:
# When applied to a dataframe 'apply' is applied across each series
# len(df['email']) # email just an example
df.apply(len) #or df.apply(len, axis='columns') or df.apply(len, axis='rows')   

email    5
first    5
last     5
dtype: int64

In [64]:
# Returns the minimum (first in alaphs) in each column
df.apply(pd.Series.min)


email    coreymschafer@gmail.com
first                      Corey
last                         Doe
dtype: object

In [65]:
# Applying a Lambda function to each series
df.apply(lambda x: x.min())

email    coreymschafer@gmail.com
first                      Corey
last                         Doe
dtype: object

In [66]:
# applymap works element wise.  Only for dataframes, not series 
df.applymap(len)

Unnamed: 0,email,first,last
0,23,5,7
1,17,4,3
2,20,5,5
3,20,6,5
4,18,5,3


In [67]:
# Another example for strings 
df.applymap(str.lower)

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,corey,schafer
1,janedoe@email.com,jane,doe
2,janeysmith@email.com,janey,smith
3,john2smith@email.com,johnny,smith
4,jimmydoe@email.com,jimmy,doe


In [68]:
# .map only works on a series. Use like a vlookup
# Use it to subsitute one value for another via a lookup dictionary.
# Unsubtituted vales replaced by NaN
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
3      NaN
4      NaN
Name: first, dtype: object

In [69]:
# .replace works like map but leaves unsubsittuted values untouched (not NaN)
df['first'] = df['first'].replace({'Corey': 'Corey2', 'Jane': 'Jane2'})
df

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Corey2,Schafer
1,janedoe@email.com,Jane2,Doe
2,janeysmith@email.com,Janey,Smith
3,john2smith@email.com,Johnny,Smith
4,jimmydoe@email.com,Jimmy,Doe


In [70]:
# Lesson 6 Add & Remove Rows and Columns

# Creating a new column with strings, can use numeric as well with .apply 
# Can't use . notation as pandas would look for methd
df['full_name'] = df['first'] + ' ' + df['last']
df

Unnamed: 0,email,first,last,full_name
0,coreymschafer@gmail.com,Corey2,Schafer,Corey2 Schafer
1,janedoe@email.com,Jane2,Doe,Jane2 Doe
2,janeysmith@email.com,Janey,Smith,Janey Smith
3,john2smith@email.com,Johnny,Smith,Johnny Smith
4,jimmydoe@email.com,Jimmy,Doe,Jimmy Doe


In [71]:
# Remove columns with .drop like a db
df.drop(columns=['first', 'last'], inplace=True)
df

Unnamed: 0,email,full_name
0,coreymschafer@gmail.com,Corey2 Schafer
1,janedoe@email.com,Jane2 Doe
2,janeysmith@email.com,Janey Smith
3,john2smith@email.com,Johnny Smith
4,jimmydoe@email.com,Jimmy Doe


In [72]:
# Split data with str.split 
# splits on space by default so not needed
df['full_name'].str.split(' ', expand=True)
# would give list by default, need expand=True to make 2 new columns in dataframe

Unnamed: 0,0,1
0,Corey2,Schafer
1,Jane2,Doe
2,Janey,Smith
3,Johnny,Smith
4,Jimmy,Doe


In [73]:
# Create multiple columns at once 
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe


In [74]:
# Adding a single row with .append
# df.append({'first': 'Tony'}, ignore_index=True)
# insert new row even if no index given: ignore_index=True

# Above from video now deprecated, so my method below:
df2 = pd.DataFrame({'first': ['Tony']})
pd.concat([df, df2])

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe
0,,,Tony,


In [75]:
# Setup for adding a whole new dataframe
people = {
    'first': ['Tony', 'Steve'], 
    'last': ['Stark', 'Rogers'], 
    'email': ['IronMan@avenge.com', 'Cap@avenge.com']
}
df2 = pd.DataFrame(people)
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [76]:
# Adding a whole new dataframe
df = df.append(df2, ignore_index=True, sort=False)
df

  df = df.append(df2, ignore_index=True, sort=False)


Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe
5,IronMan@avenge.com,,Tony,Stark
6,Cap@avenge.com,,Steve,Rogers


In [77]:
# Deleteing a row with .drop
df.drop(index=6, inplace=True)
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe
5,IronMan@avenge.com,,Tony,Stark


In [78]:
# Deleting rows based on values 
filt = df['last'] == 'Stark'
df.drop(index=df[filt].index, inplace=True)
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe


In [79]:
#Lesson 7 Sorting Data


In [80]:
# Sort a dataframe by a single column with sort_values
df.sort_values(by='last', ascending=False)

Unnamed: 0,email,full_name,first,last
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe


In [81]:
# Sort a dataframe by a multiple columnsin a list with .sort_values
df.sort_values(by=['last', 'first'], ascending=False)

Unnamed: 0,email,full_name,first,last
3,john2smith@email.com,Johnny Smith,Johnny,Smith
2,janeysmith@email.com,Janey Smith,Janey,Smith
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe
1,janedoe@email.com,Jane2 Doe,Jane2,Doe


In [82]:
# Sort a dataframe by a multiple columns in a list with .sort_values 
# and different asending attrbutes from a list and make perm with inpace 
df.sort_values(by=['last', 'first'], ascending=[False, True], inplace=True)
df

Unnamed: 0,email,full_name,first,last
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe


In [83]:
# Reset the order based on teh "original" index with .sort_index
df.sort_index()

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer
1,janedoe@email.com,Jane2 Doe,Jane2,Doe
2,janeysmith@email.com,Janey Smith,Janey,Smith
3,john2smith@email.com,Johnny Smith,Johnny,Smith
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe


In [84]:
# Sort a series (column) with .sort_values 
df['last'].sort_values()

1        Doe
4        Doe
0    Schafer
2      Smith
3      Smith
Name: last, dtype: object

In [85]:
# Lessson 8 Aggregates
# Corey uses large data set, I'm just adding extra numeric colums to the existing one.
df['numeric_data_01'] = np.random.randint(0,100, size=len(df))
df['numeric_data_02'] = np.random.randint(0,100, size=len(df))
df


Unnamed: 0,email,full_name,first,last,numeric_data_01,numeric_data_02
2,janeysmith@email.com,Janey Smith,Janey,Smith,30,70
3,john2smith@email.com,Johnny Smith,Johnny,Smith,80,61
0,coreymschafer@gmail.com,Corey2 Schafer,Corey2,Schafer,33,83
1,janedoe@email.com,Jane2 Doe,Jane2,Doe,44,21
4,jimmydoe@email.com,Jimmy Doe,Jimmy,Doe,47,52


In [86]:
# Use aggregation functuins, such as mean, mode, standard deviation etc on a simgle column
df[['numeric_data_01', 'numeric_data_02']].median()

numeric_data_01    44.0
numeric_data_02    61.0
dtype: float64

In [87]:
# Or get anb overview of all numeric columns with .describe
df.describe()

Unnamed: 0,numeric_data_01,numeric_data_02
count,5.0,5.0
mean,46.8,57.4
std,19.89221,23.351659
min,30.0,21.0
25%,33.0,52.0
50%,44.0,61.0
75%,47.0,70.0
max,80.0,83.0


In [88]:
# Count the number of populated fields in a column with .count
df['last'].count()

5

In [89]:
# Count the number of eachvalue with .value_counts 
df['last'].value_counts()

Smith      2
Doe        2
Schafer    1
Name: last, dtype: int64

In [90]:
# Or to get a percentage use the normalise=True attribute
df['last'].value_counts(normalize=True)*100

Smith      40.0
Doe        40.0
Schafer    20.0
Name: last, dtype: float64

In [93]:
# Groups
# Create a group in a similar way as we created a filter, but with .groupby([column_name])
# This gives you a group object, indexed by the group rather than true / galse list of a filter
grp_last = df.groupby(['last'])
grp_last.groups                # KT added to see groups and indexes

{'Doe': [1, 4], 'Schafer': [0], 'Smith': [2, 3]}

In [None]:
# Then apply methods to the group in a 2nd step, e.g., .get_group 
grp_last.get_group('Smith')

In [None]:
# Apply a function (.value_counts) to a column after already being grouped
# Can filter furtther with .loc makes it loke usiong a filter
# Can also get percentage like above with (normalize=True)*100
grp_last['first'].value_counts() #.loc['Smith']

In [None]:
# Can retrive multiple columns and perform other aggregate functions with their methods 
grp_last[['numeric_data_01', 'numeric_data_02']].median() #.loc[['Smith' , 'Doe']]


In [None]:
# *** Or use more generic form to apply multiple aggregated functions with .agg ***
# Seems most generic to me!!!
grp_last[['numeric_data_01', 'numeric_data_02']].agg(['count', 'mean', 'std']) #.loc[['Smith' , 'Doe']]

In [None]:
# Counting rows with filter.  Counts true's in the returned series with .sum
filt = df['last'] == 'Doe'
df.loc[filt]['first'].str.contains('Jane').sum()

In [None]:
# But fora group need to .apply the function to all the group's series 
grp_last['first'].apply(lambda x: x.str.contains('n').sum())

In [None]:
# How to find the percentage with an n in their first name and group by surname

# Create a series of the number of people with each surname
surname_count = df['last'].value_counts()
surname_count

# Create a series of people with each surname, with 'n' in first name
surname_count_with_n = grp_last['first'].apply(lambda x: x.str.contains('n').sum())
surname_count_with_n

# Merge the 2 series togther, add and calculate the percentage (answer column) and tidy up column names
df_with_n = pd.concat([surname_count, surname_count_with_n], axis='columns', sort=False)
df_with_n['percentage'] = df_with_n['first']/df_with_n['last']*100
df_with_n.rename(columns={'first': 'First_with_an_n', 'last': 'Surname'}, inplace=True)
df_with_n.sort_values('percentage', ascending=False)
# df_with_n.loc['Smith']

In [None]:
# Lesson 9 Cleaning Data

# Set-up some dirty data  
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}
df = pd.DataFrame(people)
df
# GOOD IDEA look for unique values in columns to see if you're likely to get problems 
# for i in df.columns:
#     print(f'\n{i}')
#     print(df[i].unique())


In [None]:
# Identify na values (by getting a mask) rather than drop them with .isna
df.isna()
# or
df.isna().sum()

In [None]:
# Cleaning. Replace unusual nill values across whole data frame
# Could do all this at import time for csv pd.read_csv(XXXXX..., na_values=['NA','Missing'])
df.replace('NA', np.nan, inplace=True)
df.replace('Missing', np.nan, inplace=True)
df

In [None]:
# Cleaning. Replaces NaN values with an actual value.  Most usful for NUMERIC data
df.fillna(0)

In [None]:
# Drop any / all rows that aren't totally complete with .dropna & how = 'any'
# default values are: df.dropna(axis='index', how='any')
df.dropna()

In [None]:
# Drop incomplete columns.  Which is all of them due to row 4
df.dropna(axis='columns')

In [None]:
# Drop rows that have missing data in either ('any') specified rows with how='' & subset=[]
df.dropna(axis='index', how='any', subset=['last', 'email'])

In [None]:
# Identify if data type is correct.  If numeric are wrong many aggrate functions won't work 
df.dtypes

In [None]:
# Cleaning. Casting a column to the correct data type with .astype
# Can use .astype on whole dataframe too.
# Use float not int, as NaN is a float.
df['age'] = df['age'].astype(float)
df.dtypes
# df['age'].mean()

In [None]:
# Lesson 10 - Date Time Series
# Can use ,format= if .to_datetime doesn't auto recognise the date / time format
df = pd.read_csv('time_series.csv')
df['Date']=pd.to_datetime(df['Date'], format='%Y-%m-%d %I-%p')
df.dtypes

In [None]:
# To find day name for single cell with .day_name() method
df.loc[0, 'Date'].day_name()

In [None]:
# New column comtaining day name with .dt.day_name()
df['DayOfWeek'] = df['Date'].dt.day_name()
df

In [None]:
# Some date functions
print(df['Date'].min())
print(df['Date'].max())
print(df['Date'].max() - df['Date'].min()) # Known as time delta

In [None]:
# Filtering on date range in str converted to a datetime with .to_datetime
filt = (df['Date'] >= pd.to_datetime('2019-01-01')) & (df['Date'] < pd.to_datetime('2020-01-01'))
df.loc[filt]

In [None]:
# Setting date column as an index for later functions
df.set_index('Date', inplace=True)

In [None]:
# Single value slice on index with .loc
df.loc['2019']

In [None]:
# Slice on index with .loc and for range :
df.loc['2020-01':'2020-02']

In [None]:
# Get an aggregate value (eg mean or max) of a column sliced by date 
print(  df.loc['2020-01':'2020-02']['Close'].mean() )
print(  df.loc['2020-01-01']['High'].max()  )

In [None]:
# Resample (downsample) a range using 'D' for day and .resample
highs = df['High'].resample('D').max()
highs

In [None]:
# Quick line plot with mathplot & a Magic command needed for Jupyter notebook
%matplotlib inline 
highs.plot()

In [None]:
# Resample whole dataframe with single aggregation method
df.resample('W').mean()

In [None]:
# Resample whole dataframe with diferent aggregations with a map & .agg method
df.resample('W').agg({'Close': 'mean', 'High': 'max', 'Low': 'min', 'Volume': 'sum'})

In [None]:
# Lesson 11: Reading and Writing to Sources

In [None]:
# Working with csv's 
df = pd.read_csv('time_series.csv', index_col='Date') # Load in the csv

filt = (df['Volume'] > 1_000_000)                           # Do some processing
df_big_trade_days =  df.loc[filt]                           # Do some processing
# 
df_big_trade_days.to_csv('output.csv')                      # Save as csv 
df_big_trade_days.to_csv('output.tsv', sep='\t')            # Save as csv with tab seperators 

In [None]:
# Working with Exel with .to_excel and read_excel
# conda install xlwt openpyxl xlrd 
df_big_trade_days.to_excel('output.xlsx')                   # Saving a dataframe to Excel.  Can use sheet arg & row & column 
df_excel = pd.read_excel('output.xlsx' , index_col='Date')  # Loading in from Excel
df_excel

In [None]:
# Working with json with .to_json and read_json
df_big_trade_days.to_json('output.json', orient='records', lines=True)
# Make records /list like rather than dictionary like with: orient='records' 
# Make each record a new line with lines=True' 
df_json = pd.read_json('output.json', orient='records', lines=True)
df_json

In [None]:
# Working with SQL
# Set up database 
# from sqlalchemy import create_engine
# import psycopg2
# engine = create_engine('postgresql://dbuser:dbpass@localhost:5432/sample_db')
# df.to_sql('sample_table', engine, if_exists='replace')
# sql_df = pd.read_sql('sample_table', engine, index_col='Respondent')
# sql_df = pd.read_sql_query('SELECT * FROM sample_table', engine, index_col='Respondent')
# # sql_df.head()


In [None]:
# Can read directly from a URL
posts_df = pd.read_json('https://raw.githubusercontent.com/CoreyMSchafer/code_snippets/master/Python/Flask_Blog/snippets/posts.json')
posts_df.head()

In [None]:
posts_df.head()