# Core Pandas

In [1]:
# series is a data structure that holds an array along with a named index
# this is what makes it so unique compared to a asimply numpy array

# the Panda series adds a label index to the array
# makes it much easier to grab data with meaningful labels

# the data is still numerically organized
# we can grab data with numerical or label index

In [2]:
import numpy as np
import pandas as pd

Creating panda series from lists

In [3]:
myindex = ['USA', 'Canada', 'Mexico']

In [4]:
mydata = [1776,1867,1821]

In [5]:
myser = pd.Series(data=mydata)

In [6]:
myser

0    1776
1    1867
2    1821
dtype: int64

In [7]:
type(myser)

pandas.core.series.Series

In [12]:
myser = pd.Series(data=mydata, index=myindex)
#as long as the args are passed in at the correct order, we don't need to add the labels
myser

USA       1776
Canada    1867
Mexico    1821
dtype: int64

In [13]:
myser[0] #access elements using the numeric index

1776

In [14]:
myser['USA'] #access elements using the labeled index

1776

Creating panda series from a dictionary

In [15]:
ages = {'Sam':5, 'Frank':10, 'Spike':7}

In [18]:
pd.Series(ages)
#pandas automatically makes the dic keys the label index and the value the data

Sam       5
Frank    10
Spike     7
dtype: int64

In [19]:
# Imaginary Sales Data for 1st and 2nd Quarters for Global Company
q1 = {'Japan': 80, 'China': 450, 'India': 200, 'USA': 250}
q2 = {'Brazil': 100,'China': 500, 'India': 210,'USA': 260}

In [20]:
sales_q1 = pd.Series(q1)
sales_q1

Japan     80
China    450
India    200
USA      250
dtype: int64

In [21]:
sales_q2 = pd.Series(q2)
sales_q2

Brazil    100
China     500
India     210
USA       260
dtype: int64

In [22]:
sales_q1['Japan']

80

In [23]:
sales_q1[0]

80

In [24]:
# how to see what our labeled index is
sales_q1.keys()

Index(['Japan', 'China', 'India', 'USA'], dtype='object')

In [27]:
[1,2] * 2 #operation doubles/dulpicates the list

[1, 2, 1, 2]

In [26]:
np.array([1,2]) * 2 #example of the oepration being broadcasted

array([2, 4])

In [28]:
sales_q1

Japan     80
China    450
India    200
USA      250
dtype: int64

In [29]:
sales_q2

Brazil    100
China     500
India     210
USA       260
dtype: int64

In [30]:
sales_q1 + sales_q2

Brazil      NaN
China     950.0
India     410.0
Japan       NaN
USA       510.0
dtype: float64

In [36]:
# using the pandas add method, we can deal with missing values
first_half = sales_q1.add(sales_q2, fill_value=0) #fill value is 0 b/c we didnt make any sales if data is missing

In [37]:
sales_q1.dtype #dtype is an attribute of panda series, not a method

dtype('int64')

In [38]:
first_half.dtype

dtype('float64')

# Pandas DataFrames

DataFrame is a table of rows and columns that we can easily filter/restructure

Formal Definition: group of Pandas Series objects that SHARE the same index
e.g. Index for countries USA, Canada, Mexico but a column for population, one for GDP, one for exports. Each of these columns are their own Series objects that we can group into one DataFrame since they share the same index

In [43]:
np.random.seed(101) # ensuring that we all have the same set of random numbers
mydata = np.random.randint(0,101,(4,3)) #nums [0,101) in 4x3 matrix
mydata

array([[95, 11, 81],
       [70, 63, 87],
       [75,  9, 77],
       [40,  4, 63]])

In [44]:
myindex = ['CA', 'NY', 'AZ', 'TX']

In [45]:
mycolumns = ['Jan', 'Feb', 'Mar']

In [46]:
# what happens if we only provide data and no col or index
# Pandas will auto generate an index for col and rows
df = pd.DataFrame(mydata)
df

Unnamed: 0,0,1,2
0,95,11,81
1,70,63,87
2,75,9,77
3,40,4,63


In [48]:
df = pd.DataFrame(data=mydata,index=myindex)
df

Unnamed: 0,0,1,2
CA,95,11,81
NY,70,63,87
AZ,75,9,77
TX,40,4,63


In [49]:
df = pd.DataFrame(mydata,myindex,mycolumns)
df

Unnamed: 0,Jan,Feb,Mar
CA,95,11,81
NY,70,63,87
AZ,75,9,77
TX,40,4,63


In [50]:
df.info() #gives us great info about the DataFrame

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to TX
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Jan     4 non-null      int32
 1   Feb     4 non-null      int32
 2   Mar     4 non-null      int32
dtypes: int32(3)
memory usage: 80.0+ bytes


Creating DataFrames from other files (csv, excel, sql, etc.)

In [54]:
# WHERE IS MY PYTHON CODE LOCATED?
#use pwd (Jupyter Notebook only, not python) to get the urrent working directory 

In [55]:
pwd

'C:\\Users\\iliaa\\Desktop\\Algorithmic Trading'

In [56]:
ls

 Volume in drive C is Windows
 Volume Serial Number is 9863-4AE9

 Directory of C:\Users\iliaa\Desktop\Algorithmic Trading

02/24/2023  02:28 PM    <DIR>          .
02/22/2023  07:38 PM    <DIR>          ..
02/09/2023  06:00 PM                66 .gitattributes
02/24/2023  01:19 PM    <DIR>          .ipynb_checkpoints
02/24/2023  01:17 PM            10,812 01-Python Crash Course Exercises-Copy1.ipynb
02/24/2023  01:17 PM            22,979 03-NumPy-Exercises-Copy1.ipynb
02/24/2023  02:28 PM            18,855 Core Pandas Notes04.ipynb
02/23/2023  11:56 AM            24,535 Notes02.ipynb
02/24/2023  12:53 PM            31,071 Notes03.ipynb
02/09/2023  06:00 PM                24 README.md
               7 File(s)        108,342 bytes
               3 Dir(s)  594,453,458,944 bytes free


In [60]:
ls

 Volume in drive C is Windows
 Volume Serial Number is 9863-4AE9

 Directory of C:\Users\iliaa\Desktop\Algorithmic Trading

02/24/2023  02:35 PM    <DIR>          .
02/22/2023  07:38 PM    <DIR>          ..
02/09/2023  06:00 PM                66 .gitattributes
02/24/2023  02:35 PM    <DIR>          .ipynb_checkpoints
02/24/2023  01:17 PM            10,812 01-Python Crash Course Exercises-Copy1.ipynb
02/24/2023  01:17 PM            22,979 03-NumPy-Exercises-Copy1.ipynb
02/24/2023  02:33 PM            18,676 constituents-Copy1.csv
02/24/2023  02:32 PM            73,270 constituents-financials-Copy1.csv
02/24/2023  02:33 PM                46 example-Copy1.csv
02/24/2023  02:33 PM             5,022 example-Copy1.xlsx
02/24/2023  02:33 PM        24,992,926 hotel_booking_data-Copy1.csv
02/24/2023  02:33 PM               171 movie_scores-Copy1.csv
02/24/2023  02:33 PM            17,727 mpg-Copy1.csv
02/24/2023  02:33 PM             5,022 my_excel_file-Copy1.xlsx
02/24/2023  02:33 PM          

In [67]:
df = pd.read_csv('tips.csv')

In [68]:
# providing the full file path
df2 = pd.read_csv("C:\\Users\\iliaa\\Desktop\\Algo Trading Course\\03-Core-Pandas\\tips.csv")

In [69]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17


Some helpful methods and attributes for DataFrames

In [71]:
df.columns #this is an attribute, NOT a method, no ()

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person', 'Payer Name', 'CC Number', 'Payment ID'],
      dtype='object')

In [72]:
df.index #another attribute, reports the index

RangeIndex(start=0, stop=244, step=1)

In [73]:
df.head() #method that displays the first few rows in the DataFrame

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [75]:
df.head(10) #can choose how many rows you want

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
5,25.29,4.71,Male,No,Sun,Dinner,4,6.32,Erik Smith,213140353657882,Sun9679
6,8.77,2.0,Male,No,Sun,Dinner,2,4.38,Kristopher Johnson,2223727524230344,Sun5985
7,26.88,3.12,Male,No,Sun,Dinner,4,6.72,Robert Buck,3514785077705092,Sun8157
8,15.04,1.96,Male,No,Sun,Dinner,2,7.52,Joseph Mcdonald,3522866365840377,Sun6820
9,14.78,3.23,Male,No,Sun,Dinner,2,7.39,Jerome Abbott,3532124519049786,Sun3775


In [76]:
df.tail() #method that shows the last 5 elements in the DataFrame

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.0,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.0,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672


In [77]:
df.info() #method that gives us valualbe info about the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   Payer Name        244 non-null    object 
 9   CC Number         244 non-null    int64  
 10  Payment ID        244 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 21.1+ KB


In [81]:
df.describe() # calculates basic descriptive statistics for each COLUMN

#sometimes these stats are not meaningful
# e.g. the mean of the credit card number, it's just a random sequence of nums
# that's stored as an int instead of a string

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.888197,2563496000000000.0
std,8.902412,1.383638,0.9511,2.914234,2369340000000000.0
min,3.07,1.0,1.0,2.88,60406790000.0
25%,13.3475,2.0,2.0,5.8,30407310000000.0
50%,17.795,2.9,2.0,7.255,3525318000000000.0
75%,24.1275,3.5625,3.0,9.39,4553675000000000.0
max,50.81,10.0,6.0,20.27,6596454000000000.0


For Better readability, we can use the transpose() method to switch the columns and rows

In [82]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.78594,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9510998,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.888197,2.914234,2.88,5.8,7.255,9.39,20.27
CC Number,244.0,2563496000000000.0,2369340000000000.0,60406790000.0,30407310000000.0,3525318000000000.0,4553675000000000.0,6596454000000000.0


## DataFrames, working with COLUMNS

In [83]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [86]:
df['total_bill']

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [88]:
type(df['total_bill']) # the type is panda series, makes sense b/c each column is it's own panda Series

pandas.core.series.Series

In [90]:
# How to get a list of columns
mycols=['total_bill','tip'] #assign a label to the list of columns 
df[mycols] #pass that into the DataFrame object

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [91]:
# We can do the above operation in 1 step
# NOTE: we need to pass in a LIST of columns
# df['total_bill','tip'] would not work b/c we're passing 2 strings separated by a comma
# the df doesnt take such param arguments

df[['total_bill','tip']]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


how to create a new column using 2 other columns

In [94]:
# calculating the tip percentage (assuming tip is part of the total bill)
100 * df['tip'] / df['total_bill']

0       5.944673
1      16.054159
2      16.658734
3      13.978041
4      14.680765
         ...    
239    20.392697
240     7.358352
241     8.822232
242     9.820426
243    15.974441
Length: 244, dtype: float64

In [95]:
#how do we create a new column
df['tip_percentage'] = 100 * df['tip'] / df['total_bill']

In [96]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,14.680765


In [97]:
# if you create a new column with the same name, Pandas will OVERRIDE the old data with the same name
df['price_per_person'] = df['total_bill'] / df['size']

In [101]:
# how to limit the number of decimals being displayed
df['price_per_person'] = np.round(df['total_bill'] / df['size'],4) #the 2 param arg means we want 4 decimal points

In [100]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,Christy Cunningham,3560325168603410,Sun2959,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230,Sun4608,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322,Sun4458,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221,Sun2251,14.680765


In [106]:
# how do we remove columns or rows
# we use the .drop() method
# df.drop(axis=0), will drop rows
# df.drop(axis=1), will drop columns
df.drop('tip_percentage',axis=1) #doesnt do it inplace unless we specify it as a param arg

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950,Sat17


In [108]:
df #the drop wasn't in place, we still have the tip_percentage column

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410,Sun2959,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230,Sun4608,16.054159
2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322,Sun4458,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994,Sun5260,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221,Sun2251,14.680765
...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842,Sat2657,20.392697
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404,Sat1766,7.358352
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196,Sat3880,8.822232
242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950,Sat17,9.820426


In [109]:
df = df.drop('tip_percentage',axis=1) #dropping the column and pointing it back to the original label 
# will achieve the same thing as using the inplace arg
# this is the recommended way
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950,Sat17


# DataFrame, working with ROWS

In [110]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221,Sun2251


In [117]:
df.index

Index(['Sun2959', 'Sun4608', 'Sun4458', 'Sun5260', 'Sun2251', 'Sun9679',
       'Sun5985', 'Sun8157', 'Sun6820', 'Sun3775',
       ...
       'Sat7220', 'Sat4615', 'Sat5032', 'Sat2929', 'Sat9777', 'Sat2657',
       'Sat1766', 'Sat3880', 'Sat17', 'Thur672'],
      dtype='object', name='Payment ID', length=244)

In [118]:
# how do we set our index to be one of our column values
# our index needs to be a unique identifier for most things (e.g. ML)
df.set_index('Payment ID') #the col 'Payment ID' is now row, and its col is removed

# df.set_index is not inplace, unless we reassign it such as below

KeyError: "None of ['Payment ID'] are in the columns"

In [119]:
df

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...
Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...
Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [122]:
# how to reset the index
df.reset_index() #takes no argument
# it will take whatever is the index and turn it into a col

Unnamed: 0,Payment ID,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
0,Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
1,Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
2,Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
3,Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
4,Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...,...
239,Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
240,Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
241,Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
242,Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [123]:
df

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...
Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [125]:
df = df.reset_index()
df

Unnamed: 0,index,Payment ID,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
0,0,Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
1,1,Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
2,2,Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
3,3,Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
4,4,Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...,...,...
239,239,Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
240,240,Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
241,241,Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
242,242,Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [126]:
df.set_index('Payment ID')

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun2959,0,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...,...
Sat2657,239,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
Sat1766,240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [127]:
df

Unnamed: 0,index,Payment ID,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
0,0,Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
1,1,Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
2,2,Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
3,3,Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
4,4,Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...,...,...
239,239,Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
240,240,Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
241,241,Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
242,242,Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [130]:
df = df.set_index("Payment ID")

KeyError: "None of ['Payment ID'] are in the columns"

In [131]:
df

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun2959,0,16.99,1.01,Female,No,Sun,Dinner,2,8.4950,Christy Cunningham,3560325168603410
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
...,...,...,...,...,...,...,...,...,...,...,...
Sat2657,239,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
Sat1766,240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [133]:
# how we grab a single row based on the numeric index
df.iloc[0] # returns a row based on the numeric index

index                                0
total_bill                       16.99
tip                               1.01
sex                             Female
smoker                              No
day                                Sun
time                            Dinner
size                                 2
price_per_person                 8.495
Payer Name          Christy Cunningham
CC Number             3560325168603410
Name: Sun2959, dtype: object

In [134]:
# how we grab a single row based on the labeled index
df.loc['Sun2959']

index                                0
total_bill                       16.99
tip                               1.01
sex                             Female
smoker                              No
day                                Sun
time                            Dinner
size                                 2
price_per_person                 8.495
Payer Name          Christy Cunningham
CC Number             3560325168603410
Name: Sun2959, dtype: object

In [136]:
# how we do grab multiple rows using numeric values
df.iloc[0:4] # we can use python list slicing notation

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun2959,0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,Christy Cunningham,3560325168603410
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.5,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994


In [137]:
# grabbing multiple rows using labeled index
df.loc[['Sun2959','Sun5260']] # pass in a LIST of labeled index to display

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun2959,0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,Christy Cunningham,3560325168603410
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994


In [140]:
# removing rows
# we use the same .drop() method with the param arg axis=0 to drop rows
df.drop('Sun2959',axis=0) #drops the first row
#this change is not in place, it is not permanent

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
Sun9679,5,25.29,4.71,Male,No,Sun,Dinner,4,6.3225,Erik Smith,213140353657882
...,...,...,...,...,...,...,...,...,...,...,...
Sat2657,239,29.03,5.92,Male,No,Sat,Dinner,3,9.6767,Michael Avila,5296068606052842
Sat1766,240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950


In [141]:
df.head()

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun2959,0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,Christy Cunningham,3560325168603410
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.5,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun2251,4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221


In [142]:
df = df.drop('Sun2959',axis=0)

In [143]:
df.head()

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.5,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun2251,4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
Sun9679,5,25.29,4.71,Male,No,Sun,Dinner,4,6.3225,Erik Smith,213140353657882


In [145]:
one_row = df.iloc[0]
one_row

index                              1
total_bill                     10.34
tip                             1.66
sex                             Male
smoker                            No
day                              Sun
time                          Dinner
size                               3
price_per_person              3.4467
Payer Name            Douglas Tucker
CC Number           4478071379779230
Name: Sun4608, dtype: object

In [147]:
# how to add a row to a DataFrame
df = df.append(one_row)

  df = df.append(one_row)


In [148]:
df

Unnamed: 0_level_0,index,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun4608,1,10.34,1.66,Male,No,Sun,Dinner,3,3.4467,Douglas Tucker,4478071379779230
Sun4458,2,21.01,3.50,Male,No,Sun,Dinner,3,7.0033,Travis Walters,6011812112971322
Sun5260,3,23.68,3.31,Male,No,Sun,Dinner,2,11.8400,Nathaniel Harris,4676137647685994
Sun2251,4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832732618637221
Sun9679,5,25.29,4.71,Male,No,Sun,Dinner,4,6.3225,Erik Smith,213140353657882
...,...,...,...,...,...,...,...,...,...,...,...
Sat1766,240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.5900,Monica Sanders,3506806155565404
Sat3880,241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.3350,Keith Wong,6011891618747196
Sat17,242,17.82,1.75,Male,No,Sat,Dinner,2,8.9100,Dennis Dixon,4375220550950
Thur672,243,18.78,3.00,Female,No,Thur,Dinner,2,9.3900,Michelle Hardin,3511451626698139
