Import pandas (and numpy) with:

In [None]:
import pandas as pd 
import numpy as np

## Pandas Series
A Pandas Series is like a single table column: a one-dimensional array whose values each carry an index label.

In [None]:
# Plain Python list that serves as the raw values
series = [3, 1, 7, 0]

# Wrap the list in a Series; with no index given, labels default to 0..3
pd_series = pd.Series(data=series)
print(pd_series)
# Printed output:
#   0    3
#   1    1
#   2    7
#   3    0
#   dtype: int64

# Lookup by label: pd_series[0] -> 3

0    3
1    1
2    7
3    0
dtype: int64


Create labels with the index argument:

In [None]:
# One label per value, in the same order as the values
my_index = ["A", "B", "C", "D"]

# Same values as before, now addressed by the custom labels
pd_series2 = pd.Series(data=series, index=my_index)
print(pd_series2)
# Printed output:
#   A    3
#   B    1
#   C    7
#   D    0
#   dtype: int64

# Lookup by label: pd_series2["A"] -> 3

Create a Pandas Series from a Python dictionary:

In [None]:
# Dictionary keys become the index labels, dictionary values become the data
cars = {
    "Audi": 4,
    "BMW": 3,
    "Toyota": 6,
}
pd_series3 = pd.Series(data=cars)
print(pd_series3)
# Printed output:
#   Audi      4
#   BMW       3
#   Toyota    6
#   dtype: int64

# Lookup by key: pd_series3["Audi"] -> 4

## Pandas Dataframes
Creating a random dataframe:

In [1]:
import pandas as pd
import numpy as np
from numpy.random import randn

# Row and column labels of the demo table
my_rows = ["A", "B", "C", "D"]
my_cols = ["Monday", "Tuesday", "Wednesday"]

# One standard-normal draw per cell: (number of rows, number of columns) = (4, 3)
my_data = randn(len(my_rows), len(my_cols))

# Assemble the dataframe from the values and both sets of labels
my_df = pd.DataFrame(data=my_data, index=my_rows, columns=my_cols)
my_df

Unnamed: 0,Monday,Tuesday,Wednesday
A,-0.289075,0.020548,1.067276
B,0.01486,-0.569779,0.141372
C,1.685028,0.695544,0.630808
D,1.141039,0.208812,-0.738063


Importing dataframes and some pandas functions:

In [None]:
# Template cell: every ... (and the _ in the dot-notation example) is a
# placeholder to be replaced with a real file path, column name or position.
my_df2 = pd.read_csv(...)

# Pull out a row (loc selects by index label):
my_df2.loc[0] # Series for the row labelled 0 (the 1st row with the default integer index)

# Pull out multiple rows
my_df2.loc[[0, 5]] # Dataframe containing the rows labelled 0 and 5 (1st and 6th rows with the default index)

# Grab the first 9 rows
my_df2.head(9) # If no parameter is given, head returns the first 5 rows

# Grab the last 9 rows
my_df2.tail(9) # If no parameter is given, tail returns the last 5 rows

# Print a summary of the dataframe (columns, non-null counts, dtypes, memory usage)
my_df2.info()

# Get the shape as a (number of rows, number of columns) tuple
my_df2.shape

# Get number of dimensions (2 for a dataframe)
my_df2.ndim

# Get the datatype of every column
my_df2.dtypes

# Get summary statistics about the data
my_df2.describe()
my_df2[...].describe() # Get statistics about a specific column

# Select specific column
my_df2[...]         # Using brackets (pass the column name)
my_df2._            # Using dot notation (replace _ with the column name)
my_df2.iloc[:, ...]    # Using position (all rows, column at the given integer position)

Count data in a dataframe:

In [None]:
# Template cell: every ... is a placeholder for a column name (or, in the
# second bracket of the last value_counts line, for a value of that column).

# Count how often each distinct value occurs in a column
my_df2[...].value_counts()                  # most frequent value first (descending counts)
my_df2[...].value_counts(ascending = True)  # least frequent value first (ascending counts)
my_df2[...].value_counts(dropna = False)    # also count NaN as a value
my_df2[...].value_counts(normalize = True)  # relative frequencies instead of raw counts
my_df2[...].value_counts()[...]             # count of one specific value: put the value in the second bracket

# Other ways to count per distinct value of the grouping column(s)
my_df2.groupby(...).size()                  # number of rows in each group
my_df2.groupby(...).count()                 # number of non-null values in each group, per remaining column

# Get the value counts of every column of the dataframe at once
# (pd.Series.value_counts replaces the top-level pd.value_counts, which is deprecated since pandas 2.1)
my_df2.apply(pd.Series.value_counts)

Add new columns to a dataframe:

In [None]:
# Template cell: every ... is a placeholder (list contents, column name, values).

# Add column from list:
gender = [...]
my_df2["Gender"] = gender # List length has to be the same as the number of rows
# Use np.nan as the list entry wherever a value is missing

# Add column at a chosen position with .insert() - modifies my_df2 itself
my_df2.insert(1, "...", ..., allow_duplicates = True)
# First parameter is the position of the new column (0 = first column)
# Second parameter is the column name
# Third parameter is the list
# allow_duplicates is whether a column whose name already exists may be inserted

# Add column with .assign() - creates a new dataframe, my_df2 stays unchanged
my_df3 = my_df2.assign(Gender = gender)
# assign only takes keyword arguments: the keyword is the column name, the value is the list
# (a column that already has that name is overwritten in the new dataframe)

Removing rows and columns from a dataframe:

In [None]:
# Remove column
my_df2.drop("Gender", axis = 1, inplace = True)
# Designate the column you want dropped (column name)
# axis = 1 for columns
# inplace = True changes my_df2 itself instead of returning a new dataframe

# Remove row
my_df2.drop(3, axis = 0, inplace = True)
# Designate the row you want dropped (index label of the row, here 3)
# axis = 0 for rows
# inplace = True changes my_df2 itself, so the row is deleted permanently

Grab Rows, Points and Subsets from a dataframe:

In [None]:
# Template cell: every ... is a placeholder for a label (loc) or an integer position (iloc).
my_df2.loc[...] # using loc (select by index label)
my_df2.iloc[...] # using iloc (select by integer position)
my_df2.loc[..., ...] # get point: row label first, column label second
# for subsets, you can pass lists of labels into loc, or lists of positions into iloc

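For conditional selections, you can pass a condition as the index into a pandas dataframe. A condition on the whole dataframe (e.g. `my_df2[my_df2 > 0]`) returns column(s) that display the original data if the condition is satisfied, and NaN if it isn't; a condition on a single column (e.g. `my_df2[my_df2["..."] > 0]`) returns only the rows for which the condition is satisfied.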

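Additional Note: combine conditions with & for and, and | for or, and wrap each condition in parentheses, e.g. `(cond1) & (cond2)`. Python's `and` / `or` keywords do not work element-wise on pandas objects.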

Changing and Resetting Indices:

In [None]:
# Template cell: "..." stands for a column name, [...] for the list of its values.
my_df2["..."] = [...] # create the column that is going to be used as the index
my_df2.set_index("...", inplace = True) # use that column as the new index, inplace = True for permanent change
my_df2.reset_index(inplace = True) # restore the default integer index (the old index becomes a regular column), inplace = True for permanent change

Dealing with Incomplete Data:

In [6]:
import pandas as pd
import numpy as np

# Dummy data: three rows, four columns; only column B has missing values
stuff = {
    "A": [1, 2, 3],
    "B": [4, np.nan, np.nan],
    "C": [7, 8, 9],
    "D": [10, 11, 12],
}
my_df = pd.DataFrame(data=stuff)
my_df

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,,8,11
2,3,,9,12


In [7]:
# Keep only the rows that have no null value in any column.
# The result is a new dataframe; pass inplace = True for a permanent change.
my_df.dropna(axis = 0)

Unnamed: 0,A,B,C,D
0,1,4.0,7,10


In [8]:
# Keep only the columns that have no null value in any row.
# The result is a new dataframe; pass inplace = True for a permanent change.
my_df.dropna(axis = "columns")

Unnamed: 0,A,C,D
0,1,7,10
1,2,8,11
2,3,9,12


In [None]:
# thresh is the minimum number of non-null values a column needs in order to be kept.
# Column B has one non-null value, so thresh = 1 keeps it; with thresh = 2, B is removed.
my_df.dropna(axis = 1, thresh = 1)

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,,8,11
2,3,,9,12


In [11]:
# Replace every null in the dataframe with the mean of column B
# (the mean is computed from B's non-null values only).
my_df.fillna(my_df["B"].mean())

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,4.0,8,11
2,3,4.0,9,12
