In [1]:
import numpy as np
import pandas as pd

## Reading Files, Selecting Columns and Summarizing

In [None]:
# Load a CSV into a dataframe -- the source can be a local path or a URL.
# (Both lines assign to df, so the second read replaces the first.)
df = pd.read_csv("local_path/file.csv")
df = pd.read_csv("https://file_path/file.csv")

In [None]:
# Load a delimited text file: `sep` sets the field separator and
# `index_col` names the column to use as the row index.
df = pd.read_table(
    "https://file_path/file",
    sep="|",
    index_col="column_x",
)

In [None]:
# Examine the df data
# NOTE: in a notebook only the last expression of a cell is displayed; run these one at a time.
df                                        #display the dataframe (with default display options a long frame is truncated to its first 5 and last 5 rows)
type(df)                                  #type of the object (a pandas DataFrame)
df.head()                                 #show the first 5 rows
df.head(10)                               #show the first 10 rows
df.tail()                                 #show the last 5 rows
df.tail(10)                               #show the last 10 rows
df.index                                  #show the row index
df.columns                                #show all column names
df.dtypes                                 #show the data type of each column
df.shape                                  #show the shape of the dataframe as (number of rows, number of columns)
df.to_numpy()                             #all values of the dataframe as a numpy array (recommended replacement for df.values)

In [None]:
# Pull a single column out of the dataframe
df["column_x"]                            # bracket notation
type(df["column_x"])                      # type of the selected column object (e.g. Series)
df.column_x                               # same column via attribute notation

In [None]:
# Summary statistics for a whole dataframe
df.describe()                             # numeric columns (the default)
df.describe(include=["object"])           # object columns only
df.describe(include="all")                # every column

In [None]:
# Summary statistics for a single column (a Series)
df["column_x"].describe()                 # full summary of one column
df["column_x"].mean()                     # just the mean of that column
df.column_x.mean()                        # same mean, using attribute access

In [None]:
# Tally how often each distinct value occurs in a column.
# Most useful for categorical variables, but it also works on numeric columns.
df["column_x"].value_counts()

In [None]:
# Keep the rows where column_x equals a given value and show column_y for those rows
df.loc[df["column_x"] == "string_value", "column_y"]
df.loc[df["column_x"] == 1, "column_y"]

In [None]:
# Number of rows in the dataframe (same value as df.shape[0])
len(df)
# Number of columns in the dataframe (same value as df.shape[1])
len(df.columns)

In [None]:
# Display the 3 most frequent occurrences of a column's values.
# value_counts() orders its result by descending count, so the first three
# entries are the most frequent. head(3) always selects by position, whereas
# slicing with [0:3] is interpreted by label when the index holds floats.
df.column_x.value_counts().head(3)

## Filtering and Sorting

In [None]:
# Boolean filtering: only show rows of df where column_x < 20
filter_bool = df.column_x < 20                        #create a Series of booleans from the condition column_x < 20...
df[filter_bool]                                       #...and use that Series to filter rows
df[filter_bool].describe()                            #describe the dataframe filtered by filter_bool
df[df.column_x < 20]                                  #or, combine into a single step
df[df.column_x < 20].column_y                         #select one column from the rows filtered on column_x
df[df['column_x'] < 20].column_y                      #alternative method using bracket notation
df[df.column_x < 20].column_y.value_counts()          #value_counts of the resulting Series; mean() can be used instead of value_counts()

In [None]:
# Combine several boolean conditions; each condition is wrapped in parentheses
df.loc[(df["column_x"] < 20) & (df["column_y"] == "string")]    # & is the AND operator
df.loc[(df["column_x"] < 20) | (df["column_y"] > 60)]           # | is the OR operator

In [None]:
# Sorting
df.column_x.sort_values()                             #sort a column (Series.order() was removed from pandas; sort_values() replaces it)
df.sort_values('column_x')                            #sort a dataframe by a single column
df.sort_values('column_x', ascending=False)           #use descending order instead

In [None]:
# Sort dataframe by multiple columns: col1 and col2 ascending, col3 descending.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df = df.sort_values(['col1', 'col2', 'col3'], ascending=[True, True, False])

In [None]:
# Keep only the rows whose column_x value appears in a list (pandas.Series.isin)
df.loc[df["column_x"].isin(["string_1", "string_2"])]

## Renaming, Adding and Removing columns

In [None]:
# Rename selected columns; the dict maps each original name to its new name
df.rename(
    columns={
        "original_column_1": "column_x",
        "original_column_2": "column_y",
    },
    inplace=True,    # apply the change to df itself
)

In [None]:
# Overwrite every column name at once by assigning a list of labels to df.columns
new_cols = ["column_x", "column_y", "column_z"]
df.columns = new_cols

In [None]:
# Supply replacement column names while reading the file:
# header=0 points at the file's own header row, names= provides the new labels
df = pd.read_csv("df.csv", names=new_cols, header=0)

In [None]:
# Derive new columns from existing ones; the arithmetic is applied to the
# whole column at once, so no for loop is needed
df["new_column_1"] = df["column_x"] + df["column_y"]
df["new_column_2"] = df["column_x"] * 1000

In [None]:
# Removing columns (columns=... is the keyword form of axis=1; axis=0 would target rows)
df.drop(columns="column_x")                              # returns a new frame - does not drop in place
df.drop(columns=["column_x", "column_y"], inplace=True)  # drop multiple columns, modifying df

In [None]:
# Lower-case and upper-case all dataframe column names.
# Uses the vectorized string accessor on the column Index rather than
# assigning a lazy map() iterator.
df.columns = df.columns.str.lower()
df.columns = df.columns.str.upper()

In [None]:
# Fancier column renaming: pass a function that is applied to every column name.
# Here each name is reduced to the text after its last '.' (e.g. 'a.b.c' -> 'c');
# names without a '.' are left unchanged.
df.rename(columns=lambda label: label.rsplit('.', 1)[-1], inplace=True)

## Handling Missing Values

In [None]:
# value_counts() leaves missing values out unless told otherwise
df["column_x"].value_counts()                            # missing values excluded
df["column_x"].value_counts(dropna=False)                # missing values included

In [None]:
# Flag the missing entries of a Series (isna/notna are the same methods as isnull/notnull)
df["column_x"].isna()                                    # True where the value is missing
df["column_x"].notna()                                   # True where the value is present

In [None]:
# A boolean Series can be used as a row filter
df.loc[df["column_x"].isnull()]                          # rows where column_x is missing
df.loc[df["column_x"].notnull()]                         # rows where column_x is not missing

In [None]:
# Understanding axis
df.sum(axis=0)                                           # sums 'down' the rows: one total per column
df.sum()                                                 # same result, since axis=0 is the default
df.sum(axis=1)                                           # sums 'across' the columns: one total per row

In [None]:
# Summing booleans: True counts as 1 and False as 0
pd.Series(data=[True, False, True])                      # a boolean Series
pd.Series(data=[True, False, True]).sum()                # total equals the number of True values

In [None]:
# Locate missing values across the whole dataframe (isna is the same method as isnull)
df.isna()                                                # dataframe of booleans, True where a cell is missing
df.isna().sum()                                          # number of missing values in each column
df.isna().sum().sum()                                    # total number of missing values in the dataframe

In [None]:
# Drop rows with missing values (rows are the default; pass axis=1 to drop columns instead)
df.dropna(how="any", inplace=True)                       # drop a row if ANY of its values is missing (the default behaviour)
df.dropna(how="all", inplace=True)                       # drop a row only if ALL of its values are missing

In [None]:
# Fill in missing values
# The result is assigned back to the column: fillna(inplace=True) called on a
# column pulled out of the dataframe acts on an intermediate object and is not
# guaranteed to update df itself.
df['column_x'] = df['column_x'].fillna(value='NaN')      #fill the missing values of column_x with the string 'NaN'
df['column_x'] = df['column_x'].fillna(value=np.nan)     #alternative using numpy; np.nan is itself the missing-value marker, so substitute a real value here
#value does not have to be a string - it can be a calculated scalar such as df.column_x.mode()[0], or just a number like 0

In [None]:
# Read the file with missing-value detection switched off (na_filter=False)
df = pd.read_csv("df.csv", names=new_cols, header=0, na_filter=False)