# Pandas

Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
built on top of the Python programming language.

* A fast and efficient DataFrame object for data manipulation with integrated indexing.

* Tools for reading and writing data between in-memory data structures and different formats: CSV and text files, Microsoft Excel, SQL databases.

* Intelligent data alignment and integrated handling of missing data: gain automatic label-based alignment in computations and easily manipulate messy data into an orderly form.

* Flexible reshaping and pivoting of data sets.

* Intelligent label-based slicing, fancy indexing, and subsetting of large data sets.

* Aggregating or transforming data with a powerful group by engine allowing split-apply-combine operations on data sets.

* High performance merging and joining of data sets.

* Time-series functionality: date range generation and frequency conversion, moving window statistics, date shifting and lagging.

* Highly optimized for performance, with critical code paths written in Cython or C.

You can find pandas documentation [here](https://pandas.pydata.org/docs/).

In [None]:
# best practice import
import pandas as pd

# set display options: maximum rows / columns shown and output width
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [None]:
# read data from csv
df = pd.read_csv('listings.csv')

In [None]:
# print first 5 rows
df.head(5)

In [None]:
# print last 7 rows
df.tail(7)

In [None]:
# print shape of data
df.shape

In [None]:
# info
df.info()

In [None]:
# missing values
df.isna()

In [None]:
# missing values summary
df.isna().sum()

In [None]:
# drop rows that have a missing value in the 'name' column
# (inplace=True modifies df itself; there is no reset_index call, so the
# remaining rows keep their original index labels)
df.dropna(subset=['name'], inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
# numeric data types description
df.describe().round(2)

In [None]:
# print dataframe columns
list(df.columns)

In [None]:
# one column of dataframe => series
df.host_name

In [None]:
# check the type
type(df.host_name)

In [None]:
# another way to access one column
df['host_name']

In [None]:
type(df['host_name'])

In [None]:
# one column as dataframe
df[['host_name']]

In [None]:
# check the type
type(df[['host_name']])

In [None]:
# select columns from dataframe
columns = ['host_name','host_location','host_since']
df[columns]

In [None]:
# loc method: label-based selection (row label 0, column 'host_name')
df.loc[0, 'host_name']

In [None]:
df.head(2)

In [None]:
# iloc method: position-based selection (first row, second column)
df.iloc[0,1]

In [None]:
# check data types
df.dtypes

In [None]:
# mean method on a series
df['review_scores_rating'].mean()

In [None]:
# broadcasting
df['review_scores_rating'] - 50

In [None]:
# broadcasting
df['review_scores_rating'] - df['review_scores_rating'].mean()

In [None]:
# compute values and assign to a new column
df['review_score_above_average'] = df['review_scores_rating'] - df['review_scores_rating'].mean()

In [None]:
df.head()

In [None]:
df[['review_scores_rating','review_score_above_average']]

In [None]:
# multiplication with different data types
# NOTE: 'price' has not been cleaned yet at this point -- it is still text and
# is only converted to float in a later cell -- so 'min_revenue' is not a
# numeric revenue here; presumably this cell is meant to show that problem
df['min_revenue'] = df['minimum_nights'] * df['price']
df[['minimum_nights', 'price', 'min_revenue']].head(15)

In [None]:
df[['minimum_nights','price']].dtypes

In [None]:
# clean the price column: remove the currency symbol and thousands separators,
# then convert the remaining text to float.
# Only characters that cannot be part of a number are removed, so a value
# without a leading symbol keeps its first digit (a positional slice such as
# .str[1:] would silently cut it off). Missing values stay NaN.
# NOTE(review): assumes '.' is the decimal separator -- confirm against the data
df['price'] = df['price'].str.replace(r'[^\d.\-]', '', regex=True).astype(float)

In [None]:
# compute min_revenue again with right data types
df['min_revenue'] = df['minimum_nights'] * df['price']
df[['minimum_nights', 'price', 'min_revenue']].head(15)

In [None]:
df['last_review']

In [None]:
# conversion to datetime
df['last_review'] = pd.to_datetime(df['last_review'])
df.dtypes

In [None]:
# create a new column with year only
df['year'] = df['last_review'].dt.year

In [None]:
df[['last_review','year']]

In [None]:
# string conversion to lowercase
df['name_lower'] = df['name'].str.lower()

In [None]:
df[['name','name_lower']]

In [None]:
# check the unique values from a column
df['host_has_profile_pic'].unique()

In [None]:
# create a boolean series
df['host_has_profile_pic'] == 'f'

In [None]:
# subsetting dataframe based on a boolean series:
# keep rows where the host has a profile picture AND there are more than 3 bedrooms
df_subset = df.loc[df['host_has_profile_pic'].eq('t') & df['bedrooms'].gt(3)]

In [None]:
df_subset[['host_has_profile_pic','bedrooms']]

In [None]:
# compute average room price according to type
# (group first, then pick the column to aggregate; as_index=False keeps
# 'room_type' as a regular column of the result)
df.groupby('room_type', as_index=False)['price'].mean()

In [None]:
# compute average room price according to room_type and year
# (as_index=False keeps both grouping keys as regular columns of the result)
analysis_result = (
    df.groupby(['room_type', 'year'], as_index=False)['price']
    .mean()
)
analysis_result

In [None]:
# export to csv
analysis_result.to_csv('analysis_result.csv',index=False)

# Stretch content

In [None]:
# small toy frame: each dict key becomes a column, each list its values
dct = dict(a=[1, 2, 3, 4], b=[5, 6, 7, 8])

df = pd.DataFrame(data=dct)
df

In [None]:
# function that adds one to the given value
def add_one(value):
    """Return ``value + 1``.

    ``value`` can be anything that supports ``+ 1``; the addition is
    delegated to the operand, so the result type follows from ``value``.
    """
    incremented = value + 1
    return incremented

In [None]:
# apply function to all values in a dataframe
df.apply(add_one)

In [None]:
# create a new column 'c' 
df['c'] = df['a'].apply(add_one)
df

In [None]:
# return sum of values from column 'a' and column 'b'
def sum_a_and_b_columns(row):
    """Add the 'a' and 'b' entries of a single row.

    ``row`` is indexed with the keys 'a' and 'b'; any other entries it
    holds are ignored. Returns ``row['a'] + row['b']``.
    """
    return row['a'] + row['b']

In [None]:
# apply to each row
df['d'] = df.apply(sum_a_and_b_columns, axis=1)

In [None]:
df

In [None]:
# performance of apply
%timeit df.apply(sum_a_and_b_columns, axis=1)

In [None]:
# performance of vectorized arrays 
%timeit df['a'] + df['b']