# Pandas Demo


In [None]:
import pandas as pd


## Pandas' Core Data Structures

### Dataframes


In [None]:
# sample dataframe built from a dict: each key becomes a column name and
# each list holds that column's values (one entry per row)
pd.DataFrame({
    'Gender': ['male', 'female'],
    'Age': [42, 35]
})


In [None]:
# sample dataframe with explicit row labels supplied via `index`
# (one label per row, in the same order as the column values)
pd.DataFrame(
    {'Gender': ['male', 'female'], 'Age': [42, 35]},
    index=['Harry', 'Sally'])


### Series


In [None]:
# sample series: a one-dimensional labelled array; without an explicit
# `index` the values get the default integer labels 0..n-1
pd.Series([1, 2, 3, 4, 5])


In [None]:
# sample series with explicit (string) index labels and a name;
# the name is shown with the series and is used as the column label
# if the series is later turned into a dataframe column
pd.Series([200, 230, 210], index=['2018', '2019', '2020'], name='Items Sold')


### Import and Explore Dataframe from CSV


In [None]:
# load the CSV file into a dataframe; the path is relative to the
# notebook's working directory and is reused by a later cell
csv_path = 'data/netflix_titles.csv'
df = pd.read_csv(csv_path)


In [None]:
# dimensions of the dataframe as a (number of rows, number of columns) tuple
df.shape


In [None]:
# first 5 rows
df.head(5)


In [None]:
# last 5 rows
df.tail(5)


In [None]:
# read the csv again, this time using the first CSV column (position 0)
# as the row index instead of the default integer index
df = pd.read_csv(csv_path, index_col=0)
df.head(3)


In [None]:
# move the index back into a regular column and restore the default
# integer index (0..n-1); later cells rely on these integer labels
df = df.reset_index()
df.head(3)


## Data Selection

### Choose one or more columns


In [None]:
# select the 'title' column via attribute access -> gives you a Series
# (only works when the column name is a valid Python identifier and does
# not clash with an existing dataframe attribute or method)
df.title


In [None]:
# select the 'title' column via bracket notation -> same Series as df.title;
# this form works for any column name
df['title']


In [None]:
# first element of the column: looks up index label 0, which is the first
# row because the dataframe has the default integer index after reset_index()
df['title'][0]


In [None]:
# choose multiple columns by passing a list of names -> gives you another
# DataFrame with the columns in the listed order
df[["type", "title", "date_added", "duration"]]


### Integer Indexing with `iloc`


In [None]:
# select first row by integer position with iloc (returned as a Series
# whose index holds the column names)
df.iloc[0]


In [None]:
# select first column by integer position with iloc
# (`:` = all rows, `0` = column at position 0)
df.iloc[:, 0]


In [None]:
# select first column and first 5 rows
# (positional slices exclude the end: positions 0..4)
df.iloc[:5, 0]


In [None]:
# select first column and first and third rows
# (a list of positions picks exactly those rows)
df.iloc[[0, 2], 0]


In [None]:
# select first column and last 10 rows
# (a negative slice start counts from the end)
df.iloc[-10:, 0]


### Indexing with `loc`


In [None]:
# select cast of the third entry: loc is label-based, and with the default
# integer index the label 2 is the third row
df.loc[2, 'cast']


In [None]:
# select title, director and cast for all rows
# (`:` = every row label, list = column labels)
df.loc[:, ['title', 'director', 'cast']]


### Choose by condition


In [None]:
# check for every row whether the country equals 'United States'
# -> gives you a boolean Series (True/False per row)
df.country == 'United States'


In [None]:
# choose all the entries where the country is the US
# (loc keeps the rows where the boolean mask is True)
df.loc[df.country == 'United States']


In [None]:
# all US entries released in 2017 or later (note the inclusive `>=`);
# conditions are combined element-wise with `&`, and each one needs its
# own parentheses because `&` binds tighter than the comparisons
df.loc[(df.country == 'United States') & (df.release_year >= 2017)]


In [None]:
# all entries that are from the US OR were released in 2017 or later
# (element-wise OR with `|`; the `>=` is inclusive)
df.loc[(df.country == 'United States') | (df.release_year >= 2017)]


In [None]:
# all entries whose country is either the US or the UK
# (isin is True when the value equals any element of the list)
df.loc[df.country.isin(['United States', 'United Kingdom'])]


In [None]:
# all entries where a director is specified (i.e. the value is not missing)
df.loc[df.director.notnull()]


In [None]:
# use index for easy filtering (alternative): make 'country' the row index
# so that rows can be looked up by country label with loc;
# set_index returns a new dataframe, `df` itself is left unchanged
df_by_country = df.set_index("country")
display(df_by_country.head(3))

# all rows whose index label is exactly "United States"
df_by_country.loc["United States"]


In [None]:
# assign a column with constant value: the scalar is broadcast to every row
# (adds the column 'Watched' to df in place)
df['Watched'] = 'Yes'


In [None]:
# create derived information: number of characters in each title.
# The vectorised string accessor replaces the per-row Python lambda and
# yields NaN for a missing title instead of raising a TypeError.
df["title_length"] = df["title"].str.len()

# show the entry with the longest title (sort descending, take the top row)
df.sort_values("title_length", ascending=False).iloc[0]


## Summaries


In [None]:
# concise overview with technical information:
# column names, non-null counts, dtypes and memory usage
df.info()


In [None]:
# fundamental statistics (count, mean, std, min, quartiles, max);
# by default only the numeric columns are summarised
df.describe()


In [None]:
# count unique elements per column (missing values are not counted by default)
df.nunique()


In [None]:
# list unique elements of the 'rating' column, in order of first appearance
df.rating.unique()


In [None]:
# select only the integer-typed columns and show their first 5 rows
# (`include` keeps the listed dtypes; `exclude` would drop them and return
# every non-integer column instead)
df.select_dtypes(include=['int']).head(5)


In [None]:
# count non-NA cells per column
# (columns with fewer entries than the number of rows contain missing values)
df.count()


In [None]:
# get max value per column
# NOTE(review): this also reduces the text columns; on recent pandas
# versions a text column that contains missing values may raise a TypeError
# here -- confirm, and consider df.max(numeric_only=True) if only the
# numeric columns are of interest
df.max()


In [None]:
# get min value for one column (here: the earliest release year)
df.release_year.min()


## Use-Cases and Exploratory Data Analysis


In [None]:
# inspect unique entries: number of distinct values per column ...
display(df.nunique())
# ... and the distinct values of the 'type' column
df.type.unique()


In [None]:
# boolean mask marking the rows whose type is "Movie" (reused by later cells)
is_movie = df["type"].eq("Movie")

# three randomly drawn movies, then three randomly drawn rows of any other type
display(df.loc[is_movie].sample(n=3))
display(df.loc[~is_movie].sample(n=3))


In [None]:
# work on an independent copy of the movie rows so that later column
# assignments do not touch (or warn about) the original dataframe
movies = df.loc[is_movie].copy()
movies.info()


In [None]:
# inspect duration field: list its distinct raw values before transforming it
movies.duration.unique()


In [None]:
# reduce every duration string to its first whitespace-separated token
movies["duration"] = movies["duration"].map(lambda text: text.split()[0])
movies["duration"]


In [None]:
# convert the duration column to integers, then show the numeric summary
movies["duration"] = movies["duration"].astype(int)
movies.describe()


In [None]:
# the 5 rows with the smallest duration, in ascending order
movies.nsmallest(5, "duration")


## Merging Two Dataframes


In [None]:
# two small frames describing the same fruits; the fruit names live in
# differently named columns ('product' vs. 'fruit')
df_fruit_prices = pd.DataFrame(
    data={
        'product': ['apple', 'banana', 'cherry'],
        'price': [0.5, 0.3, 0.05],
    }
)
df_fruit_colours = pd.DataFrame(
    data={
        'fruit': ['apple', 'banana', 'cherry'],
        'colour': ['green', 'yellow', 'red'],
    }
)
display(df_fruit_prices)
display(df_fruit_colours)


In [None]:
# join the two frames on differently named key columns:
# 'product' from the left frame is matched against 'fruit' from the right
pd.merge(
    df_fruit_prices,
    df_fruit_colours,
    left_on='product',
    right_on='fruit',
)


In [None]:
# rename 'product' column to 'fruit' column so both frames share the key name;
# rename returns a new dataframe, so the result is assigned back
df_fruit_prices = df_fruit_prices.rename(columns={'product': 'fruit'})
df_fruit_prices


In [None]:
# inner join on the shared 'fruit' column: only fruits present in both
# frames are kept, and the key column appears once in the result
pd.merge(df_fruit_prices, df_fruit_colours, on='fruit', how='inner')
