In [43]:
import pandas as pd
import numpy as np

from pydataset import data
from matplotlib import pyplot as plt

In [44]:
# With no arguments, data() shows the catalogue of available datasets:
# one row per dataset_id together with its title.
data()

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
...,...,...
752,VerbAgg,Verbal Aggression item responses
753,cake,Breakage Angle of Chocolate Cakes
754,cbpp,Contagious bovine pleuropneumonia
755,grouseticks,Data on red grouse ticks from Elston et al. 2001


## Read in / write out data

In [45]:
# Load the iris measurements from the seaborn-data repository (needs network access).
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
print(iris.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
iris.info()

# Load the 'Housing' dataset through pydataset's data() helper.
housing = data('Housing')
print(housing.head())
housing.info()

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
     price  lotsize  bedrooms  bathrms  stories driveway recroom fullbase  \
1  42000.0     5850         3        1        2      yes      no      yes   


## Create a new data frame

In [46]:
# Build a small demo frame with a different dtype in each column. The scalar
# entries ('A', 'B', 'F') are repeated to match the four-row columns.
df = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('2020-09-20'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train'] * 2),
        'F': 'foo',
    }
)

print(df.columns)
df.head()

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-09-20,1.0,3,test,foo
1,1.0,2020-09-20,1.0,3,train,foo
2,1.0,2020-09-20,1.0,3,test,foo
3,1.0,2020-09-20,1.0,3,train,foo


In [47]:
# Summary statistics (count, mean, std, min, quartiles, max); the result
# covers only the numeric columns A, C and D.
df.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


## Sort

In [48]:
# Order rows by species, then by sepal length within each species (both ascending).
iris.sort_values(
    by=['species', 'sepal_length'],
    ascending=[True, True],
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
8,4.4,2.9,1.4,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
42,4.4,3.2,1.3,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
3,4.6,3.1,1.5,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
22,4.6,3.6,1.0,0.2,setosa
47,4.6,3.2,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [49]:
# Sort along the column axis: columns are reordered by label, ascending
# (the default direction); rows are left untouched.
df.sort_index(axis='columns')

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-09-20,1.0,3,test,foo
1,1.0,2020-09-20,1.0,3,train,foo
2,1.0,2020-09-20,1.0,3,test,foo
3,1.0,2020-09-20,1.0,3,train,foo


In [50]:
# Inspect the row index; here it is the default RangeIndex covering rows 0-149.
iris.index

RangeIndex(start=0, stop=150, step=1)

In [51]:
# Reorder the columns alphabetically by label; row order is unchanged.
iris.sort_index(axis='columns')

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,species
0,1.4,0.2,5.1,3.5,setosa
1,1.4,0.2,4.9,3.0,setosa
2,1.3,0.2,4.7,3.2,setosa
3,1.5,0.2,4.6,3.1,setosa
4,1.4,0.2,5.0,3.6,setosa
5,1.7,0.4,5.4,3.9,setosa
6,1.4,0.3,4.6,3.4,setosa
7,1.5,0.2,5.0,3.4,setosa
8,1.4,0.2,4.4,2.9,setosa
9,1.5,0.1,4.9,3.1,setosa


In [52]:
# Selecting a column, a positional row range, and a filtered subset.
print(iris.loc[:, 'species'].head())                       # one column by label
print(iris.iloc[:3])                                       # first three rows by position
print(iris['sepal_length'].head())                         # one column via []
print(iris.loc[iris['species'].eq('versicolor')].head())   # boolean-mask row filter

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal_length, dtype: float64
    sepal_length  sepal_width  petal_length  petal_width     species
50           7.0          3.2           4.7          1.4  versicolor
51           6.4          3.2           4.5          1.5  versicolor
52           6.9          3.1           4.9          1.5  versicolor
53           5.5          2.3           4.0          1.3  versicolor
54           6.5          2.8           4.6          1.5  versicolor


## Merge/join

In [53]:
# Two small frames whose key columns carry different names; 'foo' appears
# twice in each, so the join yields every foo/foo pairing.
df1 = pd.DataFrame(dict(lkey=['foo', 'bar', 'baz', 'foo'], value=[1, 2, 3, 5]))
df2 = pd.DataFrame(dict(rkey=['foo', 'bar', 'baz', 'foo'], value=[5, 6, 7, 8]))

# Join on lkey == rkey; the shared 'value' column is disambiguated with
# the _x / _y suffixes.
print(pd.merge(df1, df2, left_on='lkey', right_on='rkey'))

  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7


## Group by/aggregate

In [59]:
# Per-species average of each measurement column.
iris.groupby('species').agg('mean')

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [61]:
# Per-species maximum of each measurement column.
iris.groupby('species').agg('max')

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [None]:
# aggregate() requires at least one aggregation function; calling it with no
# arguments raises a TypeError. A list of function names computes several
# statistics per measurement column in one pass, giving (column, statistic)
# column labels in the result.
iris.groupby('species').aggregate(['min', 'mean', 'max'])

## Add rows, add columns

In [54]:
# Add a string column to df; the list supplies one value per existing row.
df.loc[:, 'new_col'] = ['tomorrow', 'creeps', 'petty', 'pace']
df

Unnamed: 0,A,B,C,D,E,F,new_col
0,1.0,2020-09-20,1.0,3,test,foo,tomorrow
1,1.0,2020-09-20,1.0,3,train,foo,creeps
2,1.0,2020-09-20,1.0,3,test,foo,petty
3,1.0,2020-09-20,1.0,3,train,foo,pace


## Remove/identify duplicates

## Identify missing values

In [55]:
# Keep only the rows that have a missing value in at least one column.
housing.loc[housing.isna().any(axis='columns')]

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea


## Moving windows

## Linear regression

## Logistic regression

## K-means clustering

## Cross-validation

## Scatterplot

## Barplot

## String manipulations