## The Pandas DataFrame: Make Working With Data Delightful
___
Source: [Real Python — The Pandas DataFrame: Make Working With Data Delightful](https://realpython.com/pandas-dataframe/)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column-oriented source data: each key is a column name and each list holds
# that column's seven values, one per row.
# (The "..." continuation prompts pasted from a REPL session were removed;
# they only worked because IPython strips them, and are not valid Python.)
df_data = {
    'name': ['Xavier', 'Ann', 'Jana', 'Yi', 'Robin', 'Amal', 'Nori'],
    'city': ['Mexico City', 'Toronto', 'Prague', 'Shanghai',
             'Manchester', 'Cairo', 'Osaka'],
    'age': [41, 28, 33, 34, 38, 31, 37],
    'py-score': [88.0, 79.0, 81.0, 80.0, 68.0, 61.0, 84.0],
}

In [3]:
# Echo the raw dict to check its contents before building a DataFrame from it.
df_data

{'name': ['Xavier', 'Ann', 'Jana', 'Yi', 'Robin', 'Amal', 'Nori'],
 'city': ['Mexico City',
  'Toronto',
  'Prague',
  'Shanghai',
  'Manchester',
  'Cairo',
  'Osaka'],
 'age': [41, 28, 33, 34, 38, 31, 37],
 'py-score': [88.0, 79.0, 81.0, 80.0, 68.0, 61.0, 84.0]}

In [4]:
# Seven consecutive integer row labels, 101 through 107 inclusive.
row_labels = list(range(101, 108))

In [12]:
# Build the main frame: dict keys become the columns, row_labels the index.
df = pd.DataFrame(data=df_data, index=row_labels)

In [7]:
# Preview the top of the frame; the output stops at label 105, i.e. the first
# five of the seven rows.
df.head()

Unnamed: 0,name,city,age,py-score
101,Xavier,Mexico City,41,88.0
102,Ann,Toronto,28,79.0
103,Jana,Prague,33,81.0
104,Yi,Shanghai,34,80.0
105,Robin,Manchester,38,68.0


In [8]:
# Bracket access with a column label returns that column as a Series
# (note `Name: age` in the output).
df['age']

101    41
102    28
103    33
104    34
105    38
106    31
107    37
Name: age, dtype: int64

In [10]:
df = pd.DataFrame(df_data, index=row_labels, columns={'name':'player_name', 'py-score':'score'})

In [11]:
df

Unnamed: 0,name,py-score
101,Xavier,88.0
102,Ann,79.0
103,Jana,81.0
104,Yi,80.0
105,Robin,68.0
106,Amal,61.0
107,Nori,84.0


In [13]:
# Export the values as a NumPy array; with text and numeric columns mixed,
# the result shown below has dtype=object.
df.to_numpy()

array([['Xavier', 'Mexico City', 41, 88.0],
       ['Ann', 'Toronto', 28, 79.0],
       ['Jana', 'Prague', 33, 81.0],
       ['Yi', 'Shanghai', 34, 80.0],
       ['Robin', 'Manchester', 38, 68.0],
       ['Amal', 'Cairo', 31, 61.0],
       ['Nori', 'Osaka', 37, 84.0]], dtype=object)

In [14]:
# Print a summary: index range, per-column non-null counts and dtypes, and
# the memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 101 to 107
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7 non-null      object 
 1   city      7 non-null      object 
 2   age       7 non-null      int64  
 3   py-score  7 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 280.0+ bytes


In [27]:
# Downcast the two numeric columns to narrower dtypes to shrink the frame.
narrow_dtypes = {'age': 'uint8', 'py-score': 'float16'}
df = df.astype(narrow_dtypes)

In [28]:
# Re-check the summary: 'age' and 'py-score' now report uint8 / float16 and
# the memory usage line is smaller than before the cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 101 to 107
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7 non-null      object 
 1   city      7 non-null      object 
 2   age       7 non-null      uint8  
 3   py-score  7 non-null      float16
dtypes: float16(1), object(2), uint8(1)
memory usage: 189.0+ bytes


In [29]:
# Byte size of the 'name' column's underlying array. The value shown is 56,
# i.e. 8 bytes per row for 7 rows -- presumably the object references rather
# than the string contents themselves (TODO confirm).
df['name'].nbytes

56

In [19]:
# Number of axes of the frame (rows and columns -> 2).
df.ndim

2

In [20]:
# (row count, column count) as a tuple.
df.shape

(7, 4)

In [21]:
# Total number of cells: rows x columns (7 x 4 = 28 here).
df.size

28

In [25]:
# Per-column byte counts (the index is listed as its own entry).
df.memory_usage()

Index       56
name        56
city        56
age          7
py-score    14
dtype: int64

In [30]:
# Show the whole frame; at seven rows it is small enough to display in full.
df

Unnamed: 0,name,city,age,py-score
101,Xavier,Mexico City,41,88.0
102,Ann,Toronto,28,79.0
103,Jana,Prague,33,81.0
104,Yi,Shanghai,34,80.0
105,Robin,Manchester,38,68.0
106,Amal,Cairo,31,61.0
107,Nori,Osaka,37,84.0


In [31]:
# A single new record as a Series: its index reuses df's column labels so the
# values line up with the frame's columns, and `name=17` is the label the row
# will carry once it is added to the frame.
# (The pasted "..." REPL continuation prompt was removed; it is not valid
# Python outside IPython's prompt stripping.)
john = pd.Series(data=['John', 'Boston', 34, 79],
                 index=df.columns, name=17)

In [32]:
# Inspect the new record; mixing text and numbers gives the Series dtype object.
john

name          John
city        Boston
age             34
py-score        79
Name: 17, dtype: object

In [33]:
# Confirm the record is a pandas Series, not a DataFrame.
type(john)

pandas.core.series.Series

In [35]:
# Add `john` as a new row labelled 17 (the Series' name).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# Series is turned into a one-row frame and concatenated instead.
# infer_objects() converts the object-dtype row back to proper numeric
# columns before concatenation, mirroring what append() did internally.
# NOTE: re-running this cell adds the row again.
df = pd.concat([df, john.to_frame().T.infer_objects()])

In [38]:
# Replace the row labels with a fresh 0..n-1 range; the previous labels are
# kept as a regular column named 'index'.
df = df.reset_index()

In [42]:
# Discard the 'index' column that holds the old row labels.
# `columns=` already says which axis is meant, so the redundant `axis=1` was
# removed; the result is rebound instead of mutating with inplace=True.
df = df.drop(columns='index')

In [43]:
# Show the frame after the append and re-indexing: eight rows labelled 0-7.
df

Unnamed: 0,name,city,age,py-score
0,Xavier,Mexico City,41,88.0
1,Ann,Toronto,28,79.0
2,Jana,Prague,33,81.0
3,Yi,Shanghai,34,80.0
4,Robin,Manchester,38,68.0
5,Amal,Cairo,31,61.0
6,Nori,Osaka,37,84.0
7,John,Boston,34,79.0


In [53]:
# Drop rows 4-7 by label. The result is only displayed, not assigned, so `df`
# itself keeps all eight rows for the cells that follow.
df.drop(index=[4, 5, 6, 7])

Unnamed: 0,name,city,age,py-score
0,Xavier,Mexico City,41,88.0
1,Ann,Toronto,28,79.0
2,Jana,Prague,33,81.0
3,Yi,Shanghai,34,80.0


In [54]:
# Add a boolean 'won' column; the single (0-d) NumPy value is broadcast to
# every row of the frame.
df['won'] = np.array(False, dtype='bool')

In [57]:
# Per-column byte counts after adding 'won' (1 byte per row for the bool column).
df.memory_usage()

Index       128
name         64
city         64
age          64
py-score     64
won           8
dtype: int64

In [58]:
# Assign a plain Python bool instead of a NumPy value: it is broadcast the
# same way, and the memory report below is unchanged (8 bytes for 'won').
df['won'] = False
df.memory_usage()

Index       128
name         64
city         64
age          64
py-score     64
won           8
dtype: int64

In [62]:
# Downcast 'age' and 'py-score' again: the preceding memory report shows 64
# bytes for each (8 bytes per row), so the earlier narrow dtypes appear to
# have been widened when the extra row was added.
df = df.astype(dtype={'age':'uint8', 'py-score':'float16'})

In [63]:
# Verify the downcast: 'age' and 'py-score' should now take 1 and 2 bytes per row.
df.memory_usage()

Index       128
name         64
city         64
age           8
py-score     16
won           8
dtype: int64

In [65]:
# Give 'py-score' the shorter label 'score', then display the result.
column_renames = {'py-score': 'score'}
df = df.rename(columns=column_renames)
df

Unnamed: 0,name,city,age,score,won
0,Xavier,Mexico City,41,88.0,False
1,Ann,Toronto,28,79.0,False
2,Jana,Prague,33,81.0,False
3,Yi,Shanghai,34,80.0,False
4,Robin,Manchester,38,68.0,False
5,Amal,Cairo,31,61.0,False
6,Nori,Osaka,37,84.0,False
7,John,Boston,34,79.0,False


In [66]:
# Sort by age, oldest first. The sorted frame is only displayed; `df` keeps
# its original row order.
df.sort_values('age', ascending=False)

Unnamed: 0,name,city,age,score,won
0,Xavier,Mexico City,41,88.0,False
4,Robin,Manchester,38,68.0,False
6,Nori,Osaka,37,84.0,False
3,Yi,Shanghai,34,80.0,False
7,John,Boston,34,79.0,False
2,Jana,Prague,33,81.0,False
5,Amal,Cairo,31,61.0,False
1,Ann,Toronto,28,79.0,False


In [67]:
# Summary statistics; only the numeric 'age' and 'score' columns appear in
# the output.
df.describe()

Unnamed: 0,age,score
count,8.0,8.0
mean,34.5,77.5
std,4.105745,8.765625
min,28.0,61.0
25%,32.5,76.25
50%,34.0,79.5
75%,37.25,81.75
max,41.0,88.0


In [68]:
# Sort by age descending, breaking ties by name ascending (John is listed
# before Yi among the two 34-year-olds).
df.sort_values(['age', 'name'], ascending=[False,True])

Unnamed: 0,name,city,age,score,won
0,Xavier,Mexico City,41,88.0,False
4,Robin,Manchester,38,68.0,False
6,Nori,Osaka,37,84.0,False
7,John,Boston,34,79.0,False
3,Yi,Shanghai,34,80.0,False
2,Jana,Prague,33,81.0,False
5,Amal,Cairo,31,61.0,False
1,Ann,Toronto,28,79.0,False


In [70]:
# Print the built-in documentation for DataFrame.sample (exploratory lookup).
help(df.sample)

Help on method sample in module pandas.core.generic:

sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None) -> ~FrameOrSeries method of pandas.core.frame.DataFrame instance
    Return a random sample of items from an axis of object.
    
    You can use `random_state` for reproducibility.
    
    Parameters
    ----------
    n : int, optional
        Number of items from axis to return. Cannot be used with `frac`.
        Default = 1 if `frac` = None.
    frac : float, optional
        Fraction of axis items to return. Cannot be used with `n`.
    replace : bool, default False
        Allow or disallow sampling of the same row more than once.
    weights : str or ndarray-like, optional
        Default 'None' results in equal probability weighting.
        If passed a Series, will align with target object on index. Index
        values in weights not found in sampled object will be ignored and
        index values in sampled object not in weights will be

In [72]:
# Append a 'js-score' column holding one float per existing row (eight values).
df['js-score'] = np.array(
    [71.0, 95.0, 88.0, 79.0, 91.0, 91.0, 80.0, 100.0]
)

In [73]:
# Show the frame with the new 'js-score' column at the far right.
df

Unnamed: 0,name,city,age,score,won,js-score
0,Xavier,Mexico City,41,88.0,False,71.0
1,Ann,Toronto,28,79.0,False,95.0
2,Jana,Prague,33,81.0,False,88.0
3,Yi,Shanghai,34,80.0,False,79.0
4,Robin,Manchester,38,68.0,False,91.0
5,Amal,Cairo,31,61.0,False,91.0
6,Nori,Osaka,37,84.0,False,80.0
7,John,Boston,34,79.0,False,100.0


In [74]:
# Insert 'django-score' at column position 4 (right after 'score') rather
# than at the end. insert() modifies `df` in place and returns nothing, so
# there is no assignment.
# (The pasted "..." REPL continuation prompt was removed; it is not valid
# Python outside IPython's prompt stripping.)
df.insert(loc=4, column='django-score',
          value=np.array([86.0, 81.0, 78.0, 88.0, 74.0, 70.0, 81.0, 85.0]))

In [75]:
# Show the frame with 'django-score' now sitting between 'score' and 'won'.
df

Unnamed: 0,name,city,age,score,django-score,won,js-score
0,Xavier,Mexico City,41,88.0,86.0,False,71.0
1,Ann,Toronto,28,79.0,81.0,False,95.0
2,Jana,Prague,33,81.0,78.0,False,88.0
3,Yi,Shanghai,34,80.0,88.0,False,79.0
4,Robin,Manchester,38,68.0,74.0,False,91.0
5,Amal,Cairo,31,61.0,70.0,False,91.0
6,Nori,Osaka,37,84.0,81.0,False,80.0
7,John,Boston,34,79.0,85.0,False,100.0


In [76]:
# Keep django scores of 80 or more and replace every other entry with 0.0.
# The result is a new Series that is only displayed; `df` is not modified.
df['django-score'].where(cond=df['django-score'] >= 80, other=0.0)

0    86.0
1    81.0
2     0.0
3    88.0
4     0.0
5     0.0
6    81.0
7    85.0
Name: django-score, dtype: float64

In [79]:
# Boolean mask: True for rows whose django-score is at least 80.
x = df['django-score'] >= 80


In [81]:
# Turn the boolean mask into 1/0 flags. A vectorized cast replaces the
# per-element Python lambda; 'int64' is spelled out so the result dtype is
# the same on every platform.
x.astype('int64')

0    1
1    1
2    0
3    1
4    0
5    0
6    1
7    1
Name: django-score, dtype: int64

In [82]:
# interpolate() is pandas' gap-filling method. The output below is identical
# to `df`, so nothing appears to have been filled here.
df.interpolate()

Unnamed: 0,name,city,age,score,django-score,won,js-score
0,Xavier,Mexico City,41,88.0,86.0,False,71.0
1,Ann,Toronto,28,79.0,81.0,False,95.0
2,Jana,Prague,33,81.0,78.0,False,88.0
3,Yi,Shanghai,34,80.0,88.0,False,79.0
4,Robin,Manchester,38,68.0,74.0,False,91.0
5,Amal,Cairo,31,61.0,70.0,False,91.0
6,Nori,Osaka,37,84.0,81.0,False,80.0
7,John,Boston,34,79.0,85.0,False,100.0


In [83]:
# items() returns a lazy generator (see the repr below); it has to be
# iterated to see what it yields.
df.items()

<generator object DataFrame.items at 0x000002196DB54270>

In [84]:
# Walk the frame column by column: each step yields the column label and the
# column's values as a Series.
for column_label, column_values in df.items():
    print(column_label, column_values)

name 0    Xavier
1       Ann
2      Jana
3        Yi
4     Robin
5      Amal
6      Nori
7      John
Name: name, dtype: object
city 0    Mexico City
1        Toronto
2         Prague
3       Shanghai
4     Manchester
5          Cairo
6          Osaka
7         Boston
Name: city, dtype: object
age 0    41
1    28
2    33
3    34
4    38
5    31
6    37
7    34
Name: age, dtype: uint8
score 0    88.0
1    79.0
2    81.0
3    80.0
4    68.0
5    61.0
6    84.0
7    79.0
Name: score, dtype: float16
django-score 0    86.0
1    81.0
2    78.0
3    88.0
4    74.0
5    70.0
6    81.0
7    85.0
Name: django-score, dtype: float64
won 0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
Name: won, dtype: bool
js-score 0     71.0
1     95.0
2     88.0
3     79.0
4     91.0
5     91.0
6     80.0
7    100.0
Name: js-score, dtype: float64


In [85]:
# 24 hourly timestamps covering 2019-10-27 from 00:00 through 23:00.
# Lowercase 'h' is used for the hourly frequency: the uppercase 'H' alias is
# deprecated in recent pandas, and the resample call later in this notebook
# already uses the lowercase spelling ('6h').
# (The pasted "..." REPL continuation prompt was removed as well.)
dt = pd.date_range(start='2019-10-27 00:00:00.0', periods=24,
                   freq='h')

In [86]:
# 24 temperature readings, one for each hourly timestamp of the day.
# (The pasted "..." REPL continuation prompts were removed; they are not
# valid Python outside IPython's prompt stripping.)
temp_c = [ 8.0,  7.1,  6.8,  6.4,  6.0,  5.4,  4.8,  5.0,
           9.1, 12.8, 15.3, 19.1, 21.2, 22.1, 22.4, 23.1,
          21.0, 17.9, 15.5, 14.4, 11.9, 11.0, 10.2,  9.1]

In [87]:
# Inspect the generated DatetimeIndex (24 hourly entries).
dt

DatetimeIndex(['2019-10-27 00:00:00', '2019-10-27 01:00:00',
               '2019-10-27 02:00:00', '2019-10-27 03:00:00',
               '2019-10-27 04:00:00', '2019-10-27 05:00:00',
               '2019-10-27 06:00:00', '2019-10-27 07:00:00',
               '2019-10-27 08:00:00', '2019-10-27 09:00:00',
               '2019-10-27 10:00:00', '2019-10-27 11:00:00',
               '2019-10-27 12:00:00', '2019-10-27 13:00:00',
               '2019-10-27 14:00:00', '2019-10-27 15:00:00',
               '2019-10-27 16:00:00', '2019-10-27 17:00:00',
               '2019-10-27 18:00:00', '2019-10-27 19:00:00',
               '2019-10-27 20:00:00', '2019-10-27 21:00:00',
               '2019-10-27 22:00:00', '2019-10-27 23:00:00'],
              dtype='datetime64[ns]', freq='H')

In [90]:
# Time-indexed frame: a single 'temp' column of readings keyed by timestamp.
temperature_columns = {'temp': temp_c}
df_t = pd.DataFrame(temperature_columns, index=dt)
df_t

Unnamed: 0,temp
2019-10-27 00:00:00,8.0
2019-10-27 01:00:00,7.1
2019-10-27 02:00:00,6.8
2019-10-27 03:00:00,6.4
2019-10-27 04:00:00,6.0
2019-10-27 05:00:00,5.4
2019-10-27 06:00:00,4.8
2019-10-27 07:00:00,5.0
2019-10-27 08:00:00,9.1
2019-10-27 09:00:00,12.8


In [92]:
# Label-based slice using partial timestamp strings. Both endpoints are
# included: the 05:00 and 14:00 rows both appear in the output.
df_t.loc['2019-10-27 05':'2019-10-27 14']

Unnamed: 0,temp
2019-10-27 05:00:00,5.4
2019-10-27 06:00:00,4.8
2019-10-27 07:00:00,5.0
2019-10-27 08:00:00,9.1
2019-10-27 09:00:00,12.8
2019-10-27 10:00:00,15.3
2019-10-27 11:00:00,19.1
2019-10-27 12:00:00,21.2
2019-10-27 13:00:00,22.1
2019-10-27 14:00:00,22.4


In [95]:
# Bucket the hourly readings into 6-hour windows, then show each window's mean.
df_t6h = df_t.resample('6h')
df_t6h.mean()

Unnamed: 0,temp
2019-10-27 00:00:00,6.616667
2019-10-27 06:00:00,11.016667
2019-10-27 12:00:00,21.283333
2019-10-27 18:00:00,12.016667
