# Dataframe

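- Axis 0 is the index (the row labels); `axis=0` can also be written `axis="index"`
- Axis 1 is the columns (the column labels); `axis=1` can also be written `axis="columns"`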

## Chapter 17 - Introduction

In [1]:
import pandas as pd

In [2]:
#
# Build a DataFrame from a dict: every key becomes a column label and
# every list supplies that column's values, top to bottom.
# No index is passed, so the rows are labelled 0, 1, 2.
#
df = pd.DataFrame(
    data={
        "a": [100, 200, 300],
        "b": [400, 500, 600],
        "c": [700, 800, 900],
    },
)
df

Unnamed: 0,a,b,c
0,100,400,700
1,200,500,800
2,300,600,900


In [3]:
#
# Select a single column by its label.
# The result is a Series that keeps the frame's index and is named after the column.
#
df["b"]

0    400
1    500
2    600
Name: b, dtype: int64

In [4]:
# Confirm that a single selected column is a pandas Series
type(df["b"])

pandas.core.series.Series

In [5]:
#
# Another way to build a DataFrame: give the values row by row as a list
# of lists, then label the rows (index) and the columns explicitly.
#
df = pd.DataFrame(
    [
        [100, 200, 300],
        [400, 500, 600],
        [700, 800, 900],
    ],
    columns=["a", "b", "c"],
    index=["first", "second", "third"],
)
df

Unnamed: 0,a,b,c
first,100,200,300
second,400,500,600
third,700,800,900


## Attributes

In [6]:
# Load the CSV into a DataFrame. The path is relative, so the file must be
# reachable from the kernel's current working directory.
richest = pd.read_csv("TopRichestInWorld.csv")

In [7]:
# Display the frame; pandas truncates long frames and shows only the first and last rows
richest

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
0,Elon Musk,"$219,000,000,000",50,United States,"Tesla, SpaceX",Automotive
1,Jeff Bezos,"$171,000,000,000",58,United States,Amazon,Technology
2,Bernard Arnault & family,"$158,000,000,000",73,France,LVMH,Fashion & Retail
3,Bill Gates,"$129,000,000,000",66,United States,Microsoft,Technology
4,Warren Buffett,"$118,000,000,000",91,United States,Berkshire Hathaway,Finance & Investments
...,...,...,...,...,...,...
96,Vladimir Potanin,"$17,300,000,000",61,Russia,metals,Metals & Mining
97,Harold Hamm & family,"$17,200,000,000",76,United States,oil & gas,Energy
98,Sun Piaoyang,"$17,100,000,000",63,China,pharmaceuticals,Healthcare
99,Luo Liguo & family,"$17,000,000,000",66,China,chemicals,Manufacturing


In [8]:
richest.shape  # Tuple of (number of rows, number of columns)

(101, 6)

In [9]:
richest.size  # Total number of cells: number of rows * number of columns

606

In [10]:
richest.index  # Row labels (axis 0); read_csv was given no index_col, so this is a default RangeIndex

RangeIndex(start=0, stop=101, step=1)

In [11]:
richest.columns  # Column labels (axis 1)

Index(['Name', 'NetWorth', 'Age', 'Country/Territory', 'Source', 'Industry'], dtype='object')

In [12]:
richest.axes  # List holding the row index first, then the column index

[RangeIndex(start=0, stop=101, step=1),
 Index(['Name', 'NetWorth', 'Age', 'Country/Territory', 'Source', 'Industry'], dtype='object')]

In [13]:
# Data type of each column. NetWorth is object rather than numeric because
# its values are strings such as "$219,000,000,000".
richest.dtypes

Name                 object
NetWorth             object
Age                   int64
Country/Territory    object
Source               object
Industry             object
dtype: object

In [14]:
# First five rows as a NumPy array; the mixed column types give dtype=object
richest.head(5).values

array([['Elon Musk', '$219,000,000,000', 50, 'United States',
        'Tesla, SpaceX', 'Automotive'],
       ['Jeff Bezos', '$171,000,000,000', 58, 'United States', 'Amazon',
        'Technology'],
       ['Bernard Arnault & family', '$158,000,000,000', 73, 'France',
        'LVMH', 'Fashion & Retail'],
       ['Bill Gates', '$129,000,000,000', 66, 'United States',
        'Microsoft', 'Technology'],
       ['Warren Buffett', '$118,000,000,000', 91, 'United States',
        'Berkshire Hathaway', 'Finance & Investments']], dtype=object)

## Chapter 18 - Methods

In [15]:
# A small Series to compare Series methods with DataFrame methods.
# No index is passed, so the values are labelled 0, 1, 2.
series = pd.Series(data=[100, 200, 300])

In [16]:
# Display the Series: index labels on the left, values on the right
series

0    100
1    200
2    300
dtype: int64

In [17]:
# Rebuild the 3x3 frame with the default 0..2 index. `df` was last assigned
# the frame labelled "first"/"second"/"third", so it is recreated here.
df = pd.DataFrame(
    data={
        "a": [100, 200, 300],
        "b": [400, 500, 600],
        "c": [700, 800, 900],
    },
)

In [18]:
# Display the DataFrame that the sums below operate on
df

Unnamed: 0,a,b,c
0,100,400,700
1,200,500,800
2,300,600,900


In [19]:
# Summing a Series adds all of its values and returns a single scalar
series.sum()

np.int64(600)

In [20]:
# Summing a DataFrame returns a Series with one total per column
df.sum()

a     600
b    1500
c    2400
dtype: int64

In [21]:
# By default, df.sum() uses axis=0: values are added down the rows,
# giving one total per column. Passing axis=0 explicitly gives the same
# result as the bare df.sum() call.
df.sum(axis=0)

a     600
b    1500
c    2400
dtype: int64

In [None]:
# Also the same: axis 0 can be given by name as "index",
# again producing one total per column
df.sum(axis="index")

In [22]:
# To add across the columns instead, pass axis=1:
# the result has one total per row (per index label)
df.sum(axis=1)

0    1200
1    1500
2    1800
dtype: int64

In [24]:
# Same as axis=1: the axis can be given by name as "columns"
df.sum(axis="columns")

0    1200
1    1500
2    1800
dtype: int64