# Dataframe

- Axis 0 is the index
- Axis 1 is the columns

## Chapter 17 - Introduction

In [1]:
import pandas as pd

In [2]:
# Build a DataFrame from a mapping of column label -> list of column values.
# No index is supplied, so the rows get the default integer labels.
df = pd.DataFrame(
    data={"a": [100, 200, 300], "b": [400, 500, 600], "c": [700, 800, 900]},
)
df

Unnamed: 0,a,b,c
0,100,400,700
1,200,500,800
2,300,600,900


In [3]:
#
# Access a single column by its label with square brackets.
# The result is a Series that keeps the frame's index and is named after the column.
#
df["b"]

0    400
1    500
2    600
Name: b, dtype: int64

In [4]:
# Confirm that selecting one column returns a pandas Series
type(df["b"])

pandas.core.series.Series

In [5]:
# Alternative constructor form: pass the values row by row and
# label both the columns and the rows (index) explicitly.
df = pd.DataFrame(
    [
        [100, 200, 300],
        [400, 500, 600],
        [700, 800, 900],
    ],
    columns=["a", "b", "c"],
    index=["first", "second", "third"],
)
df

Unnamed: 0,a,b,c
first,100,200,300
second,400,500,600
third,700,800,900


## Attributes

In [6]:
# Load the CSV into a DataFrame. No index column is specified,
# so the rows are labelled with a default RangeIndex.
richest = pd.read_csv("TopRichestInWorld.csv")

In [7]:
# Display the frame; for a long frame the middle rows are elided (the "..." row)
richest

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
0,Elon Musk,"$219,000,000,000",50,United States,"Tesla, SpaceX",Automotive
1,Jeff Bezos,"$171,000,000,000",58,United States,Amazon,Technology
2,Bernard Arnault & family,"$158,000,000,000",73,France,LVMH,Fashion & Retail
3,Bill Gates,"$129,000,000,000",66,United States,Microsoft,Technology
4,Warren Buffett,"$118,000,000,000",91,United States,Berkshire Hathaway,Finance & Investments
...,...,...,...,...,...,...
96,Vladimir Potanin,"$17,300,000,000",61,Russia,metals,Metals & Mining
97,Harold Hamm & family,"$17,200,000,000",76,United States,oil & gas,Energy
98,Sun Piaoyang,"$17,100,000,000",63,China,pharmaceuticals,Healthcare
99,Luo Liguo & family,"$17,000,000,000",66,China,chemicals,Manufacturing


In [8]:
richest.shape  # attribute (no parentheses): a (number of rows, number of columns) tuple

(101, 6)

In [9]:
richest.size  # total number of cells: number of rows * number of columns (101 * 6 = 606 here)

606

In [10]:
richest.index  # the row labels; read_csv was given no index column, so this is a default RangeIndex

RangeIndex(start=0, stop=101, step=1)

In [11]:
richest.columns  # the column labels, returned as an Index object

Index(['Name', 'NetWorth', 'Age', 'Country/Territory', 'Source', 'Industry'], dtype='object')

In [12]:
richest.axes  # a list of both axes: [row index (axis 0), column index (axis 1)]

[RangeIndex(start=0, stop=101, step=1),
 Index(['Name', 'NetWorth', 'Age', 'Country/Territory', 'Source', 'Industry'], dtype='object')]

In [13]:
# Per-column dtypes. NetWorth is object (not numeric) because its values
# are strings such as "$219,000,000,000"; only Age was parsed as an integer.
richest.dtypes

Name                 object
NetWorth             object
Age                   int64
Country/Territory    object
Source               object
Industry             object
dtype: object

In [14]:
# .values exposes the data as a NumPy array. The columns mix strings and
# integers, so the array falls back to dtype=object.
# NOTE(review): the pandas docs recommend DataFrame.to_numpy() over .values for new code.
richest.head(5).values

array([['Elon Musk', '$219,000,000,000', 50, 'United States',
        'Tesla, SpaceX', 'Automotive'],
       ['Jeff Bezos', '$171,000,000,000', 58, 'United States', 'Amazon',
        'Technology'],
       ['Bernard Arnault & family', '$158,000,000,000', 73, 'France',
        'LVMH', 'Fashion & Retail'],
       ['Bill Gates', '$129,000,000,000', 66, 'United States',
        'Microsoft', 'Technology'],
       ['Warren Buffett', '$118,000,000,000', 91, 'United States',
        'Berkshire Hathaway', 'Finance & Investments']], dtype=object)

## Chapter 18 - Methods

In [15]:
# A small Series with the default integer index, used to compare against the DataFrame below
series = pd.Series(data=[100, 200, 300])

In [16]:
# Display the series: index labels on the left, values on the right
series

0    100
1    200
2    300
dtype: int64

In [17]:
# Rebuild the small three-column frame (column label -> values) for the method examples
df = pd.DataFrame(
    data={"a": [100, 200, 300], "b": [400, 500, 600], "c": [700, 800, 900]},
)

In [18]:
# Display the frame to confirm its contents
df

Unnamed: 0,a,b,c
0,100,400,700
1,200,500,800
2,300,600,900


In [19]:
# Summing a Series collapses it to a single scalar
series.sum()

np.int64(600)

In [20]:
# Summing a DataFrame returns a Series: one total per column
df.sum()

a     600
b    1500
c    2400
dtype: int64

In [21]:
# With no axis argument, df.sum() uses axis=0: it adds down the rows
# (along the index), giving one total per column.
# Passing axis=0 explicitly is the same as the above.
df.sum(axis=0)

a     600
b    1500
c    2400
dtype: int64

In [25]:
# Also the same: axis="index" is the string spelling of axis=0
df.sum(axis="index")

a     600
b    1500
c    2400
dtype: int64

In [22]:
# axis=1 adds across the columns, giving one total per row (per index label)
df.sum(axis=1)

0    1200
1    1500
2    1800
dtype: int64

In [24]:
# Same as the above: axis="columns" is the string spelling of axis=1
df.sum(axis="columns")

0    1200
1    1500
2    1800
dtype: int64

## Chapter 19 - describe() and info()

In [26]:
import pandas as pd

In [33]:
# Columns to load from the IMDB CSV.
# The order of this list does not carry over to the loaded frame: the
# displayed table below lists the columns in a different order than here.
selected_columns = [
    "Series_Title",
    "IMDB_Rating",
    "No_of_Votes",
    "Gross",
    "Released_Year",
    "Genre",
]

In [75]:
# Read only the columns named in selected_columns; the rest of the CSV is skipped
movies = pd.read_csv("imdb_top_1000.csv", usecols=selected_columns)

In [76]:
# Display the loaded frame; the middle rows are elided (the "..." row)
movies

Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,No_of_Votes,Gross
0,The Shawshank Redemption,1994,Drama,9.3,2343110,28341469
1,The Godfather,1972,"Crime, Drama",9.2,1620367,134966411
2,The Dark Knight,2008,"Action, Crime, Drama",9.0,2303232,534858444
3,The Godfather: Part II,1974,"Crime, Drama",9.0,1129952,57300000
4,12 Angry Men,1957,"Crime, Drama",9.0,689845,4360000
...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,"Comedy, Drama, Romance",7.6,166544,
996,Giant,1956,"Drama, Western",7.6,34075,
997,From Here to Eternity,1953,"Drama, Romance, War",7.6,43374,30500000
998,Lifeboat,1944,"Drama, War",7.6,26471,


In [77]:
# info() prints each column's dtype and non-null count.
# Things to notice in the output: Gross has missing values (fewer non-null
# entries than rows), and both Gross and Released_Year were read as object
# rather than as numbers.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series_Title   1000 non-null   object 
 1   Released_Year  1000 non-null   object 
 2   Genre          1000 non-null   object 
 3   IMDB_Rating    1000 non-null   float64
 4   No_of_Votes    1000 non-null   int64  
 5   Gross          831 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 47.0+ KB


In [78]:
# describe() summarises only the numeric columns, so Gross (still object) is absent.
# round() with no argument rounds to 0 decimals, which is why the IMDB_Rating
# statistics collapse to 8.0 / 9.0 and its std shows as 0.0.
movies.describe().round()

Unnamed: 0,IMDB_Rating,No_of_Votes
count,1000.0,1000.0
mean,8.0,273693.0
std,0.0,327373.0
min,8.0,25088.0
25%,8.0,55526.0
50%,8.0,138548.0
75%,8.0,374161.0
max,9.0,2343110.0


### Sanitize Gross

In [79]:
# Strip the thousands separators so the Gross strings contain digits only.
# regex=False makes the comma a literal on every pandas version (the default
# for `regex` changed between pandas 1.x and 2.x).
# Missing values stay missing; they are filled in a later cell.
movies["Gross"] = movies["Gross"].str.replace(",", "", regex=False)

In [80]:
# Display the frame after stripping the commas from Gross
movies

Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,No_of_Votes,Gross
0,The Shawshank Redemption,1994,Drama,9.3,2343110,28341469
1,The Godfather,1972,"Crime, Drama",9.2,1620367,134966411
2,The Dark Knight,2008,"Action, Crime, Drama",9.0,2303232,534858444
3,The Godfather: Part II,1974,"Crime, Drama",9.0,1129952,57300000
4,12 Angry Men,1957,"Crime, Drama",9.0,689845,4360000
...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,"Comedy, Drama, Romance",7.6,166544,
996,Giant,1956,"Drama, Western",7.6,34075,
997,From Here to Eternity,1953,"Drama, Romance, War",7.6,43374,30500000
998,Lifeboat,1944,"Drama, War",7.6,26471,


In [81]:
# Reassigning the filled column works.
# NOTE(review): the original note called this "not efficient since we are making
# copies"; pandas does not promise that inplace=True avoids copies — confirm
# before relying on that as a reason to prefer inplace.
# movies["Gross"] = movies["Gross"].fillna(0)

# Generates warnings: the in-place fill is applied to a column selected from
# the frame, not to the frame itself
# movies["Gross"].fillna(0, inplace=True)

# This works: fill through the frame, using a {column: fill value} mapping.
# Note the fill value is the integer 0 while the other Gross values are still
# strings at this point; the cast to int64 happens in a later cell.
movies.fillna({"Gross": 0}, inplace=True)

In [82]:
# Display again: the previously empty Gross cells now show 0
movies

Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,No_of_Votes,Gross
0,The Shawshank Redemption,1994,Drama,9.3,2343110,28341469
1,The Godfather,1972,"Crime, Drama",9.2,1620367,134966411
2,The Dark Knight,2008,"Action, Crime, Drama",9.0,2303232,534858444
3,The Godfather: Part II,1974,"Crime, Drama",9.0,1129952,57300000
4,12 Angry Men,1957,"Crime, Drama",9.0,689845,4360000
...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,"Comedy, Drama, Romance",7.6,166544,0
996,Giant,1956,"Drama, Western",7.6,34075,0
997,From Here to Eternity,1953,"Drama, Romance, War",7.6,43374,30500000
998,Lifeboat,1944,"Drama, War",7.6,26471,0


In [83]:
# Gross now holds digit-only strings plus the integer 0 fill values,
# so the whole column can be cast to int64.
# Note: this cell changes the column in place, so the earlier .str.replace
# cell cannot be re-run afterwards without reloading the data.
movies["Gross"] = movies["Gross"].astype("int64")

In [84]:
# Display again: the values look the same as before; only the column's dtype changed
movies

Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,No_of_Votes,Gross
0,The Shawshank Redemption,1994,Drama,9.3,2343110,28341469
1,The Godfather,1972,"Crime, Drama",9.2,1620367,134966411
2,The Dark Knight,2008,"Action, Crime, Drama",9.0,2303232,534858444
3,The Godfather: Part II,1974,"Crime, Drama",9.0,1129952,57300000
4,12 Angry Men,1957,"Crime, Drama",9.0,689845,4360000
...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,"Comedy, Drama, Romance",7.6,166544,0
996,Giant,1956,"Drama, Western",7.6,34075,0
997,From Here to Eternity,1953,"Drama, Romance, War",7.6,43374,30500000
998,Lifeboat,1944,"Drama, War",7.6,26471,0


In [85]:
#
# Now that Gross is normalized to int64, describe() includes it.
# The rows that had no Gross value were filled with 0 and are counted here
# (count is the full row count and min is 0), which lowers the mean and
# quartiles compared with ignoring the missing entries.
#
movies.describe().round()

Unnamed: 0,IMDB_Rating,No_of_Votes,Gross
count,1000.0,1000.0,1000.0
mean,8.0,273693.0,56536878.0
std,0.0,327373.0,103238179.0
min,8.0,25088.0,0.0
25%,8.0,55526.0,445710.0
50%,8.0,138548.0,10702752.0
75%,8.0,374161.0,61539891.0
max,9.0,2343110.0,936662225.0


## Chapter 20 - nlargest() and nsmallest()

In [1]:
import pandas as pd

In [4]:
# Only these three columns are needed for the nlargest()/nsmallest() examples
cols = ["Series_Title", "IMDB_Rating", "No_of_Votes"]

In [5]:
# Reload the IMDB data, keeping just the columns listed in cols
# (this rebinds `movies`, replacing the frame used in the previous chapter)
movies = pd.read_csv("imdb_top_1000.csv", usecols=cols)

In [6]:
# Display the three loaded columns
movies

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
0,The Shawshank Redemption,9.3,2343110
1,The Godfather,9.2,1620367
2,The Dark Knight,9.0,2303232
3,The Godfather: Part II,9.0,1129952
4,12 Angry Men,9.0,689845
...,...,...,...
995,Breakfast at Tiffany's,7.6,166544
996,Giant,7.6,34075
997,From Here to Eternity,7.6,43374
998,Lifeboat,7.6,26471


In [7]:
#
# nlargest by number of votes: the 10 rows with the highest No_of_Votes,
# returned in descending order; the original row labels are kept
#
movies.nlargest(n=10, columns=["No_of_Votes"])

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
0,The Shawshank Redemption,9.3,2343110
2,The Dark Knight,9.0,2303232
8,Inception,8.8,2067042
9,Fight Club,8.8,1854740
6,Pulp Fiction,8.9,1826188
11,Forrest Gump,8.8,1809221
14,The Matrix,8.7,1676426
10,The Lord of the Rings: The Fellowship of the Ring,8.8,1661481
5,The Lord of the Rings: The Return of the King,8.9,1642758
1,The Godfather,9.2,1620367


In [8]:
#
# largest by multiple columns: rows are ranked by IMDB_Rating first, and
# No_of_Votes only decides the order among rows with the same rating
# (e.g. the 8.9-rated films below are ordered by their vote counts)
#
movies.nlargest(n=10, columns=["IMDB_Rating", "No_of_Votes"])

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
0,The Shawshank Redemption,9.3,2343110
1,The Godfather,9.2,1620367
2,The Dark Knight,9.0,2303232
3,The Godfather: Part II,9.0,1129952
4,12 Angry Men,9.0,689845
6,Pulp Fiction,8.9,1826188
5,The Lord of the Rings: The Return of the King,8.9,1642758
7,Schindler's List,8.9,1213505
8,Inception,8.8,2067042
9,Fight Club,8.8,1854740


In [12]:
# nsmallest is the opposite: lowest IMDB_Rating first, and among rows
# with the same rating the ones with the fewest No_of_Votes come first
movies.nsmallest(n=10, columns=["IMDB_Rating", "No_of_Votes"])

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
989,The Long Goodbye,7.6,26337
998,Lifeboat,7.6,26471
971,Omohide poro poro,7.6,27071
981,On Golden Pond,7.6,27650
880,Nelyubov,7.6,29765
990,Giù la testa,7.6,30144
920,The Secret of Kells,7.6,31779
984,The Muppet Movie,7.6,32802
986,Watership Down,7.6,33656
996,Giant,7.6,34075


In [13]:
# The ten rows with the fewest votes, whatever their rating
movies.nsmallest(n=10, columns=["No_of_Votes"])

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
264,Ba wang bie ji,8.1,25088
721,God's Own Country,7.7,25198
694,La planète sauvage,7.8,25229
718,Scarface: The Shame of the Nation,7.8,25312
570,Raazi,7.8,25344
785,The Magdalene Sisters,7.7,25938
989,The Long Goodbye,7.6,26337
169,Dom za vesanje,8.2,26402
814,Do lok tin si,7.7,26429
863,Cape Fear,7.7,26457


### The confusing keep=

In [18]:
# Top ten by rating alone. Several films share the rating at the cut-off,
# so which of the tied rows make it into the ten depends on keep=
movies.nlargest(n=10, columns=["IMDB_Rating"])

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
0,The Shawshank Redemption,9.3,2343110
1,The Godfather,9.2,1620367
2,The Dark Knight,9.0,2303232
3,The Godfather: Part II,9.0,1129952
4,12 Angry Men,9.0,689845
5,The Lord of the Rings: The Return of the King,8.9,1642758
6,Pulp Fiction,8.9,1826188
7,Schindler's List,8.9,1213505
8,Inception,8.8,2067042
9,Fight Club,8.8,1854740


By default, when there are duplicate values (ties), pandas keeps the first occurrences. The documentation says:

```
keep
{‘first’, ‘last’, ‘all’}, default ‘first’
Where there are duplicate values:

first : prioritize the first occurrence(s)

last : prioritize the last occurrence(s)

all : keep all the ties of the smallest item even if it means selecting more than n items.

```


In [20]:
# keep="all" retains every row tied with the last selected value,
# so the result may contain more than n rows
movies.nlargest(n=10, columns=["IMDB_Rating"], keep="all")

Unnamed: 0,Series_Title,IMDB_Rating,No_of_Votes
0,The Shawshank Redemption,9.3,2343110
1,The Godfather,9.2,1620367
2,The Dark Knight,9.0,2303232
3,The Godfather: Part II,9.0,1129952
4,12 Angry Men,9.0,689845
5,The Lord of the Rings: The Return of the King,8.9,1642758
6,Pulp Fiction,8.9,1826188
7,Schindler's List,8.9,1213505
8,Inception,8.8,2067042
9,Fight Club,8.8,1854740
