# Pandas

In [1]:
import pandas as pd
import numpy as np

## Series

### Basic Init

a. Using Lists

In [2]:
# Build a labelled Series from a plain Python list.
# `index` is optional: when it is omitted pandas assigns the default
# integer labels 0 .. n-1.
s = pd.Series(
    data=["India", "USA", "Russia", "Norway", "China", "UK"],
    index=list("abcdef"),
    name="Numbers",
)

In [3]:
# A bare expression on the last line of a cell is echoed as the cell output.
s

a     India
b       USA
c    Russia
d    Norway
e     China
f        UK
Name: Numbers, dtype: object

b. Using a dictionary

In [4]:
# Build a Series from a mapping: the keys become the index labels and the
# values become the data.
d = pd.Series(
    dict(x=9, y=8, z=7),
    name='bruh',
)

In [5]:
# The dict keys show up as index labels, the dict values as the data.
d

x    9
y    8
z    7
Name: bruh, dtype: int64

c. Using a CSV file

In [6]:
# Read data/nile.csv and keep only its 'Flood' column.
# NOTE: despite its name, `df` is a Series here (a single column was
# selected), not a DataFrame.
df = pd.read_csv("data/nile.csv").loc[:, 'Flood']
df

0       9.9974
1      10.5556
2       9.9014
3      11.4800
4      12.8460
        ...   
565    11.3840
566     9.8630
567    10.8450
568    10.4020
569    10.3060
Name: Flood, Length: 570, dtype: float64

In [7]:
# head() previews the start of the Series (the first five rows here).
df.head()

0     9.9974
1    10.5556
2     9.9014
3    11.4800
4    12.8460
Name: Flood, dtype: float64

d. Using another Series

In [8]:
# Constructing from an existing Series with index= keeps only the listed
# labels; the matching values and the name "Numbers" carry over.
p = pd.Series(s,index=['a','c','d','f'])

In [9]:
# Only the four requested labels (a, c, d, f) remain.
p

a     India
c    Russia
d    Norway
f        UK
Name: Numbers, dtype: object

### Basic properties

In [10]:
# The label that was supplied through name= when the Series was built.
s.name

'Numbers'

In [11]:
# The string values are stored with the generic object dtype, shown as 'O'.
s.dtype

dtype('O')

In [12]:
# The underlying data as a NumPy array, without the index labels.
s.values

array(['India', 'USA', 'Russia', 'Norway', 'China', 'UK'], dtype=object)

In [13]:
# The Index object holding the row labels.
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [14]:
# Frequency of each distinct value; every country occurs exactly once here.
s.value_counts()

India     1
USA       1
Russia    1
Norway    1
China     1
UK        1
Name: Numbers, dtype: int64

### Accessing & Modifying data

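When it comes to accessing and modifying data, a Series behaves much like a dict keyed by its index labels; use `.iloc` when you want to address elements by integer position instead.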

In [15]:
# Label-based lookup, just like reading a dict key.
s['a'] 

'India'

In [16]:
# Position-based lookup: position 1 is the second element (label 'y').
d.iloc[1]

8

We can select multiple elements at once by passing a list of labels (or a list of positions to `.iloc`).

In [17]:
# A list of labels returns a Series containing just those entries.
s[['a','c','f']]

a     India
c    Russia
f        UK
Name: Numbers, dtype: object

In [18]:
# A list of integer positions works the same way through .iloc.
df.iloc[[60, 61, 63, 69]]


60    9.2280
61    9.5736
63    9.0922
69    8.9578
Name: Flood, dtype: float64

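Slicing also works. Note that a label slice includes both endpoints, whereas a positional (`.iloc`) slice excludes the stop position, as in plain Python.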

In [19]:
# Label slices include BOTH endpoints: 'b' through 'e' yields four rows.
s['b':'e']

b       USA
c    Russia
d    Norway
e     China
Name: Numbers, dtype: object

In [20]:
# Positional slices exclude the stop, as in plain Python: rows 50-53 only.
df.iloc[50:54]

50     9.0730
51     9.2664
52     9.7670
53    10.3060
Name: Flood, dtype: float64

We can also use conditional selection with boolean arrays

In [21]:
# An element-wise comparison yields a boolean Series that shares d's index.
d>8

x     True
y    False
z    False
Name: bruh, dtype: bool

In [22]:
# Indexing with a boolean Series keeps only the entries where the mask is True.
d[d>=8]

x    9
y    8
Name: bruh, dtype: int64

In [23]:
# Assign through a label; this mutates d in place.
d['x'] = 10
d

x    10
y     8
z     7
Name: bruh, dtype: int64

In [24]:
# Assign through an integer position (position 2 is label 'z'); mutates d in place.
d.iloc[2] = 12
d

x    10
y     8
z    12
Name: bruh, dtype: int64

In [25]:
# Masked assignment. The right-hand side is the full Series d-5; pandas
# aligns it on index labels, so only the entries selected by the mask are
# overwritten (x: 10 -> 5, z: 12 -> 7) while y is left untouched.
# NOTE: this cell is not idempotent; re-running it without re-running the
# previous assignments gives a different result because d is mutated in place.
d[d >= 10] = d-5
d

x    5
y    8
z    7
Name: bruh, dtype: int64

### Functions

In [26]:
# Arithmetic mean of the Flood values (df is still the Nile Series here).
df.mean()

10.780632631578943

In [27]:
# Standard deviation of the Flood values; pandas uses the sample formula
# (normalised by N-1, i.e. ddof=1) unless told otherwise.
df.std()

0.8836778665316819

In [28]:
# NumPy ufuncs apply element-wise and return a Series that keeps the
# original index and name.
np.log(df)


0      2.302325
1      2.356657
2      2.292676
3      2.440606
4      2.553032
         ...   
565    2.432209
566    2.288790
567    2.383704
568    2.341998
569    2.332726
Name: Flood, Length: 570, dtype: float64

## DataFrames

### Basic init

a. Using CSV

In [29]:
# NOTE: this rebinds `df`. Until now `df` was the Flood Series read from
# data/nile.csv; from here on it is the DataFrame loaded from
# data/oscar_age_male.csv.
df = pd.read_csv("data/oscar_age_male.csv")
df

Unnamed: 0,Index,Year,Age,Name,Movie
0,1,1928,44,Emil Jannings,"The Last Command, The Way of All Flesh"
1,2,1929,41,Warner Baxter,In Old Arizona
2,3,1930,62,George Arliss,Disraeli
3,4,1931,53,Lionel Barrymore,A Free Soul
4,5,1932,47,Wallace Beery,The Champ
...,...,...,...,...,...
84,85,2012,39,Jean Dujardin,The Artist
85,86,2013,55,Daniel Day-Lewis,Lincoln
86,87,2014,44,Matthew McConaughey,Dallas Buyers Club
87,88,2015,33,Eddie Redmayne,The Theory of Everything\t


b. Using a dict of {column:list}

In [30]:
# Build a DataFrame from a dict that maps each column name to the list of
# values for that column. `columns=` pins the column order explicitly.
sdf = pd.DataFrame(
    dict(
        Name=['Hardik', 'John', 'Mark', 'Steve', 'Ak', 'priti', 'Tanmay'],
        Score=[1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        Continent=['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    ),
    columns=['Name', 'Score', 'Continent'],
)
sdf

Unnamed: 0,Name,Score,Continent
0,Hardik,1785387,America
1,John,2833687,Europe
2,Mark,3874437,Europe
3,Steve,2167744,Europe
4,Ak,4602367,Asia
5,priti,2950039,Europe
6,Tanmay,17348075,America


In [31]:
# Reuse the Name values as row labels. This mutates sdf in place and leaves
# the Name column where it is, so the names now appear twice until the
# column is deleted.
sdf.index = sdf['Name'].values

In [32]:
# The names now appear both as the row labels and as the Name column.
sdf

Unnamed: 0,Name,Score,Continent
Hardik,Hardik,1785387,America
John,John,2833687,Europe
Mark,Mark,3874437,Europe
Steve,Steve,2167744,Europe
Ak,Ak,4602367,Asia
priti,priti,2950039,Europe
Tanmay,Tanmay,17348075,America


In [33]:
# Drop the now-redundant Name column in place; the names live on as the index.
# NOTE: re-running this cell raises KeyError because the column is already gone.
del sdf['Name']

In [34]:
# Only Score and Continent remain as columns; rows are labelled by name.
sdf

Unnamed: 0,Score,Continent
Hardik,1785387,America
John,2833687,Europe
Mark,3874437,Europe
Steve,2167744,Europe
Ak,4602367,Asia
priti,2950039,Europe
Tanmay,17348075,America


In [35]:
# Optional: uncomment to replace the name labels with integer labels 0 .. n-1.
# Left disabled because later cells look rows up by name (e.g. sdf.loc['Steve']).
# sdf.index = np.arange(sdf.shape[0])


In [36]:
# sdf

In [37]:
# size is the total number of cells: 89 rows * 5 columns = 445.
df.size

445

In [38]:
# shape is the (rows, columns) tuple.
df.shape

(89, 5)

In [39]:
# Summary statistics; only the numeric columns (Index, Year, Age) are
# included in the result.
df.describe()

Unnamed: 0,Index,Year,Age
count,89.0,89.0,89.0
mean,45.0,1972.0,43.876404
std,25.836021,25.836021,8.818653
min,1.0,1928.0,29.0
25%,23.0,1950.0,38.0
50%,45.0,1972.0,42.0
75%,67.0,1994.0,49.0
max,89.0,2016.0,76.0


In [40]:
# Per-column dtypes: the text columns Name and Movie are stored as object.
df.dtypes

Index     int64
Year      int64
Age       int64
Name     object
Movie    object
dtype: object

### Indexing and Slicing

In [41]:
# .loc selects by index label; a single label returns that row as a Series.
sdf.loc['Steve']

Score        2167744
Continent     Europe
Name: Steve, dtype: object

In [42]:
# .iloc selects by integer position; position 2 is the third row (Mark).
sdf.iloc[2]

Score        3874437
Continent     Europe
Name: Mark, dtype: object

In [43]:
# Negative positions count from the end: -1 is the last row (Tanmay).
sdf.iloc[-1]

Score        17348075
Continent     America
Name: Tanmay, dtype: object

In [44]:
# Square brackets with a single column name pull out that column as a Series.
ss = sdf['Continent']

In [45]:
# The raw column values as a NumPy array, without the name labels.
ss.values

array(['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe',
       'America'], dtype=object)

In [46]:
# df is the DataFrame loaded from data/oscar_age_male.csv (not the earlier
# Nile Series that used the same name).
df

Unnamed: 0,Index,Year,Age,Name,Movie
0,1,1928,44,Emil Jannings,"The Last Command, The Way of All Flesh"
1,2,1929,41,Warner Baxter,In Old Arizona
2,3,1930,62,George Arliss,Disraeli
3,4,1931,53,Lionel Barrymore,A Free Soul
4,5,1932,47,Wallace Beery,The Champ
...,...,...,...,...,...
84,85,2012,39,Jean Dujardin,The Artist
85,86,2013,55,Daniel Day-Lewis,Lincoln
86,87,2014,44,Matthew McConaughey,Dallas Buyers Club
87,88,2015,33,Eddie Redmayne,The Theory of Everything\t


In [47]:
# A list of column names returns a DataFrame restricted to those columns.
df[['Year','Movie']]

Unnamed: 0,Year,Movie
0,1928,"The Last Command, The Way of All Flesh"
1,1929,In Old Arizona
2,1930,Disraeli
3,1931,A Free Soul
4,1932,The Champ
...,...,...
84,2012,The Artist
85,2013,Lincoln
86,2014,Dallas Buyers Club
87,2015,The Theory of Everything\t


In [48]:
# Label slicing with .loc includes the stop label: rows 5 through 10 (six rows).
df.loc[5:10]

Unnamed: 0,Index,Year,Age,Name,Movie
5,6,1933,35,Fredric March,Dr. Jekyll and Mr. Hyde
6,7,1934,34,Charles Laughton,The Private Life of Henry VIII
7,8,1935,34,Clark Gable,It Happened One Night
8,9,1936,49,Victor McLaglen,The Informer\t
9,10,1937,41,Paul Muni,The Story of Louis Pasteur\t
10,11,1938,37,Spencer Tracy,Captains Courageous


In [55]:
# Row selection and column selection combined in one .loc call:
# row labels 6 through 10 (inclusive), restricted to the Age and Name columns.
a = df.loc[6:10, ['Age', 'Name']]
a


Unnamed: 0,Age,Name
6,34,Charles Laughton
7,34,Clark Gable
8,49,Victor McLaglen
9,41,Paul Muni
10,37,Spencer Tracy


In [54]:
# Selecting with a list of columns yields a DataFrame, not a Series.
type(a)

pandas.core.frame.DataFrame

In [56]:
# A boolean mask inside .loc keeps only the rows whose Age exceeds 40.
df.loc[df['Age'] > 40]


Unnamed: 0,Index,Year,Age,Name,Movie
0,1,1928,44,Emil Jannings,"The Last Command, The Way of All Flesh"
1,2,1929,41,Warner Baxter,In Old Arizona
2,3,1930,62,George Arliss,Disraeli
3,4,1931,53,Lionel Barrymore,A Free Soul
4,5,1932,47,Wallace Beery,The Champ
8,9,1936,49,Victor McLaglen,The Informer\t
9,10,1937,41,Paul Muni,The Story of Louis Pasteur\t
15,16,1943,43,James Cagney,Yankee Doodle Dandy\t
16,17,1944,48,Paul Lukas,Watch on the Rhine\t
17,18,1945,41,Bing Crosby,Going My Way
