In [13]:
# Generating dataframe

import pandas as pd
import numpy as np

# Take a 2D array as input to your DataFrame 
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(my_2darray)
print("---")

# Take a dictionary as input to your DataFrame 
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print(pd.DataFrame(my_dict))
print("---")

# Take a DataFrame as input to your DataFrame 
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print(my_df)
print("---")

# Take a Series as input to your DataFrame
my_series = pd.Series({"Belgium":"Brussels", "India":"New Delhi", "United Kingdom":"London", "United States":"Washington"})
print(pd.DataFrame(my_series))

[[1 2 3]
 [4 5 6]]
---
   1  2  3
0  1  1  2
1  3  2  4
---
   A
0  4
1  5
2  6
3  7
---
                         0
Belgium           Brussels
India            New Delhi
United Kingdom      London
United States   Washington


In [11]:
# Getting shape of a dataframe

df = pd.DataFrame(np.array([[4, 5, 6, 7], [4, 5, 6, 7], [4, 5, 6, 7]]),\
                          columns=np.array(["a", "b", "c", "d"]))

# Use the `shape` property
print(df.shape)

# Or use the `len()` function with the `index` property
for i in df.index:
    print(df.iloc[i])
    print("-----")
print('columns: ' + str(df.columns))
print('length of rows' + str(len(df.index)))

(3, 4)
a    4
b    5
c    6
d    7
Name: 0, dtype: int64
-----
a    4
b    5
c    6
d    7
Name: 1, dtype: int64
-----
a    4
b    5
c    6
d    7
Name: 2, dtype: int64
-----
columns: Index(['a', 'b', 'c', 'd'], dtype='object')
length of rows3


In [24]:
# Fetching data from DataFrame with iloc, loc, at

df = pd.DataFrame(data=["apple",3,"orange",5], columns=['A'], index=range(4))
# Using `iloc[]`
print(df.iloc[1][0])

# Using `loc[]`
print(df.loc[0]['A'])

# Using `at[]`
print(df.at[2,'A'])

# Using `iat[]`
print(df.iat[3,0])

3
apple
orange
5


In [28]:
# Fetching data from DataFrame with ":"

df = pd.DataFrame(data=["apple",3,"orange",5], columns=['A'], index=range(4))
# Use `iloc[]` to select row `0`
print(df.iloc[0])

# Use `loc[]` to select column `'A'`
print(df.loc[:, 'A'])

# Use `loc[]` to select column `'A'`
print(df.loc[1:, 'A'])

A    apple
Name: 0, dtype: object
0     apple
1         3
2    orange
3         5
Name: A, dtype: object
1         3
2    orange
3         5
Name: A, dtype: object


In [40]:
# set_index

df = pd.DataFrame(data=np.array([["apple",3,"orange",5], ['r', 's', 'b', 'x']]), columns=['A', 'B', 'C', 'D'], index=range(2))
print(df)
print("---")
df = df.set_index('C')
print(df)

       A  B       C  D
0  apple  3  orange  5
1      r  s       b  x
---
            A  B  D
C                  
orange  apple  3  5
b           r  s  x


In [45]:
# set_index, loc, iloc

df = pd.DataFrame(data=np.array([["apple",3,"orange",5], ['r', 's', 'b', 'x']]), columns=['A', 'B', 'C', 'D'])
df = df.set_index('C')
print(df.loc['b'])
print("---")
print(df.iloc[0])

A    r
B    s
D    x
Name: b, dtype: object
---
A    apple
B        3
D        5
Name: orange, dtype: object


In [55]:
# Adding data

df = pd.DataFrame(data=np.array([["apple",3,"orange",5], ['r', 's', 'b', 'x']]), columns=['A', 'B', 'C', 'D'])
df = df.set_index('C')
df.loc["newRow"] = ["e", "b", "a"]
print(df)

            A  B  D
C                  
orange  apple  3  5
b           r  s  x
newRow      e  b  a


In [56]:
# Adding column

df = pd.DataFrame(data=np.array([["apple",3,"orange",5], ['r', 's', 'b', 'x']]), columns=['A', 'B', 'C', 'D'])
df = df.set_index('C')
df['e'] = ["foo", "bar"]
print(df)

            A  B  D    e
C                       
orange  apple  3  5  foo
b           r  s  x  bar


In [60]:
# Adding column with loc

df = pd.DataFrame(np.array([[1,2,3,4], [5,6,7,8]]), columns=['A','B','C','D'])
print(df)
print("---")
df.loc[:, 'E'] = ['x', 'y']
print(df)

   A  B  C  D
0  1  2  3  4
1  5  6  7  8
---
   A  B  C  D  E
0  1  2  3  4  x
1  5  6  7  8  y


In [64]:
# Resetting index
df = pd.DataFrame(np.array([[1,2,3,4], [5,6,7,8]]), columns=['A','B','C','D'], index=[1.2, 2.5])

# Check out the weird index of your dataframe
print(df)
print("---")
# Use `reset_index()` to reset the values. 
print(df.reset_index(level=0, drop=True))
print("---")
print(df.reset_index(level=0, drop=False))


     A  B  C  D
1.2  1  2  3  4
2.5  5  6  7  8
---
   A  B  C  D
0  1  2  3  4
1  5  6  7  8
---
   index  A  B  C  D
0    1.2  1  2  3  4
1    2.5  5  6  7  8


In [65]:
# Dropping column
df = pd.DataFrame(np.array([[1,2,3,4], [5,6,7,8]]), columns=['A','B','C','D'])

# Check out the DataFrame `df`
print(df)
print("---")

# Drop the column with label 'A'                  
df.drop('A', axis=1, inplace=True) # 行の場合はaxis = 0, inplaceにTrueをセットすることで対象自体を変更できる
print(df)
print("---")

# Drop the column at position 1
print(df.drop(df.columns[[1]], axis=1))
print("---")
print(df)

   A  B  C  D
0  1  2  3  4
1  5  6  7  8
---
   B  C  D
0  2  3  4
1  6  7  8
---
   B  D
0  2  4
1  6  8
---
   B  C  D
0  2  3  4
1  6  7  8


In [76]:
# Removing duplicates

df = pd.DataFrame(np.array([[1,2,3,4], [1,2,3,4], [2,2,3,4]]), columns=['A','B','C','D'])

# Check out your DataFrame `df`
print(df)

# Drop the duplicates in `df`
df.drop_duplicates(['A'], keep='last') #keep = 'last'の場合、最後のものをのこす

   A  B  C  D
0  1  2  3  4
1  1  2  3  4
2  2  2  3  4


Unnamed: 0,A,B,C,D
1,1,2,3,4
2,2,2,3,4


In [79]:
# Renaming columns or index

df = pd.DataFrame(np.array([[1,2,3], [2,3,4], [2,2,3]]), columns=['A','B','C'])

# Check out your DataFrame `df`
print(df)

# Define the new names of your columns
newcols = {
    'A': 'new_column_1', 
    'B': 'new_column_2', 
    'C': 'new_column_3'
}

# Use `rename()` to rename your columns
df.rename(columns=newcols, inplace=True)
display(df)

# Use `rename()` to your index
display(df.rename(index={1: 'a'}))

   A  B  C
0  1  2  3
1  2  3  4
2  2  2  3


Unnamed: 0,new_column_1,new_column_2,new_column_3
0,1,2,3
1,2,3,4
2,2,2,3


Unnamed: 0,new_column_1,new_column_2,new_column_3
0,1,2,3
a,2,3,4
2,2,2,3


In [83]:
# Replacing

df = pd.DataFrame(np.array([[1,2,3], [2,3,4], [2,2,3]]), columns=['A','B','C'])

# Check out your DataFrame `df`
display(df)
display(df.replace([1, 2],['x', 'y']))


Unnamed: 0,A,B,C
0,1,2,3
1,2,3,4
2,2,2,3


Unnamed: 0,A,B,C
0,x,y,3
1,y,3,4
2,y,y,3


In [86]:
# Replacing with regex

df = pd.DataFrame(np.array([["a",2,3], ["b",3,4], ["c","q",3]]), columns=['A','B','C'])

# Check out your DataFrame `df`
display(df)
display(df.replace({'[a-c]':'#'}, regex=True))

Unnamed: 0,A,B,C
0,a,2,3
1,b,3,4
2,c,q,3


Unnamed: 0,A,B,C
0,#,2,3
1,#,3,4
2,#,q,3


In [91]:
# Removing unwanted parts

df = pd.DataFrame(np.array([["-1a",2,3], ["-2b",3,4], ["+3c","q",3]]), columns=['A','B','C'])
display(df)
df['A'] = df['A'].map(lambda x: x.lstrip('+-'))
display(df)

Unnamed: 0,A,B,C
0,-1a,2,3
1,-2b,3,4
2,+3c,q,3


Unnamed: 0,A,B,C
0,1a,2,3
1,2b,3,4
2,3c,q,3


In [164]:
# Splitting data

df = pd.DataFrame(np.array([["- 1a",2,3], ["- 2b",3,4], ["+ 3c","q",3]]), columns=['A','B','C'])

# Inspect your DataFrame `df`
display(df)

# Split out the two values in the third row
# Make it a Series
# Stack the values
a_series = df['A'].str.split(' ').apply(pd.Series)
a_series.rename(columns={0:"E", 1:"F"}, inplace=True)

# Get rid of the stack:
# Drop the level to line up with the DataFrame
#a_series.index = a_series.index.droplevel(-1)
display(a_series)

# Delete the `Ticket` column from your DataFrame
del df['A']

# Join the `ticketdf` DataFrame to `df`
df = df.join(pd.DataFrame(data=a_series))

# Check out the new `df`
display(df)

Unnamed: 0,A,B,C
0,- 1a,2,3
1,- 2b,3,4
2,+ 3c,q,3


Unnamed: 0,E,F
0,-,1a
1,-,2b
2,+,3c


Unnamed: 0,B,C,E,F
0,2,3,-,1a
1,3,4,-,2b
2,q,3,+,3c


In [170]:
# apply function to specified row and columns

doubler = lambda x : x*2

df = pd.DataFrame(data=[[1,2,3], [4,5,6], [7,8,9]], columns=['A', 'B', 'C'])
display(df)
display(df.applymap(doubler))

display(pd.DataFrame(df['B'].apply(doubler)))
display(pd.DataFrame(df.loc[1].apply(doubler)))
display()
df.loc[1] = df.loc[1].apply(doubler)
display(df)
df.loc[:, 'A'] = df.A.apply(doubler)
display(df)

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


Unnamed: 0,A,B,C
0,2,4,6
1,8,10,12
2,14,16,18


Unnamed: 0,B
0,4
1,10
2,16


Unnamed: 0,1
A,8
B,10
C,12


Unnamed: 0,A,B,C
0,1,2,3
1,8,10,12
2,7,8,9


Unnamed: 0,A,B,C
0,2,2,3
1,16,10,12
2,14,8,9


In [175]:
# empty dataframe
display(pd.DataFrame(np.nan, columns=['A'], index=range(4)))

Unnamed: 0,A
0,
1,
2,
3,


In [201]:
# create new derived table out of original one
df = pd.DataFrame([["a","foo",3, 100],["a","bar",6,200],["b","baz",9,300],["b", "bar", 10,400]], columns=['A', 'B', 'C', 'D'])
display(df)
display(df.pivot(columns='B', index='A'))
display(df.pivot(columns='B', index='A', values='D'))

Unnamed: 0,A,B,C,D
0,a,foo,3,100
1,a,bar,6,200
2,b,baz,9,300
3,b,bar,10,400


Unnamed: 0_level_0,C,C,C,D,D,D
B,bar,baz,foo,bar,baz,foo
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,6.0,,3.0,200.0,,100.0
b,10.0,9.0,,400.0,300.0,


B,bar,baz,foo
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,200.0,,100.0
b,400.0,300.0,


In [208]:
# if you can't ensure uniquness of data, you can use pivot_table
# aggfunc argument is used to aggregate values.
# default aggfunc is 'mean'
df = pd.DataFrame([["a","foo",3, 100],["a","bar",6,200],["b","bar",9,300],["b", "bar", 10,400]], columns=['A', 'B', 'C', 'D'])
display(df)
display(df.pivot_table(index=['A', 'B']))
display(df.pivot_table(index=['A', 'B'], aggfunc='sum'))

Unnamed: 0,A,B,C,D
0,a,foo,3,100
1,a,bar,6,200
2,b,bar,9,300
3,b,bar,10,400


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
a,bar,6.0,200.0
a,foo,3.0,100.0
b,bar,9.5,350.0


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
a,bar,6,200
a,foo,3,100
b,bar,19,700


In [210]:
# Reshape Your DataFrame With melt()
# The `people` DataFrame
people = pd.DataFrame({'FirstName' : ['John', 'Jane'],
                       'LastName' : ['Doe', 'Austen'],
                       'BloodType' : ['A-', 'B+'],
                       'Weight' : [90, 64]})

display(people)
# Use `melt()` on the `people` DataFrame
display(pd.melt(people, id_vars=['FirstName', 'LastName'], var_name='measurements'))

Unnamed: 0,BloodType,FirstName,LastName,Weight
0,A-,John,Doe,90
1,B+,Jane,Austen,64


Unnamed: 0,FirstName,LastName,measurements,value
0,John,Doe,BloodType,A-
1,Jane,Austen,BloodType,B+
2,John,Doe,Weight,90
3,Jane,Austen,Weight,64


In [224]:
# Iterating data

df = pd.DataFrame([["a","foo",3, 100],["a","bar",6,200],["b","bar",9,300],["b", "bar", 10,400]], columns=['A', 'B', 'C', 'D'])

for index, row in df.iterrows():
    display(row['B'])

'foo'

'bar'

'bar'

'bar'