In [1]:
import pandas as pd
import numpy as np

# 1: Print DataFrame in Markdown-friendly format

In [2]:
# Small two-column frame used for the Markdown rendering examples
df = pd.DataFrame(dict(a=[1, 2, 3, 4], b=[5, 6, 7, 8]))

# The `index` flag controls whether the index column is included in the table.
# to_markdown() returns the table as a string, so it is printed to show it
# as plain Markdown text.
print(df.to_markdown(index=True))

|    |   a |   b |
|---:|----:|----:|
|  0 |   1 |   5 |
|  1 |   2 |   6 |
|  2 |   3 |   7 |
|  3 |   4 |   8 |


In [3]:
# Output the table using a tabulate table format (here: "grid")
print(df.to_markdown(tablefmt="grid", index=True))

+----+-----+-----+
|    |   a |   b |
+====+=====+=====+
|  0 |   1 |   5 |
+----+-----+-----+
|  1 |   2 |   6 |
+----+-----+-----+
|  2 |   3 |   7 |
+----+-----+-----+
|  3 |   4 |   8 |
+----+-----+-----+


In [4]:
# To write the Markdown table to a file, pass the file name as the first
# parameter. When a target is given, to_markdown() writes to it and returns
# None, so the call is not wrapped in print() -- that would only print "None".
df.to_markdown("README.md", tablefmt="grid", index=True)


# 2: Group rows into a list

In [5]:
# Sample frame: col2 is the grouping key, col1 is numeric, col3 holds labels
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4, 3],
        col2=list("aabbc"),
        col3=list("defgh"),
    )
)
df

Unnamed: 0,col1,col2,col3
0,1,a,d
1,2,a,e
2,3,b,f
3,4,b,g
4,3,c,h


In [6]:
# Group by col2: average col1 and collect the col3 values of each group
# into a Python list
df.groupby(["col2"]).agg({
    "col1": "mean",
    "col3": lambda group_values: list(group_values),
})

Unnamed: 0_level_0,col1,col3
col2,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.5,"[d, e]"
b,3.5,"[f, g]"
c,3.0,[h]


# 3: DataFrame.explode()

In [7]:
# Column "a" holds comma-separated values packed into a single string
df = pd.DataFrame(dict(a=["1,2", "4,5"], b=[11, 13]))
df

Unnamed: 0,a,b
0,12,11
1,45,13


In [8]:
# Split each comma-separated string in column "a" into a list of strings
df["a"] = df["a"].str.split(",")
df

Unnamed: 0,a,b
0,"[1, 2]",11
1,"[4, 5]",13


In [9]:

# Give every list element of column "a" its own row. With ignore_index=False
# the original index labels are repeated for the rows created from one list.
df.explode("a", ignore_index=False)

Unnamed: 0,a,b
0,1,11
0,2,11
1,4,13
1,5,13


# 4: DataFrame.copy()

In [10]:
# Original frame whose values the following cells try to preserve
df = pd.DataFrame(dict(col1=[1, 2, 3], col2=[4, 5, 6]))
df

Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6


In [11]:
df2 = df  # NOT a copy: `=` only binds a second name to the same DataFrame
df2["col1"] = [7, 8, 9]
df  # df also changes, because df and df2 refer to the same object

Unnamed: 0,col1,col2
0,7,4
1,8,5
2,9,6


In [12]:
# Recreate df so it holds the original values again
df = pd.DataFrame(dict(col1=[1, 2, 3], col2=[4, 5, 6]))

# copy() produces an independent DataFrame, so editing df3 leaves df untouched
df3 = df.copy()
df3["col1"] = [7, 8, 9]
df

Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6


# 5: groupby().count() vs groupby().size()

In [13]:
# Two categorical columns used to compare count() and size()
df = pd.DataFrame(
    dict(
        col1=list("abbccd"),
        col2=list("SSMLLL"),
    )
)
df

Unnamed: 0,col1,col2
0,a,S
1,b,S
2,b,M
3,c,L
4,c,L
5,d,L


In [14]:
# count() returns, for every group, the number of non-null values in each
# remaining column (here only col2)
df.groupby(["col1"]).count()

Unnamed: 0_level_0,col2
col1,Unnamed: 1_level_1
a,1
b,2
c,2
d,1


In [15]:
# size() returns the number of rows in each group as a Series; here the
# groups are the (col1, col2) combinations that occur in the data
df.groupby(["col1", "col2"]).size()

col1  col2
a     S       1
b     M       1
      S       1
c     L       2
d     L       1
dtype: int64

# 6: Correlation

In [16]:
# Two frames with identical column labels, to be correlated column by column
df1 = pd.DataFrame(dict(a=[1, 2, 3, 4], b=[2, 3, 4, 6]))
df2 = pd.DataFrame(dict(a=[1, 2, 3, 3], b=[2, 2, 5, 4]))

In [17]:
# Display the first frame
df1

Unnamed: 0,a,b
0,1,2
1,2,3
2,3,4
3,4,6


In [18]:
# Display the second frame
df2

Unnamed: 0,a,b
0,1,2
1,2,2
2,3,5
3,3,4


In [19]:
# Correlate each column of df1 with the same-named column of df2
df1.corrwith(df2)

a    0.94388
b    0.68313
dtype: float64

# 7: Cross-Tabulation

In [20]:
# Friendship pairs, built by pairing up the two name lists position by position
network = list(zip(
    ["Ben", "Ben", "Warren", "Warren", "Smith"],
    ["Smith", "Patrick", "Jone", "Smith", "Patrick"],
))

In [21]:
# Create a dataframe of the network: one row per pair, with the two names
# stored in the person1 and person2 columns
friends1 = pd.DataFrame(
    network, columns=["person1", "person2"]
)
friends1

Unnamed: 0,person1,person2
0,Ben,Smith
1,Ben,Patrick
2,Warren,Jone
3,Warren,Smith
4,Smith,Patrick


In [22]:
# Build the same pairs with the column labels swapped, so that each
# friendship is also recorded in the reverse direction
friends2 = pd.DataFrame(
    network, columns=["person2", "person1"]
)
friends2

Unnamed: 0,person2,person1
0,Ben,Smith
1,Ben,Patrick
2,Warren,Jone
3,Warren,Smith
4,Smith,Patrick


In [23]:
# Create a symmetric dataframe: stack both frames so that every pair appears
# in both directions. The original row labels are kept, so they repeat.
friends = pd.concat([friends1, friends2])
friends

Unnamed: 0,person1,person2
0,Ben,Smith
1,Ben,Patrick
2,Warren,Jone
3,Warren,Smith
4,Smith,Patrick
0,Smith,Ben
1,Patrick,Ben
2,Jone,Warren
3,Smith,Warren
4,Patrick,Smith


In [24]:
# Create a cross tabulation: each cell counts how often a
# (person1, person2) combination occurs in friends
pd.crosstab(friends.person1, friends.person2)

person2,Ben,Jone,Patrick,Smith,Warren
person1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ben,0,0,1,1,0
Jone,0,0,0,0,1
Patrick,1,0,0,1,0
Smith,1,0,1,0,1
Warren,0,1,0,1,0


# 8: DataFrame.query()

In [25]:
# Fruit prices used to compare bracket filtering with query()
df = pd.DataFrame(
    dict(
        fruit=["apple", "orange", "grape", "grape"],
        price=[4, 5, 6, 7],
    )
)
df

Unnamed: 0,fruit,price
0,apple,4
1,orange,5
2,grape,6
3,grape,7


In [26]:
# Filter using a boolean mask. Each condition needs its own parentheses
# because & binds more tightly than the comparison operators.
df[(df.price > 4) & (df.fruit == "grape")]

Unnamed: 0,fruit,price
2,grape,6
3,grape,7


In [27]:
# Same filter written as a query string, where columns are referenced by name
df.query("price > 4 & fruit == 'grape'")

Unnamed: 0,fruit,price
2,grape,6
3,grape,7


# 9: Unpivot DataFrame

In [28]:
# Wide layout: one row per fruit, one price column per store
df = pd.DataFrame(
    dict(
        fruit=["apple", "orange"],
        Aldi=[4, 5],
        Walmart=[6, 7],
        Costco=[1, 2],
    )
)
df

Unnamed: 0,fruit,Aldi,Walmart,Costco
0,apple,4,6,1
1,orange,5,7,2


In [29]:
# Turn the Aldi, Walmart and Costco columns into values of a new "store"
# column. No value_name is passed, so the former cell values end up in a
# column named "value".
df.melt(id_vars=["fruit"],
        value_vars=["Aldi", "Walmart", "Costco"],
        var_name='store')

Unnamed: 0,fruit,store,value
0,apple,Aldi,4
1,orange,Aldi,5
2,apple,Walmart,6
3,orange,Walmart,7
4,apple,Costco,1
5,orange,Costco,2


# 10: Rename aggregated column

In [30]:
# Prices per clothing size; the mixed int/float list becomes a float column
df = pd.DataFrame(dict(size=["S", "S", "M", "L"], price=[44, 29.99, 10, 19]))
df

Unnamed: 0,size,price
0,S,44.0
1,S,29.99
2,M,10.0
3,L,19.0


In [31]:
# Aggregating with a dict keeps the source column's name ("price") for the result
df.groupby('size').agg({'price': 'mean'})

Unnamed: 0_level_0,price
size,Unnamed: 1_level_1
L,19.0
M,10.0
S,36.995


In [32]:
# Assign a name to the aggregation: the keyword becomes the name of the
# output column, and its value says which column to aggregate and how
df.groupby('size').agg(
    mean_price=('price', 'mean')   # name = (column, agg_method)
)

Unnamed: 0_level_0,mean_price
size,Unnamed: 1_level_1
L,19.0
M,10.0
S,36.995


# 11: Normalized Value Counts

In [33]:
# Sizes observed in the sample, one entry per item
size = pd.Series("S S M L S XL S M".split())

# Absolute number of occurrences of each distinct value
size.value_counts()

S     4
M     2
L     1
XL    1
dtype: int64

In [34]:
# Get the relative frequency of each value: a proportion between 0 and 1,
# not a percentage
size.value_counts(normalize=True)

S     0.500
M     0.250
L     0.125
XL    0.125
dtype: float64

# 12: df.transform() instead of df.count()

In [35]:
# "type" repeats for A and O but occurs only once for B
df = pd.DataFrame(
    dict(
        type=list("AAOBOA"),
        value=[5, 3, 2, 1, 7, 3],
    )
)
df

Unnamed: 0,type,value
0,A,5
1,A,3
2,O,2
3,B,1
4,O,7
5,A,3


In [36]:
# Keep only the rows whose "type" occurs more than once.
# Using count here throws an error: it returns one entry per group, so the
# resulting Series is shorter than the original DataFrame and cannot serve
# as a row mask. transform("size") instead returns the group size once per
# original row, which gives a mask of the right length.

# df.loc[df.groupby("type")["type"].count() > 1]
df.loc[df.groupby("type")["type"].transform("size") > 1]

Unnamed: 0,type,value
0,A,5
1,A,3
2,O,2
4,O,7
5,A,3


In [37]:
# Equivalent filter without groupby: keep the rows whose "type" occurs more
# than once. duplicated(keep=False) flags every member of a repeated value,
# so the mask has one entry per row. (Filtering on df["value"] > 1 is NOT
# equivalent: it only produced the same rows by coincidence, because the
# single "B" row happens to hold the value 1.)
df.loc[df["type"].duplicated(keep=False)]

Unnamed: 0,type,value
0,A,5
1,A,3
2,O,2
4,O,7
5,A,3


# 13: Fill in Null Values

In [38]:
# Stock of the first store; None marks a missing value
store1 = pd.DataFrame(dict(orange=[None, 5, 9], apple=[4, None, 12]))
store1

Unnamed: 0,orange,apple
0,,4.0
1,5.0,
2,9.0,12.0


In [39]:
# Stock of the second store, complete, used to fill the gaps of store1
store2 = pd.DataFrame(dict(orange=[31, 52, 91], apple=[11, 71, 21]))
store2

Unnamed: 0,orange,apple
0,31,11
1,52,71
2,91,21


In [40]:
# Fill the null values of store1 with the values at the same locations in
# store2; the non-null values of store1 are kept as they are
store1.combine_first(store2)

Unnamed: 0,orange,apple
0,31.0,4.0
1,5.0,71.0
2,9.0,12.0


# 14: Value Counts Missing Values

In [41]:
# Same size data as before, now with two missing entries
size = pd.Series(
    ["S", "S", None, "M", "L", "S", None, "XL", "S", "M"]
)
size

0       S
1       S
2    None
3       M
4       L
5       S
6    None
7      XL
8       S
9       M
dtype: object

In [42]:
# Get the count of each value; missing values are left out by default
size.value_counts()

S     4
M     2
L     1
XL    1
dtype: int64

In [43]:
# Pass dropna=False to include the missing values in the counts
size.value_counts(dropna=False)

S       4
None    2
M       2
L       1
XL      1
dtype: int64

# 15: Filter Columns in DataFrame

In [44]:
# Categorical temperature label alongside its measured degree
df = pd.DataFrame(
    dict(
        Temp=['Hot', 'Cold', 'Warm', 'Cold'],
        Degree=[35, 3, 15, 2],
    )
)
df

Unnamed: 0,Temp,Degree
0,Hot,35
1,Cold,3
2,Warm,15
3,Cold,2


In [45]:
# One-hot encode 'Temp': the column is replaced by one indicator column per
# distinct value, each prefixed with 'Temp_'.
# NOTE(review): df is overwritten here, so re-running this cell on its own
# presumably fails because the 'Temp' column no longer exists -- re-run the
# previous cell first.
df = pd.get_dummies(df, columns=['Temp'])
df

Unnamed: 0,Degree,Temp_Cold,Temp_Hot,Temp_Warm
0,35,0,1,0
1,3,1,0,0
2,15,0,0,1
3,2,1,0,0


In [46]:
# Keep only the columns (axis=1) whose label contains the substring 'Temp'
df.filter(like='Temp', axis=1)

Unnamed: 0,Temp_Cold,Temp_Hot,Temp_Warm
0,0,1,0
1,1,0,0
2,0,0,1
3,1,0,0


# 16: Convert Data Types Automatically

In [47]:
# Columns with deliberately generic dtypes (int32, object, float64), some
# containing NaN, to show what convert_dtypes() infers later
df = pd.DataFrame(
    dict(
        a=pd.Series([1, 2, 3], dtype="int32"),
        b=pd.Series(list("xyz"), dtype=object),
        c=pd.Series([True, False, np.nan], dtype=object),
        d=pd.Series(["h", "i", np.nan], dtype=object),
        e=pd.Series([10, np.nan, 20], dtype="float64"),
        f=pd.Series([np.nan, 100.5, 200], dtype="float64"),
    )
)
df

Unnamed: 0,a,b,c,d,e,f
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


In [48]:
# Inspect the dtype of every column before conversion
df.dtypes

a      int32
b     object
c     object
d     object
e    float64
f    float64
dtype: object

In [49]:
# convert_dtypes() returns a new DataFrame whose columns use the
# best-fitting nullable dtypes; the result is stored in new_df
new_df = df.convert_dtypes()
new_df.dtypes

a      Int32
b     string
c    boolean
d     string
e      Int64
f    Float64
dtype: object

# 17: Assign new columns to a DataFrame

In [50]:
# One free-text sentence per weekday
time_sentences = [
    "Saturday: Weekend (Not working day)",
    "Sunday: Weekend (Not working day)",
    "Monday: Doctor appointment at 2:45pm.",
    "Tuesday: Dentist appointment at 11:30 am.",
    "Wednesday: basketball game At 7:00pm",
    "Thursday: Back home by 11:15 pm.",
    "Friday: Take the train at 08:10 am.",
]

# Single-column frame holding the sentences under the label 'text'
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Saturday: Weekend (Not working day)
1,Sunday: Weekend (Not working day)
2,Monday: Doctor appointment at 2:45pm.
3,Tuesday: Dentist appointment at 11:30 am.
4,Wednesday: basketball game At 7:00pm
5,Thursday: Back home by 11:15 pm.
6,Friday: Take the train at 08:10 am.


In [51]:
# assign() builds the new columns on a fresh DataFrame in one expression,
# as an alternative to the direct assignments below:
#   df['text'] = df.text.str.lower()
#   df['text_len'] = df.text.str.len()
#   df['word_count'] = df.text.str.count(" ") + 1
#   df['weekend'] = df.text.str.contains("saturday|sunday", case=False)
print(
    df.assign(
        text=df["text"].str.lower(),
        text_len=df["text"].str.len(),
        word_count=df["text"].str.count(" ") + 1,
        weekend=df["text"].str.contains("saturday|sunday", case=False),
    )
)

                                        text  text_len  word_count  weekend
0        saturday: weekend (not working day)        35           5     True
1          sunday: weekend (not working day)        33           5     True
2      monday: doctor appointment at 2:45pm.        37           5    False
3  tuesday: dentist appointment at 11:30 am.        41           6    False
4       wednesday: basketball game at 7:00pm        36           5    False
5           thursday: back home by 11:15 pm.        32           6    False
6        friday: take the train at 08:10 am.        35           7    False


# 18: Read HTML Tables

In [52]:
# Without a match argument we would get a list of all tables in the page.
# To select specific tables, pass text that the wanted table contains
# (e.g. its title) to the match parameter. The result is still a list.
table = pd.read_html(
    "https://en.wikipedia.org/wiki/Minnesota", 
    match="United States presidential election results for Minnesota"
)
table

[    Year Republican         Democratic         Third party        
     Year        No.       %        No.       %         No.       %
 0   2020    1484065  45.28%    1717077  52.40%       76029   2.32%
 1   2016    1323232  44.93%    1367825  46.44%      254176   8.63%
 2   2012    1320225  44.96%    1546167  52.65%       70169   2.39%
 3   2008    1275409  43.82%    1573354  54.06%       61606   2.12%
 4   2004    1346695  47.61%    1445014  51.09%       36678   1.30%
 5   2000    1109659  45.50%    1168266  47.91%      160760   6.59%
 6   1996     766476  34.96%    1120438  51.10%      305726  13.94%
 7   1992     747841  31.85%    1020997  43.48%      579110  24.66%
 8   1988     962337  45.90%    1109471  52.91%       24982   1.19%
 9   1984    1032603  49.54%    1036364  49.72%       15482   0.74%
 10  1980     873241  42.56%     954174  46.50%      224538  10.94%
 11  1976     819395  42.02%    1070440  54.90%       60096   3.08%
 12  1972     898269  51.58%     802346  46.07% 

In [53]:
# table is a list, so take its first element and show the first rows
print(table[0].head())

   Year Republican         Democratic         Third party       
   Year        No.       %        No.       %         No.      %
0  2020    1484065  45.28%    1717077  52.40%       76029  2.32%
1  2016    1323232  44.93%    1367825  46.44%      254176  8.63%
2  2012    1320225  44.96%    1546167  52.65%       70169  2.39%
3  2008    1275409  43.82%    1573354  54.06%       61606  2.12%
4  2004    1346695  47.61%    1445014  51.09%       36678  1.30%


In [54]:
# Same page, this time selecting the table(s) containing the text
# "Average daily"
table = pd.read_html(
    "https://en.wikipedia.org/wiki/Minnesota", 
    match="Average daily"
)
table

[              Location July (°F) July (°C) January (°F) January (°C)
 0          Minneapolis     83/64     28/18         23/7       −4/−13
 1           Saint Paul     83/63     28/17         23/6       −5/−14
 2            Rochester     82/63     28/17         23/3       −5/−16
 3               Duluth     76/55     24/13         19/1       −7/−17
 4            St. Cloud     81/58     27/14        18/−1       −7/−18
 5              Mankato     86/62     30/16         23/3       −5/−16
 6  International Falls     77/52     25/11        15/−6       −9/−21]

In [55]:
# Show the first rows of the first matching table
print(table[0].head())

      Location July (°F) July (°C) January (°F) January (°C)
0  Minneapolis     83/64     28/18         23/7       −4/−13
1   Saint Paul     83/63     28/17         23/6       −5/−14
2    Rochester     82/63     28/17         23/3       −5/−16
3       Duluth     76/55     24/13         19/1       −7/−17
4    St. Cloud     81/58     27/14        18/−1       −7/−18


# 19: ‘nlargest’ and ‘nsmallest’

In [56]:
# Load only the four columns needed for this example; the path is relative
# to the working directory of the notebook
df = pd.read_csv('data/imdbratings.csv',
                 usecols=['star_rating', 'title', 'genre', 'duration'])
df.head()

Unnamed: 0,star_rating,title,genre,duration
0,9.3,The Shawshank Redemption,Crime,142
1,9.2,The Godfather,Crime,175
2,9.1,The Godfather: Part II,Crime,200
3,9.0,The Dark Knight,Action,152
4,8.9,Pulp Fiction,Crime,154


In [57]:
# The 5 rows with the largest duration, longest first
df.nlargest(5, "duration")

Unnamed: 0,star_rating,title,genre,duration
476,7.8,Hamlet,Drama,242
157,8.2,Gone with the Wind,Drama,238
78,8.4,Once Upon a Time in America,Crime,229
142,8.3,Lagaan: Once Upon a Time in India,Adventure,224
445,7.9,The Ten Commandments,Adventure,220


In [58]:
# The 5 rows with the smallest duration, shortest first
df.nsmallest(5, "duration")

Unnamed: 0,star_rating,title,genre,duration
389,8.0,Freaks,Drama,64
338,8.0,Battleship Potemkin,History,66
258,8.1,The Cabinet of Dr. Caligari,Crime,67
88,8.4,The Kid,Comedy,68
293,8.1,Duck Soup,Comedy,68


# 20: Create a Rank Column

In [59]:
# One mark per student, to be ranked in the next cell
df = pd.DataFrame(
    dict(
        Students=['John', 'Smith', 'Patrick', 'Bob', 'Jose'],
        Marks=[80, 56, 95, 75, 45],
    )
)
df

Unnamed: 0,Students,Marks
0,John,80
1,Smith,56
2,Patrick,95
3,Bob,75
4,Jose,45


In [60]:
# Rank the marks from highest to lowest: with ascending=False the best mark
# gets rank 1. The ranks are returned as floats.
df["Rank"] = df["Marks"].rank(ascending=False)
df

Unnamed: 0,Students,Marks,Rank
0,John,80,2.0
1,Smith,56,4.0
2,Patrick,95,1.0
3,Bob,75,3.0
4,Jose,45,5.0


# 21: Color Values in DataFrame

In [61]:
# Marks per subject, indexed by student name. set_index() returns a new
# frame, so it is chained onto the constructor instead of mutating df with
# inplace=True.
df = pd.DataFrame({'Students': ['John', 'Smith', 'Patrick', 'Bob', 'Jose'],
                   'Physics': [80, 56, 95, 75, 45],
                   'Mathematics': [90, 85, 55, 65, 75]}).set_index('Students')
df

Unnamed: 0_level_0,Physics,Mathematics
Students,Unnamed: 1_level_1,Unnamed: 2_level_1
John,80,90
Smith,56,85
Patrick,95,55
Bob,75,65
Jose,45,75


In [62]:
def pass_condition(val):
    """Return the CSS background rule for one cell value.

    Values above 70 get a blue background; all other values get red.
    """
    if val > 70:
        return "background-color: blue"
    return "background-color: red"

# Apply pass_condition to every cell and use the returned CSS as that
# cell's style.
# NOTE(review): newer pandas releases (2.1+) deprecate Styler.applymap in
# favour of Styler.map -- confirm against the pandas version in use.
df.style.applymap(pass_condition)

Unnamed: 0_level_0,Physics,Mathematics
Students,Unnamed: 1_level_1,Unnamed: 2_level_1
John,80,90
Smith,56,85
Patrick,95,55
Bob,75,65
Jose,45,75
