# Import pandas so you can use the library's functions

In [1]:
import pandas as pd

# Read in a csv from a local file

In [2]:
# Parse the CSV into a DataFrame. The "./" path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("./mega_awesome_df.csv")
# A bare expression on the last line of a cell is rendered as the cell's output.
df

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
0,Alanine,"['GCT', 'GCC', 'GCA', 'GCG']",89.1,ALA,A,C3H7N1O2,89.09,6.0,aliphatic
1,Arginine,"['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']",174.2,ARG,R,C6H14N4O2,174.2,11.15,polar basic
2,Asparagine,"['AAT', 'AAC']",132.1,ASN,N,C4H8N2O3,132.12,5.41,polar neutral
3,Cysteine,"['TGT', 'TGC']",121.2,CYS,C,C3H7N1O2S1,121.16,5.02,polar neutral
4,Glutamine,"['CAA', 'CAG']",146.2,GLN,Q,C5H10N2O3,146.15,5.65,polar neutral
5,Glycine,"['GGT', 'GGC', 'GGA', 'GGG']",75.1,GLY,G,C2H5N1O2,75.07,5.97,unique
6,Histidine,"['CAT', 'CAC']",155.2,HIS,H,C6H9N3O2,155.16,7.47,polar basic
7,Isoleucine,"['ATT', 'ATC', 'ATA']",131.2,ILE,I,C6H13N1O2,131.17,5.94,aliphatic
8,Leucine,"['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']",131.2,LEU,L,C6H13N1O2,131.17,5.98,aliphatic
9,Lysine,"['AAA', 'AAG']",146.2,LYS,K,C6H14N2O2,146.19,9.59,polar basic


# Read in a csv from a URL

In [3]:
# read_csv also accepts a URL in place of a file path; running this cell needs network access.
df_from_url = pd.read_csv("https://raw.githubusercontent.com/jayunruh/python_introDS_course/main/2024_class3/mega_awesome_df.csv")

# View the columns and dimensions of a dataframe

In [4]:
# .shape is a (number_of_rows, number_of_columns) tuple: 18 rows, 9 columns here
df.shape

(18, 9)

In [5]:
# The column labels, returned as a pandas Index object.
df.columns

Index(['name', 'codons', 'weight', '3_letter_code', '1_letter_code', 'formula',
       'formula_weight', 'isoelectric_point', 'type'],
      dtype='object')

# View a specific column or group of columns

In [6]:
# View one column: a single label inside the brackets returns a Series
df["name"]

0           Alanine
1          Arginine
2        Asparagine
3          Cysteine
4         Glutamine
5           Glycine
6         Histidine
7        Isoleucine
8           Leucine
9            Lysine
10       Methionine
11    Phenylalanine
12          Proline
13           Serine
14        Threonine
15       Tryptophan
16         Tyrosine
17           Valine
Name: name, dtype: object

# Subset rows based on column values

In [7]:
# View multiple columns: a list of labels (note the double brackets) returns a DataFrame
df[["name", "weight"]]

Unnamed: 0,name,weight
0,Alanine,89.1
1,Arginine,174.2
2,Asparagine,132.1
3,Cysteine,121.2
4,Glutamine,146.2
5,Glycine,75.1
6,Histidine,155.2
7,Isoleucine,131.2
8,Leucine,131.2
9,Lysine,146.2


# View summary stats for columns (max, min, mean, and frequency)

In [8]:
# Largest value in the weight column.
df["weight"].max()

204.2

In [9]:
# Smallest value in the weight column.
df["weight"].min()

75.1

In [10]:
# Arithmetic mean of the weight column. The long run of trailing digits in the
# result is floating-point rounding, not extra precision.
df["weight"].mean()

136.54999999999995

In [11]:
# Count how many rows have each distinct value of `type`, most frequent first.
df["type"].value_counts()

type
aliphatic        5
polar neutral    5
polar basic      3
aromatic         3
unique           2
Name: count, dtype: int64

In [12]:
# value_counts on several columns counts each distinct combination of values.
# Every (type, 3_letter_code) pair occurs once here, so all counts are 1.
df[["type", "3_letter_code"]].value_counts()

type           3_letter_code
aliphatic      ALA              1
               ILE              1
               LEU              1
               MET              1
               VAL              1
aromatic       PHE              1
               TRP              1
               TYR              1
polar basic    ARG              1
               HIS              1
               LYS              1
polar neutral  ASN              1
               CYS              1
               GLN              1
               SER              1
               THR              1
unique         GLY              1
               PRO              1
Name: count, dtype: int64

In [13]:
# The comparison produces a boolean Series (one True/False per row); indexing df
# with that Series keeps only the rows where it is True.
df[df["name"] == "Alanine"]

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
0,Alanine,"['GCT', 'GCC', 'GCA', 'GCG']",89.1,ALA,A,C3H7N1O2,89.09,6.0,aliphatic


In [14]:
df[df["type"].str.contains("polar")]

# Note - `.str` does not convert anything: it is an accessor that exposes string methods
# which run on every value of the series (the column `type`). `.str.contains("polar")`
# asks of each value "does this string contain `polar`?" and returns a boolean series,
# which is then used to select the matching rows.
# By default the pattern is interpreted as a regular expression.

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
1,Arginine,"['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']",174.2,ARG,R,C6H14N4O2,174.2,11.15,polar basic
2,Asparagine,"['AAT', 'AAC']",132.1,ASN,N,C4H8N2O3,132.12,5.41,polar neutral
3,Cysteine,"['TGT', 'TGC']",121.2,CYS,C,C3H7N1O2S1,121.16,5.02,polar neutral
4,Glutamine,"['CAA', 'CAG']",146.2,GLN,Q,C5H10N2O3,146.15,5.65,polar neutral
6,Histidine,"['CAT', 'CAC']",155.2,HIS,H,C6H9N3O2,155.16,7.47,polar basic
9,Lysine,"['AAA', 'AAG']",146.2,LYS,K,C6H14N2O2,146.19,9.59,polar basic
13,Serine,"['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC']",105.1,SER,S,C3H7N1O3,105.09,5.68,polar neutral
14,Threonine,"['ACT', 'ACC', 'ACA', 'ACG']",119.1,THR,T,C4H9N1O3,119.12,5.64,polar neutral


In [15]:
# Exact equality on floats only matches values that are bit-for-bit equal to 155.2.
# It works for this value as loaded, but for numbers that come out of arithmetic
# a tolerance-based comparison is safer.
df[df["weight"] == 155.2]

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
6,Histidine,"['CAT', 'CAC']",155.2,HIS,H,C6H9N3O2,155.16,7.47,polar basic


In [16]:
# Any comparison operator can build the boolean mask: keep rows whose weight exceeds 100.
df[df["weight"] > 100]

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
1,Arginine,"['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']",174.2,ARG,R,C6H14N4O2,174.2,11.15,polar basic
2,Asparagine,"['AAT', 'AAC']",132.1,ASN,N,C4H8N2O3,132.12,5.41,polar neutral
3,Cysteine,"['TGT', 'TGC']",121.2,CYS,C,C3H7N1O2S1,121.16,5.02,polar neutral
4,Glutamine,"['CAA', 'CAG']",146.2,GLN,Q,C5H10N2O3,146.15,5.65,polar neutral
6,Histidine,"['CAT', 'CAC']",155.2,HIS,H,C6H9N3O2,155.16,7.47,polar basic
7,Isoleucine,"['ATT', 'ATC', 'ATA']",131.2,ILE,I,C6H13N1O2,131.17,5.94,aliphatic
8,Leucine,"['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']",131.2,LEU,L,C6H13N1O2,131.17,5.98,aliphatic
9,Lysine,"['AAA', 'AAG']",146.2,LYS,K,C6H14N2O2,146.19,9.59,polar basic
10,Methionine,['ATG'],149.2,MET,M,C5H11N1O2S1,149.21,5.74,aliphatic
11,Phenylalanine,"['TTT', 'TTC']",165.2,PHE,F,C9H11N1O2,165.19,5.48,aromatic


# Subset rows on multiple column values

In [17]:
# Build each condition as its own boolean mask, then combine them element-wise
# with `&` so that only rows satisfying both conditions are kept.
is_polar_basic = df["type"] == "polar basic"
has_high_isoelectric_point = df["isoelectric_point"] > 11
df[is_polar_basic & has_high_isoelectric_point]

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
1,Arginine,"['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']",174.2,ARG,R,C6H14N4O2,174.2,11.15,polar basic


# Manipulate columns

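- Note that `df.columns` is a pandas `Index`, which behaves much like a list: we can index into it, and we can rename every column at once by assigning a list of new names to `df.columns`. Unlike a list, an `Index` is immutable, so a single name cannot be changed in place (e.g. `df.columns[0] = "x"` raises an error).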

In [18]:
print(type(df.columns))
print(df.columns[0])
# Keep a reference to the original labels; a later cell assigns them back to df.columns.
old_column_names = df.columns
# Rename on a copy so this cell gives the same result every time it is run.
# Assigning to df.columns directly would make a second run store the already-renamed
# labels in old_column_names, and the original names would be lost.
df_renamed = df.copy()
# Assigning a list with one entry per column replaces all the column labels at once.
df_renamed.columns = ["NAME", "CODONS", "WEIGHT", "3_letter_code", "1_letter_code", "FORMULA", "FORMULA_WEIGHT", "isoelectric_whatever", "tyyyyype"]
df_renamed

<class 'pandas.core.indexes.base.Index'>
name


Unnamed: 0,NAME,CODONS,WEIGHT,3_letter_code,1_letter_code,FORMULA,FORMULA_WEIGHT,isoelectric_whatever,tyyyyype
0,Alanine,"['GCT', 'GCC', 'GCA', 'GCG']",89.1,ALA,A,C3H7N1O2,89.09,6.0,aliphatic
1,Arginine,"['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']",174.2,ARG,R,C6H14N4O2,174.2,11.15,polar basic
2,Asparagine,"['AAT', 'AAC']",132.1,ASN,N,C4H8N2O3,132.12,5.41,polar neutral
3,Cysteine,"['TGT', 'TGC']",121.2,CYS,C,C3H7N1O2S1,121.16,5.02,polar neutral
4,Glutamine,"['CAA', 'CAG']",146.2,GLN,Q,C5H10N2O3,146.15,5.65,polar neutral
5,Glycine,"['GGT', 'GGC', 'GGA', 'GGG']",75.1,GLY,G,C2H5N1O2,75.07,5.97,unique
6,Histidine,"['CAT', 'CAC']",155.2,HIS,H,C6H9N3O2,155.16,7.47,polar basic
7,Isoleucine,"['ATT', 'ATC', 'ATA']",131.2,ILE,I,C6H13N1O2,131.17,5.94,aliphatic
8,Leucine,"['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']",131.2,LEU,L,C6H13N1O2,131.17,5.98,aliphatic
9,Lysine,"['AAA', 'AAG']",146.2,LYS,K,C6H14N2O2,146.19,9.59,polar basic


In [19]:
# Assign the Index saved earlier back to df.columns so that df carries its original column names.
df.columns = old_column_names
df

Unnamed: 0,name,codons,weight,3_letter_code,1_letter_code,formula,formula_weight,isoelectric_point,type
0,Alanine,"['GCT', 'GCC', 'GCA', 'GCG']",89.1,ALA,A,C3H7N1O2,89.09,6.0,aliphatic
1,Arginine,"['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']",174.2,ARG,R,C6H14N4O2,174.2,11.15,polar basic
2,Asparagine,"['AAT', 'AAC']",132.1,ASN,N,C4H8N2O3,132.12,5.41,polar neutral
3,Cysteine,"['TGT', 'TGC']",121.2,CYS,C,C3H7N1O2S1,121.16,5.02,polar neutral
4,Glutamine,"['CAA', 'CAG']",146.2,GLN,Q,C5H10N2O3,146.15,5.65,polar neutral
5,Glycine,"['GGT', 'GGC', 'GGA', 'GGG']",75.1,GLY,G,C2H5N1O2,75.07,5.97,unique
6,Histidine,"['CAT', 'CAC']",155.2,HIS,H,C6H9N3O2,155.16,7.47,polar basic
7,Isoleucine,"['ATT', 'ATC', 'ATA']",131.2,ILE,I,C6H13N1O2,131.17,5.94,aliphatic
8,Leucine,"['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']",131.2,LEU,L,C6H13N1O2,131.17,5.98,aliphatic
9,Lysine,"['AAA', 'AAG']",146.2,LYS,K,C6H14N2O2,146.19,9.59,polar basic
