In [1]:
########################################################
# Purpose: Pandas Tutorial
# Course: Data Engineering - MSc. Statistics AUEB
# Author: Marilena Kalatzi
# Date: May 2024
########################################################

In [2]:
# import pandas library
import pandas as pd

In [3]:
# create a list of names 
column = ["Mariya", "Batman", "Spongebob"]
print(column)

# convert list to a data frame
column_data = pd.DataFrame(column)
print(column_data)

['Mariya', 'Batman', 'Spongebob']
           0
0     Mariya
1     Batman
2  Spongebob


In [4]:
# create a dictionary to assign names to columns inside the data frame
titled_column = {"name": column}

# convert dictionary to data frame with titled columns
data = pd.DataFrame(titled_column)
print(data)

        name
0     Mariya
1     Batman
2  Spongebob


In [5]:
# add other columns in the data frame
titled_columns = {"name": column,
                 "height" : [1.67, 1.9, 0.25],
                 "weight" : [54,100,1]}
data = pd.DataFrame(titled_columns)
print(data)

        name  height  weight
0     Mariya    1.67      54
1     Batman    1.90     100
2  Spongebob    0.25       1


In [6]:
# select specific columns from a data frame
select_columns = data["weight"]
print(select_columns)

0     54
1    100
2      1
Name: weight, dtype: int64


In [7]:
# index specific columns
index_columns = data["weight"][1]
print(index_columns)

100


In [8]:
# select specific row; iloc represents rows, use brackets to index entire row
select_row = data.iloc[1]
print(select_row)

name      Batman
height       1.9
weight       100
Name: 1, dtype: object


In [9]:
# index rows
index_row = data.iloc[1]["weight"]
print(index_row)

100


In [10]:
# manipulate data frame values
# add new column with calculated bmi from data frame's values
data["bmi"] = data["weight"]/(data["height"]**2)
print(data)
    

        name  height  weight        bmi
0     Mariya    1.67      54  19.362473
1     Batman    1.90     100  27.700831
2  Spongebob    0.25       1  16.000000


In [12]:
# save data frame to a file
data.to_csv("bmi.csv", sep = "\t")
data.to_csv("bmi.txt", sep = "\t")

In [13]:
# load data frame from existing file
load_data = pd.read_csv("bmi.csv", sep = "\t")
print(load_data)

   Unnamed: 0       name  height  weight        bmi
0           0     Mariya    1.67      54  19.362473
1           1     Batman    1.90     100  27.700831
2           2  Spongebob    0.25       1  16.000000


In [14]:
# re-save the file but make sure first column that appeared before is not there
data.to_csv("bmi.csv",index = False, sep = "\t")

In [15]:
# re-load to see the change
load_data = pd.read_csv("bmi.csv",sep = "\t")

In [16]:
# print only first row of loaded data
load_data.head(1)
# print only the last 2 rows of loaded data
load_data.tail(2)

Unnamed: 0,name,height,weight,bmi
1,Batman,1.9,100,27.700831
2,Spongebob,0.25,1,16.0


In [17]:
# filter data frame
filtered_data = data[data["weight"] == 54]
print(filtered_data)

     name  height  weight        bmi
0  Mariya    1.67      54  19.362473


In [18]:
# replace a column in the data frame
replaced_weight = data.replace(54, 78)
print(replaced_weight)

        name  height  weight        bmi
0     Mariya    1.67      78  19.362473
1     Batman    1.90     100  27.700831
2  Spongebob    0.25       1  16.000000


In [19]:
# remove a column (axis = 1)
remove_column = data.drop("height",axis = 1)
print(remove_column)

        name  weight        bmi
0     Mariya      54  19.362473
1     Batman     100  27.700831
2  Spongebob       1  16.000000


In [20]:
# remove a row (axis = 0)
remove_row = data.drop(1,axis=0)
print(remove_row)

        name  height  weight        bmi
0     Mariya    1.67      54  19.362473
2  Spongebob    0.25       1  16.000000


In [21]:
# add new rows

# create new row dictionary
row = {"name": ["Marilena"],
       "height": 1.73,
       "weight": 74,
       "bmi": 74/(1.73**2)}

# convert dictionary to data frame
new_row = pd.DataFrame(row)

# add new row data frame to previous data
new_row_data = pd.concat([data, new_row], ignore_index=True)
print(new_row_data)

        name  height  weight        bmi
0     Mariya    1.67      54  19.362473
1     Batman    1.90     100  27.700831
2  Spongebob    0.25       1  16.000000
3   Marilena    1.73      74  24.725183


In [5]:
# pandas series
series = pd.Series([10,20,30,400],["A","B","C","D"])
print(series)

A     10
B     20
C     30
D    400
dtype: int64


In [6]:
# index series
print(series["C"])

30
