Source: https://pandas.pydata.org/docs/getting_started/intro_tutorials/ (tutorials 01–03; e.g. https://pandas.pydata.org/docs/getting_started/intro_tutorials/02_read_write.html)

In [1]:
import pandas as pd

## What kind of data does pandas handle? 

In [4]:
# A small hand-made passenger table: each keyword becomes a column,
# each list entry becomes one row.
df = pd.DataFrame(
    dict(
        Name=[
            "Braund, Mr. Owen Harris",
            "Allen, Mr. William Henry",
            "Bonnell, Miss. Elizabeth",
        ],
        Age=[20, 25, 30],
        Sex=["male", "male", "female"],
    )
)

In [3]:
df

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",20,male
1,"Allen, Mr. William Henry",25,male
2,"Bonnell, Miss. Elizabeth",30,female


In [15]:
df['Age']

0    20
1    25
2    30
Name: Age, dtype: int64

In [25]:
# Create a Series from scratch; `name` labels it the way a DataFrame column would be.
ages = pd.Series(data=[20, 25, 30], name="Age")
ages


0    20
1    25
2    30
Name: Age, dtype: int64

In [26]:
# max age
df['Age'].max()

30

In [27]:
ages.max()

30

In [32]:
# basic stats of numerical data 
df.describe()

Unnamed: 0,Age
count,3.0
mean,25.0
std,5.0
min,20.0
25%,22.5
50%,25.0
75%,27.5
max,30.0


In [29]:
ages.describe()

count     3.0
mean     25.0
std       5.0
min      20.0
25%      22.5
50%      25.0
75%      27.5
max      30.0
Name: Age, dtype: float64

## How do I read and write tabular data?

In [33]:
titanic = pd.read_csv("titanic.csv")

In [36]:
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
# Data type (dtype) of each column
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [42]:
# Write the table to the "passengers" sheet of titanic.xlsx. index=False leaves
# the row index labels out of the spreadsheet, so reading the sheet back does
# not produce an extra index column.
titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False)

In [43]:
titanic2 = pd.read_excel("titanic.xlsx", sheet_name="passengers")

In [44]:
titanic2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
# technical summary of dataframe
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## How do I select a subset of a DataFrame?

In [46]:
ages = titanic['Age']

In [47]:
ages.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [49]:
ages.shape

(891,)

In [50]:
titanic.shape


(891, 12)

In [54]:
age_sex = titanic[['Age', 'Sex']]

In [52]:
age_sex.head()

Unnamed: 0,Age,Sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male


In [56]:
# Keep only the rows whose Age is strictly greater than 35
# (rows with a missing Age compare as False and are dropped).
above_35 = titanic.loc[titanic["Age"].gt(35)]

In [57]:
above_35.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S


In [60]:
above_35.shape

(217, 12)

In [61]:
titanic['Pclass'].unique()

array([3, 1, 2])

In [64]:
# passengers from cabin class 2 and 3
titanic[titanic['Pclass'] > 1].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [66]:
titanic[titanic['Pclass'].isin([2,3])].shape

(675, 12)

In [70]:
titanic[(titanic['Pclass'] == 2) | (titanic['Pclass'] == 3)].shape

(675, 12)

In [74]:
# Passenger data for which age is known
titanic[titanic['Age'].notna()].shape

(714, 12)

In [76]:
titanic[titanic['Age'].isna()].shape

(177, 12)

In [79]:
# Names of passengers older than 35 years
names_above35 = titanic.loc[titanic["Age"] > 35, "Name"]

In [80]:
names_above35.head()

1     Cumings, Mrs. John Bradley (Florence Briggs Th...
6                               McCarthy, Mr. Timothy J
11                             Bonnell, Miss. Elizabeth
13                          Andersson, Mr. Anders Johan
15                     Hewlett, Mrs. (Mary D Kingcome) 
Name: Name, dtype: object

In [81]:
# Rows 10 to 25 and columns 3 to 5, counting from 1. iloc positions are 0-based
# and the slice end is exclusive, hence 9:25 and 2:5.
titanic.iloc[9:25, 2:5]

Unnamed: 0,Pclass,Name,Sex
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",female
10,3,"Sandstrom, Miss. Marguerite Rut",female
11,1,"Bonnell, Miss. Elizabeth",female
12,3,"Saundercock, Mr. William Henry",male
13,3,"Andersson, Mr. Anders Johan",male
14,3,"Vestrom, Miss. Hulda Amanda Adolfina",female
15,2,"Hewlett, Mrs. (Mary D Kingcome)",female
16,3,"Rice, Master. Eugene",male
17,2,"Williams, Mr. Charles Eugene",male
18,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female


In [82]:
# Set the first 3 elements of the 4th column (Name, at 0-based position 3) to
# "anonymous". This assignment modifies `titanic` in place.
titanic.iloc[0:3, 3] = "anonymous"

In [83]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,anonymous,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,anonymous,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,anonymous,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
