# Pandas: Intro by DataCamp - 1

## Sorting and Subsetting

In [1]:
import pandas as pd
import numpy as np

In [2]:
all_medalists = pd.read_csv('data/all_medalists.csv')

In [3]:
all_medalists.head(5)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


```python
DataFrame.info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None) → None
```

Print a concise summary of a DataFrame. This method prints information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage.

In [4]:
all_medalists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29216 entries, 0 to 29215
Data columns (total 10 columns):
City            29216 non-null object
Edition         29216 non-null int64
Sport           29216 non-null object
Discipline      29216 non-null object
Athlete         29216 non-null object
NOC             29216 non-null object
Gender          29216 non-null object
Event           29216 non-null object
Event_gender    29216 non-null object
Medal           29216 non-null object
dtypes: int64(1), object(9)
memory usage: 2.2+ MB


```python
DataFrame.describe(self: ~FrameOrSeries, percentiles=None, include=None, exclude=None) → ~FrameOrSeries
```

Generate descriptive statistics.

Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values.

Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types. The output will vary depending on what is provided. Refer to the notes below for more detail.

In [5]:
pd.options.display.float_format = '{:.4f}'.format
all_medalists.describe(percentiles= [0,0.25,0.5,0.75,0.9,0.95,0.99,1])

Unnamed: 0,Edition
count,29216.0
mean,1967.7132
std,32.4063
min,1896.0
0%,1896.0
25%,1948.0
50%,1976.0
75%,1996.0
90%,2004.0
95%,2008.0


```python
DataFrame.sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False)
```

Sort by the values along either axis.

In [6]:
all_medalists.sort_values('Edition', ascending = False).head(5)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
29215,Beijing,2008,Wrestling,Wrestling Gre-R,"BAROEV, Khasan",RUS,Men,96 - 120kg,M,Silver
27859,Beijing,2008,Basketball,Basketball,"LISINA, Ekaterina",RUS,Women,basketball,W,Bronze
27846,Beijing,2008,Basketball,Basketball,"GASOL, Pau",ESP,Men,basketball,M,Silver
27847,Beijing,2008,Basketball,Basketball,"JIMENEZ, Carlos",ESP,Men,basketball,M,Silver
27848,Beijing,2008,Basketball,Basketball,"LOPEZ, Raul",ESP,Men,basketball,M,Silver


In [7]:
all_medalists[all_medalists.Edition != 2008]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver
...,...,...,...,...,...,...,...,...,...,...
27169,Athens,2004,Wrestling,Wrestling Gre-R,"GABER IBRAHIM, Karam",EGY,Men,84 - 96kg,M,Gold
27170,Athens,2004,Wrestling,Wrestling Gre-R,"NOZADZE, Ramaz",GEO,Men,84 - 96kg,M,Silver
27171,Athens,2004,Wrestling,Wrestling Gre-R,"GARDNER, Rulon",USA,Men,96 - 120kg,M,Bronze
27172,Athens,2004,Wrestling,Wrestling Gre-R,"BAROEV, Khasan",RUS,Men,96 - 120kg,M,Gold


In [8]:
all_medalists[(all_medalists.Edition <= 2008) & (all_medalists.NOC.isin(['NOC','AUT','GRE']))]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver
5,Athens,1896,Aquatics,Swimming,"CHOROPHAS, Efstathios",GRE,Men,1200m freestyle,M,Bronze
...,...,...,...,...,...,...,...,...,...,...
28749,Beijing,2008,Rowing,Rowing,"POLYMEROS, Vasileios",GRE,Men,lightweight double sculls (2x),M,Silver
28837,Beijing,2008,Sailing,Sailing,"BEKATOROU, Sofia",GRE,Women,Yngling - Keelboat,W,Bronze
28838,Beijing,2008,Sailing,Sailing,"KRAVARIOTI, Virginia",GRE,Women,Yngling - Keelboat,W,Bronze
28839,Beijing,2008,Sailing,Sailing,"PAPADOPOULOU, Sofia",GRE,Women,Yngling - Keelboat,W,Bronze


## Adding new columns

In [9]:
all_medalists['NA_col'] = np.nan
all_medalists.head(4)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,NA_col
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,


## Summary Statistics

In [10]:
sales = pd.read_csv('data/sales/sales.csv', parse_dates = ['month'])

In [11]:
sales.head()

Unnamed: 0,month,eggs,salt,spam
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52


In [12]:
# Print the head of the sales DataFrame
print(sales.head())

# Print the info about the sales DataFrame
print(sales.info())

# Print the mean of spam
print(sales.spam.mean())

# Print the median of spam
print(sales.spam.median())

  month  eggs    salt  spam
0   Jan    47 12.0000    17
1   Feb   110 50.0000    31
2   Mar   221 89.0000    72
3   Apr    77 87.0000    20
4   May   132     nan    52
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
month    6 non-null object
eggs     6 non-null int64
salt     5 non-null float64
spam     6 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 320.0+ bytes
None
41.166666666666664
41.5


In [13]:
# Print the maximum of the month column
print(sales.month.max())

# Print the minimum of the month column
print(sales.month.min())

May
Apr


In [14]:
# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
    
# Print IQR of the spam column
print(sales[['eggs','salt','spam']].agg([iqr, np.median]))

           eggs    salt    spam
iqr    101.5000 37.0000 31.5000
median 121.0000 60.0000 41.5000


## Cumulative statistics


In [15]:
# Sort sales_1_1 by month
sales_1_1 = sales.sort_values('month')

# Get the cumulative sum of spam, add as cum_spam col
sales_1_1['cum_spam'] = sales_1_1.spam.cumsum()

# Get the cumulative max of spam, add as cum_max_spam col
sales_1_1['cum_max_spam'] = sales_1_1.spam.cummax()

# See the columns you calculated
print(sales_1_1[["month", "cum_spam", "cum_max_spam", "spam"]])

  month  cum_spam  cum_max_spam  spam
3   Apr        20            20    20
1   Feb        51            31    31
0   Jan        68            31    17
5   Jun       123            55    55
2   Mar       195            72    72
4   May       247            72    52


## Dropping Duplicates

In [16]:
titanic = pd.read_csv('data/titanic.csv')

In [17]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [18]:
titanic_unique = titanic.drop_duplicates(subset = ['cabin'])

In [19]:
titanic_unique.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3.0,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10.0,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"


## Counting Categorical Variables

```python
Series.value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True)
```

Return a Series containing counts of unique values.

The resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default.

In [20]:
titanic_unique['sex'].value_counts()

male      131
female     56
Name: sex, dtype: int64

In [21]:
titanic_unique['sex'].value_counts(ascending=True)

female     56
male      131
Name: sex, dtype: int64

In [22]:
titanic_unique['sex'].value_counts(normalize=True)

male     0.7005
female   0.2995
Name: sex, dtype: float64

## Grouped Summary Statistics

In [23]:
titanic.groupby('sex')['fare'].agg([min, max, sum])

Unnamed: 0_level_0,min,max,sum
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,6.75,512.3292,21528.313
male,0.0,512.3292,22022.1739


## Pivot Tables

```python
pandas.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False) → 'DataFrame'
```

Create a spreadsheet-style pivot table as a DataFrame.

The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame.

In [24]:
titanic.pivot_table(index = 'sex', values = 'fare', aggfunc = 'mean')

Unnamed: 0_level_0,fare
sex,Unnamed: 1_level_1
female,46.1981
male,26.1546


In [25]:
titanic.pivot_table(index = 'sex', values = 'fare', aggfunc = ['mean','median'])

Unnamed: 0_level_0,mean,median
Unnamed: 0_level_1,fare,fare
sex,Unnamed: 1_level_2,Unnamed: 2_level_2
female,46.1981,23.0
male,26.1546,11.8875


In [26]:
titanic.pivot_table(index = 'sex', values = 'fare', columns =  'pclass', aggfunc = ['mean','median'], margins = True)

Unnamed: 0_level_0,mean,mean,mean,mean,median,median,median,median
pclass,1,2,3,All,1,2,3,All
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,109.4124,23.2348,15.3242,46.1981,80.9291,23.0,10.4896,23.0
male,69.8884,19.9049,12.4155,26.1546,49.5042,13.0,7.8958,11.8875
All,87.509,21.1792,13.3029,33.2955,60.0,15.0458,8.05,14.4542


In [27]:
titanic_sex_fare_boat = titanic.pivot_table(index = 'sex', values = 'fare', columns =  'boat', aggfunc = ['mean','median'], 
                    fill_value = 'None')

In [28]:
titanic_sex_fare_boat

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,median,median,median,median,median,median,median,median,median,median
boat,1,10,11,12,13,13 15,13 15 B,14,15,15 16,...,6,7,8,8 10,9,A,B,C,C D,D
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
female,48.2646,70.5076,36.3928,20.2963,13.3072,,,35.4318,13.9048,7.75,...,78.85,71.0,86.5,26.55,15.75,17.9,,8.05,20.525,22.3583
male,46.1236,22.0617,40.8786,14.525,21.2083,7.5104,7.75,24.113,10.5302,,...,18.8625,29.7,,,13.0,7.7958,8.05,15.4937,20.525,26.0


## Calculating on a pivot table

In [29]:
titanic_sex_fare_boat.mean(axis = 'index')

        boat
mean    1       47.1941
        10      46.2847
        11      38.6357
        12      17.4107
        13      17.2577
        14      29.7724
        15      12.2175
        16      14.4651
        2       66.1633
        3      147.1338
        4      111.3451
        5       60.1520
        5 7     52.0000
        6       54.7028
        7       53.5226
        9       23.7921
        A       21.7299
        C       22.2340
        C D     20.5250
        D       35.1849
median  1       51.8531
        10      24.1625
        11      18.1802
        12      17.7625
        13      10.1813
        14      22.1250
        15       9.5291
        16      13.1917
        2       61.1625
        3       95.7500
        4       67.0958
        5       53.9980
        5 7     52.0000
        6       48.8562
        7       50.3500
        9       14.3750
        A       12.8479
        C       11.7719
        C D     20.5250
        D       24.1791
dtype: float64

In [30]:
titanic_sex_fare_boat.mean(axis = 'columns')

sex
female   48.5107
male     30.0703
dtype: float64