![sslogo](https://github.com/stratascratch/stratascratch.github.io/raw/master/assets/sslogo.jpg)

# Quick Rendering Hack

#### For results requiring multiple tables, use the display class to format your results

In [None]:
class display:
    """Render several objects one after another in a notebook output cell.

    Each positional argument is a *string* containing a Python expression,
    typically just a variable name such as ``'df'``.  When the instance is
    shown, every expression is evaluated and rendered next to its own source
    text, which serves as a label.

    NOTE: the strings are passed to ``eval``.  Only hand-written, trusted
    expressions belong here -- never text that comes from outside the
    notebook.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        """Remember the expression strings to display."""
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Return the value of *expr* evaluated in this module's globals.

        Passing ``globals()`` explicitly keeps the locals of the calling
        method/generator out of the lookup, so a user variable that happens
        to share a name with a loop variable (e.g. ``a``) is resolved
        correctly instead of being shadowed.
        """
        return eval(expr, globals())

    def _repr_html_(self):
        """Return the HTML shown by rich front ends.

        One ``template`` block is produced per expression, joined by
        newlines.  Each evaluated object must provide ``_repr_html_``.
        """
        from html import escape

        # Escape the label so expressions containing '<', '>' or '&'
        # (e.g. "df[df.x < 3]") do not corrupt the surrounding markup.
        return '\n'.join(
            self.template.format(escape(expr, quote=False),
                                 self._evaluate(expr)._repr_html_())
            for expr in self.args)

    def __repr__(self):
        """Return the plain-text fallback: label, newline, then ``repr``."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)
    

In [1]:
import numpy as np
import pandas as pd

# Operating on Null Values

In [4]:
data = pd.Series([3.1414, np.nan, 'hello world', None])
data

0         3.1414
1            NaN
2    hello world
3           None
dtype: object

#### Create a boolean mask for rows containing a non-null value

In [3]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

#### Create a table containing all non-null values from data using a masking operation

In [5]:
data[data.notnull()]

0         3.1414
2    hello world
dtype: object

### Dropping null values

In [6]:
df = pd.DataFrame([[1,      np.nan, 2     ],
                   [2,      3,      5     ],
                   [np.nan, 4,      6     ],
                   [np.nan, np.nan, np.nan],
                   [6     , np.nan, np.nan]])
df

Unnamed: 0,0,1,2
0,1.0,,2.0
1,2.0,3.0,5.0
2,,4.0,6.0
3,,,
4,6.0,,


#### Create a table with all non-null values from data using the dropna method

In [8]:
data.dropna()

0         3.1414
2    hello world
dtype: object

#### Create a table with all rows from df that contain no null values

In [9]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5.0


#### Create a table with all columns from df that have at least three non-null values

In [11]:
df.dropna(axis='columns', thresh=3)

Unnamed: 0,0,2
0,1.0,2.0
1,2.0,5.0
2,,6.0
3,,
4,6.0,


### Filling null values

In [12]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

#### Create a table with all null values of data filled with 1

In [13]:
data.fillna(1)

a    1.0
b    1.0
c    2.0
d    1.0
e    3.0
dtype: float64

#### Create a table with all null values of data filled using the forward-fill method

In [14]:
# Forward-fill: each null takes the last valid value before it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

#### Create a table with all null values of data filled using the back-fill method

In [15]:
# Back-fill: each null takes the next valid value after it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

For ``DataFrame``s, the options are similar, but we can also specify an ``axis`` along which the fills take place:

In [16]:
df

Unnamed: 0,0,1,2
0,1.0,,2.0
1,2.0,3.0,5.0
2,,4.0,6.0
3,,,
4,6.0,,


#### Create a table with all null values of df filled using the forward-fill method down each column

In [18]:
# Forward-fill down each column (axis='rows'): a null takes the value from
# the row above it in the same column; leading nulls stay null.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis='rows')

Unnamed: 0,0,1,2
0,1.0,,2.0
1,2.0,3.0,5.0
2,2.0,4.0,6.0
3,2.0,4.0,6.0
4,6.0,4.0,6.0


# Connect to the Strata Scratch Datasets To Explore Our Datasets

This notebook will guide you through the process of connecting to the Strata Scratch database to query datasets. Simply download this notebook and follow the steps.

## Connect to Strata Scratch

Remember to input your username and password

## Pull Data From Strata Scratch

In [19]:
db_data = pd.read_csv('../datasets/nfl_combine.csv')

In [20]:
db_data.head()

Unnamed: 0,year,name,firstname,lastname,position,heightfeet,heightinches,heightinchestotal,weight,arms,...,vertical,broad,bench,round,college,pick,pickround,picktotal,wonderlic,nflgrade
0,2015,Ameer Abdullah,Ameer,Abdullah,RB,5,9.0,69.0,205,0.0,...,42.5,130.0,24.0,0.0,Nebraska,,0,0,0.0,5.9
1,2015,Nelson Agholor,Nelson,Agholor,WR,6,0.0,72.0,198,0.0,...,0.0,0.0,12.0,0.0,USC,,0,0,0.0,5.6
2,2015,Jay Ajayi,Jay,Ajayi,RB,6,0.0,72.0,221,0.0,...,39.0,121.0,19.0,0.0,Boise St.,,0,0,0.0,6.0
3,2015,Kwon Alexander,Kwon,Alexander,OLB,6,1.0,73.0,227,0.0,...,36.0,121.0,24.0,0.0,LSU,,0,0,0.0,5.4
4,2015,Mario Alford,Mario,Alford,WR,5,8.0,68.0,180,0.0,...,34.0,121.0,13.0,0.0,West Virginia,,0,0,0.0,5.3


# Basic Pandas Functionality

#### Print the information about db_data

In [21]:
db_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               1000 non-null   int64  
 1   name               1000 non-null   object 
 2   firstname          1000 non-null   object 
 3   lastname           1000 non-null   object 
 4   position           1000 non-null   object 
 5   heightfeet         1000 non-null   int64  
 6   heightinches       1000 non-null   float64
 7   heightinchestotal  1000 non-null   float64
 8   weight             1000 non-null   int64  
 9   arms               1000 non-null   float64
 10  hands              1000 non-null   float64
 11  fortyyd            1000 non-null   float64
 12  twentyyd           1000 non-null   float64
 13  tenyd              1000 non-null   float64
 14  twentyss           1000 non-null   float64
 15  threecone          1000 non-null   float64
 16  vertical           1000 n

#### Print the first 8 rows of db_data

In [22]:
db_data.head(8)

Unnamed: 0,year,name,firstname,lastname,position,heightfeet,heightinches,heightinchestotal,weight,arms,...,vertical,broad,bench,round,college,pick,pickround,picktotal,wonderlic,nflgrade
0,2015,Ameer Abdullah,Ameer,Abdullah,RB,5,9.0,69.0,205,0.0,...,42.5,130.0,24.0,0.0,Nebraska,,0,0,0.0,5.9
1,2015,Nelson Agholor,Nelson,Agholor,WR,6,0.0,72.0,198,0.0,...,0.0,0.0,12.0,0.0,USC,,0,0,0.0,5.6
2,2015,Jay Ajayi,Jay,Ajayi,RB,6,0.0,72.0,221,0.0,...,39.0,121.0,19.0,0.0,Boise St.,,0,0,0.0,6.0
3,2015,Kwon Alexander,Kwon,Alexander,OLB,6,1.0,73.0,227,0.0,...,36.0,121.0,24.0,0.0,LSU,,0,0,0.0,5.4
4,2015,Mario Alford,Mario,Alford,WR,5,8.0,68.0,180,0.0,...,34.0,121.0,13.0,0.0,West Virginia,,0,0,0.0,5.3
5,2015,Javorius Allen,Javorius,Allen,RB,6,0.0,72.0,221,0.0,...,35.5,121.0,11.0,0.0,USC,,0,0,0.0,5.3
6,2015,Adrian Amos,Adrian,Amos,FS,6,0.0,72.0,218,0.0,...,35.5,122.0,0.0,0.0,Penn St.,,0,0,0.0,5.5
7,2015,Dres Anderson,Dres,Anderson,WR,6,1.0,73.0,187,0.0,...,0.0,0.0,13.0,0.0,Utah,,0,0,0.0,5.5


#### Print the last 10 rows of db_data

In [23]:
db_data.tail(10)

Unnamed: 0,year,name,firstname,lastname,position,heightfeet,heightinches,heightinchestotal,weight,arms,...,vertical,broad,bench,round,college,pick,pickround,picktotal,wonderlic,nflgrade
990,2012,Greg Childs,Greg,Childs,WR,6,3.0,75.0,219,34.125,...,36.0,125.0,19.0,4.0,Arkansas,39(134),39,39,0.0,0.0
991,2012,Morris Claiborne,Morris,Claiborne,CB,5,11.0,71.0,188,33.25,...,34.0,118.0,0.0,1.0,LSU,6(6),6,6,4.0,0.0
992,2012,Danny Coale,Danny,Coale,WR,6,0.0,72.0,201,30.625,...,35.0,115.0,12.0,5.0,Virginia Tech,17(152),17,17,0.0,0.0
993,2012,Audie Cole,Audie,Cole,ILB,6,4.0,76.0,246,0.0,...,35.0,78.0,15.0,7.0,North Carolina State,3(210),3,3,0.0,0.0
994,2012,B.J. Coleman,B.J.,Coleman,QB,6,3.0,75.0,233,31.625,...,0.0,109.0,0.0,7.0,Chattanooga,36(243),36,36,0.0,0.0
995,2012,Tom Compton,Tom,Compton,OT,6,5.0,77.0,314,34.0,...,30.0,108.0,20.0,6.0,South Dakota,23(193),23,23,0.0,0.0
996,2012,Josh Cooper,Josh,Cooper,WR,5,11.0,71.0,190,31.0,...,31.5,115.0,11.0,0.0,,,0,0,0.0,0.0
997,2012,Quinton Coples,Quinton,Coples,DE,6,6.0,78.0,284,33.25,...,31.5,109.0,25.0,1.0,North Carolina,16(16),16,16,0.0,0.0
998,2012,Paul Cornick,Paul,Cornick,OT,6,5.0,77.0,310,35.0,...,0.0,0.0,25.0,0.0,,,0,0,0.0,0.0
999,2012,Aaron Corp,Aaron,Corp,QB,6,4.0,76.0,215,32.0,...,31.5,106.0,0.0,0.0,,,0,0,0.0,0.0


#### Print the summary statistics of db_data

In [25]:
db_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,1000.0,2013.923,0.909889,2012.0,2013.0,2014.0,2015.0,2015.0
heightfeet,1000.0,5.777,0.416467,5.0,6.0,6.0,6.0,6.0
heightinches,1000.0,4.455,3.45039,0.0,2.0,4.0,6.4375,11.125
heightinchestotal,1000.0,73.779,2.765161,66.0,72.0,74.0,76.0,80.25
weight,1000.0,244.327,45.872305,156.0,207.0,234.5,283.0,369.0
arms,1000.0,10.681375,15.32073,0.0,0.0,0.0,31.5,36.75
hands,1000.0,3.1435,4.5142,0.0,0.0,0.0,9.125,11.0
fortyyd,1000.0,4.03942,1.752945,0.0,4.46,4.63,4.91,5.74
twentyyd,1000.0,0.02894,0.27466,0.0,0.0,0.0,0.0,2.75
tenyd,1000.0,0.41385,0.711509,0.0,0.0,0.0,1.46,1.92


### Filtering Dataframes

Remember: the syntax for filtering DataFrames is similar to that used for boolean masking

In [26]:
data = pd.read_csv('../datasets/titanic.csv')

In [27]:
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Filter the data for females

Remember: the head method can be used to make results more concise

In [28]:
data[data['sex'] == 'female']

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


#### Filter for names and age of passengers with fares greater than 50

In [30]:
data.loc[data['fare'] > 50, ['name', 'age']]

Unnamed: 0,name,age
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
6,"McCarthy, Mr. Timothy J",54.0
27,"Fortune, Mr. Charles Alexander",19.0
31,"Spencer, Mrs. William Augustus (Marie Eugenie)",
...,...,...
856,"Wick, Mrs. George Dennick (Mary Hitchcock)",45.0
863,"Sage, Miss. Dorothy Edith ""Dolly""",
867,"Roebling, Mr. Washington Augustus II",31.0
871,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",47.0


### Adding methods to filters


#### Print the number of second class passengers

In [32]:
data.pclass[data['pclass'] == 2].count()

184

#### Print the survival rate of males in the third passenger class

survival rate = mean of survived column

In [34]:
data.survived[(data['sex'] == 'male') & (data['pclass'] == 3)].mean()

0.13544668587896252

#### Print the survival rate for women and children

In [39]:
data[(data['sex'] == 'female') | (data['age'] < 18)]['survived'].mean()

0.6881720430107527

#### Print the survival rate of men. Then, print the survival rate of women. 

This type of group-based aggregation will become simpler in the next section

In [40]:
print(data[data['sex'] == 'male']['survived'].mean())
print(data[data['sex'] == 'female']['survived'].mean())

0.18890814558058924
0.7420382165605095
