## Jon intro to python, lesson 2 (9/3/17)
___

### importing pandas and numpy:

In [213]:
import pandas as pd
import numpy as np

### creating a dataframe:

Note: dataframes form the basic building block of pandas: an easy-to-use, intuitive, and powerful data analysis structure. Much like dataframes in R/RStudio, pandas datasets are "tidy data": each variable is saved in its own column, and each observation is saved in its own row. Tidy datasets preserve observations as you manipulate variables.

In [214]:
# build the frame column by column: every keyword below is a column name
# and its list holds that column's values from the first row to the last
df = pd.DataFrame(
    data=dict(a=[4, 5, 6], b=[7, 8, 9], c=[10, 11, 12]),
    index=[1, 2, 3],  # explicit row labels, one per observation
)
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [215]:
# build the frame row by row: each inner list is one observation, and the
# column names are supplied separately through `columns`
df = pd.DataFrame(
    data=[
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
    ],
    columns=['column1', 'b', '3'],
    index=[1, 2, 3],  # explicit row labels
)
df

Unnamed: 0,column1,b,3
1,4,5,6
2,7,8,9
3,10,11,12


### Indexing in pandas:

Here we want to learn how to subset rows or columns.

In [235]:
# selecting a specific column with bracket notation:
df['column1']

# alternatively, attribute access (only possible when the column name is a
# valid Python identifier, so it cannot reach the column named '3'):
df.column1

# operations with selected data, e.g. the column total
# (pandas' own .sum() rather than Python's builtin sum):
df.column1.sum()

# selecting multiple columns: pass a list of names; the result keeps the
# columns in the order requested:
df[['3', 'column1']]

# list all column names:
df.columns

# only the last expression of a cell is rendered, so finish by showing the
# whole frame:
df

Unnamed: 0,column1,b,3
1,4,5,6
2,7,8,9
3,10,11,12


In [241]:
# selecting a specific row by integer position: .iloc is zero-based, so
# position 1 is the second row (the one labelled 2 in this frame); use
# .loc instead to select by index label:
df.iloc[1]

# selecting multiple rows: a position slice, end-exclusive
# (rows at positions 1 and 2):
df.iloc[1:3]

# operations with selected data, e.g. the row total
# (pandas' own .sum() rather than Python's builtin sum):
df.iloc[1].sum()

24

### Let's create a second dataframe, and see how to append them together:

In [242]:
# a second 3x3 frame with the same row labels as df; every row carries one
# missing value (NaN) on the diagonal
df2 = pd.DataFrame(
    data=[
        [np.nan, 14, 15],
        [16, np.nan, 18],
        [19, 20, np.nan],
    ],
    columns=['d', 'e', 'f'],
    index=[1, 2, 3],
)

df2

Unnamed: 0,d,e,f
1,,14.0,15.0
2,16.0,,18.0
3,19.0,20.0,


In [243]:
# copy column 'f' of df2 into df under the name 'newcol'; the values are
# matched up on the row labels the two frames share

df['newcol'] = df2.loc[:, 'f']
df

Unnamed: 0,column1,b,3,newcol
1,4,5,6,15.0
2,7,8,9,18.0
3,10,11,12,


In [244]:
# derive a new column as the element-wise product of two existing columns,
# first with bracket access:

df['calc_col'] = df['column1'] * df['b']

# ...and the same assignment again with attribute access on the
# right-hand side (the result is identical):

df['calc_col'] = df.column1 * df.b
df

Unnamed: 0,column1,b,3,newcol,calc_col
1,4,5,6,15.0,20
2,7,8,9,18.0,56
3,10,11,12,,110


### concatenating dataframes:

In [245]:
# side-by-side join: df and df2 have the same row labels, so df2's columns
# are placed to the right of df's, row for row
df_df2_concat = pd.concat(objs=[df, df2], axis='columns')
df_df2_concat

Unnamed: 0,column1,b,3,newcol,calc_col,d,e,f
1,4,5,6,15.0,20,,14.0,15.0
2,7,8,9,18.0,56,16.0,,18.0
3,10,11,12,,110,19.0,20.0,


In [246]:
# let's stack df2's rows underneath df's: columns that exist in only one of
# the frames are filled with NaN for the other frame's rows, and
# ignore_index=True renumbers the rows from 0
# NOTE(review): the saved output below lists the columns alphabetically;
# recent pandas versions keep the original column order unless sort=True is
# passed -- confirm which ordering is wanted before re-running
df_df2_vert_concat = pd.concat(objs=[df, df2], axis='index', ignore_index=True)
df_df2_vert_concat

Unnamed: 0,3,b,calc_col,column1,d,e,f,newcol
0,6.0,5.0,20.0,4.0,,,,15.0
1,9.0,8.0,56.0,7.0,,,,18.0
2,12.0,11.0,110.0,10.0,,,,
3,,,,,,14.0,15.0,
4,,,,,16.0,,18.0,
5,,,,,19.0,20.0,,
