In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Build the path to roster.csv inside the current working directory and echo it.
csvPathFile = os.path.join(os.getcwd(), 'roster.csv')
print(csvPathFile)

/Users/student/helloworld/roster.csv


### Read CSV to DataFrame

In [3]:
# Load the CSV at csvPathFile into a DataFrame and print the type of the result.
roster = pd.read_csv(csvPathFile)
print(type(roster))

<class 'pandas.core.frame.DataFrame'>


#### Viewing the data

In [4]:
# Peek at the first rows of the roster.
roster.head()

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V


In [5]:
# Peek at the last rows of the roster.
roster.tail()

Unnamed: 0,name
17,Hsin-Yun
18,Renata
19,Max
20,Joshua
21,David


In [6]:
# Display the roster DataFrame itself (rich notebook rendering).
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


### Modifying the Data

In [7]:
# Add one new student by concatenating a single-row frame onto the roster.
# ignore_index=True renumbers the combined rows instead of keeping each
# frame's own index labels.
d = dict(name=['Wally'])
tmp_df = pd.DataFrame(d)
roster = pd.concat((roster, tmp_df), ignore_index=True)
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


#### Assign Grades


In [8]:
# Seed NumPy's global RNG so the random grades are reproducible.
np.random.seed(1)
# One integer grade per roster row, drawn from [0, 100) -- `high` is exclusive.
roster['grade'] = np.random.randint(low=0, high=100, size=roster.shape[0])
roster

Unnamed: 0,name,grade
0,Joe,37
1,Jihuan,12
2,Ali,72
3,Frances,9
4,Daniela V,75
5,Mostafa,5
6,Daniela P,79
7,Cesar,64
8,Jarrod,16
9,Austin,1


`.loc` can be used with a boolean mask (a Series of `True`/`False` values, one per row); only the rows where the mask is `True` are selected.

In [9]:
# Element-wise equality test: yields a boolean Series, True where the name matches.
roster['name'].eq("Daniela P")

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
Name: name, dtype: bool

In [10]:
# Use the boolean name mask as the row selector and 'grade' as the column
# selector, so only the matching student's grade is overwritten.
roster.loc[roster['name'].eq('Daniela P'), 'grade'] = 100
roster

Unnamed: 0,name,grade
0,Joe,37
1,Jihuan,12
2,Ali,72
3,Frances,9
4,Daniela V,75
5,Mostafa,5
6,Daniela P,100
7,Cesar,64
8,Jarrod,16
9,Austin,1


### Check the class average

Each column of a pandas DataFrame is a Series object, which has dozens of built-in methods.

In [11]:
# Mean of the 'grade' column.
roster['grade'].mean()

37.95652173913044

In [12]:
# Curve low grades: add 40 points to every grade currently below 50.
# The mask is evaluated on the current values, so re-running this cell
# curves again any grade that is still below 50.
roster.loc[roster['grade'] < 50, 'grade'] += 40
roster

Unnamed: 0,name,grade
0,Joe,77
1,Jihuan,52
2,Ali,72
3,Frances,49
4,Daniela V,75
5,Mostafa,45
6,Daniela P,100
7,Cesar,64
8,Jarrod,56
9,Austin,41


In [13]:
# Mean of the 'grade' column at this point in the notebook.
roster['grade'].mean()

62.30434782608695

In [15]:
# Second curve: add 30 points to every grade currently below 60,
# then report the resulting class average.
roster.loc[roster['grade'] < 60, 'grade'] += 30
roster['grade'].mean()

76.65217391304348

### Write to CSV

In [16]:
# Build the output path for roster_pandas.csv in the current working directory and echo it.
outFilePath = os.path.join(os.getcwd(), 'roster_pandas.csv')
print(outFilePath)

/Users/student/helloworld/roster_pandas.csv


In [17]:
# Write the roster to outFilePath; index=False keeps the row index out of the file.
roster.to_csv(outFilePath, index=False)

### More Aggregation and Manipulation

In [22]:
# Preview a random 'red'/'blue' label for each roster row; the result is only displayed, not stored.
np.random.choice(['red', 'blue'], size=len(roster))

array(['blue', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red',
       'blue', 'blue', 'blue', 'blue', 'blue', 'red', 'red', 'red',
       'blue', 'red', 'blue', 'blue', 'blue', 'red'], dtype='<U4')

In [23]:
# Re-seed the global RNG so the group assignment below is reproducible.
np.random.seed(2)
# Draw one label per roster row from the two group names.
roster['group'] = np.random.choice(a=['red', 'blue'], size=roster.shape[0])
roster

Unnamed: 0,name,grade,group
0,Joe,77,red
1,Jihuan,82,blue
2,Ali,72,blue
3,Frances,79,red
4,Daniela V,75,red
5,Mostafa,75,blue
6,Daniela P,100,red
7,Cesar,64,blue
8,Jarrod,86,red
9,Austin,71,blue


In [27]:
# Average grade per group, indexed by group label.
# 'grade' is selected explicitly because the roster also carries the string
# column 'name': pandas >= 2.0 no longer silently drops non-numeric columns in
# GroupBy.mean() (numeric_only defaults to False) and raises TypeError instead.
# The double brackets keep the result a one-column DataFrame, as before.
group_means = roster.groupby(by=['group'])[['grade']].mean()
group_means

Unnamed: 0_level_0,grade
group,Unnamed: 1_level_1
blue,74.461538
red,79.5


In [29]:
# Rename the aggregated column so it does not collide with the per-student
# 'grade' column when the two frames are combined.
group_means = group_means.rename(columns={'grade': 'group_avg'})
group_means

Unnamed: 0_level_0,group_avg
group,Unnamed: 1_level_1
blue,74.461538
red,79.5


### Merging DataFrames

In [30]:
# Show (rows, columns) for both frames, one per line, before merging them.
print(roster.shape, group_means.shape, sep='\n')

(23, 3)
(2, 1)


In [31]:
# Attach each student's group average by joining on 'group', which is a
# column in roster and the index level name in group_means.
roster = pd.merge(roster, group_means, on='group')
roster.shape

(23, 4)

In [32]:
# Display the merged roster, which now carries the 'group_avg' column.
roster

Unnamed: 0,name,grade,group,group_avg
0,Joe,77,red,79.5
1,Frances,79,red,79.5
2,Daniela V,75,red,79.5
3,Daniela P,100,red,79.5
4,Jarrod,86,red,79.5
5,Jack,76,red,79.5
6,Renata,81,red,79.5
7,Max,68,red,79.5
8,Joshua,69,red,79.5
9,David,84,red,79.5


### Creating new columns from custom functions

In [39]:
def is_top50(col):
    """Flag the values of ``col`` that lie strictly above its median.

    Parameters
    ----------
    col : pandas Series (or DataFrame)
        Numeric data exposing ``.median()``.

    Returns
    -------
    Boolean object of the same shape as ``col``; values equal to the
    median are marked False.
    """
    cutoff = col.median()
    return col.gt(cutoff)

In [40]:
# Compare every grade against the median grade and store the boolean result.
roster['top50'] = is_top50(roster['grade'])

In [41]:
# Display the roster with the new 'top50' column.
roster

Unnamed: 0,name,grade,group,group_avg,top50
0,Joe,77,red,79.5,True
1,Frances,79,red,79.5,True
2,Daniela V,75,red,79.5,False
3,Daniela P,100,red,79.5,True
4,Jarrod,86,red,79.5,True
5,Jack,76,red,79.5,False
6,Renata,81,red,79.5,True
7,Max,68,red,79.5,False
8,Joshua,69,red,79.5,False
9,David,84,red,79.5,True
