# Various Python Notes

In [2]:
import pandas as pd

## itertuples()

In [3]:
# One-column frame holding the integers 1..999,999 (range() excludes its stop
# value); large enough for the row-iteration timings below to be measurable.
df = pd.DataFrame({'Number': range(1, 1000000)})

In [8]:
%%time
# standard way
total = 0
for _, row in df.iterrows():
    total += row['Number']
 
total

CPU times: user 1min 46s, sys: 1.82 s, total: 1min 47s
Wall time: 2min 14s


499999500000

In [9]:
%%time
# using itertuples
total = 0
for row in df.itertuples(index=False):
    total += row.Number
 
total

CPU times: user 844 ms, sys: 26.5 ms, total: 871 ms
Wall time: 1.18 s


499999500000

## nlargest() and nsmallest()

In [10]:
# Six names with a score each; sample data for the nlargest()/nsmallest() examples.
df = pd.DataFrame({
    'Name':   ['Bob', 'Mark', 'Josh', 'Anna', 'Peter', 'Dexter'],
    'Points': [37, 91, 66, 42, 99, 81],
})

In [11]:
# Series.nlargest: the 3 highest values in descending order, keeping their original index labels.
df['Points'].nlargest(3)

4    99
1    91
5    81
Name: Points, dtype: int64

In [12]:
# DataFrame.nlargest returns whole rows, ordered by the given column (highest first).
df.nlargest(3, columns='Points')

Unnamed: 0,Name,Points
4,Peter,99
1,Mark,91
5,Dexter,81


In [13]:
# Mirror image of nlargest: the 3 rows with the lowest 'Points', lowest first.
df.nsmallest(3, columns='Points')

Unnamed: 0,Name,Points
0,Bob,37
3,Anna,42
2,Josh,66


## cut()  

In [14]:
# Same sample data as the nlargest()/nsmallest() section, re-created here so the
# cut() examples can be run on their own.
df = pd.DataFrame(dict(
    Name=['Bob', 'Mark', 'Josh', 'Anna', 'Peter', 'Dexter'],
    Points=[37, 91, 66, 42, 99, 81],
))

In [15]:
# An integer `bins` splits the observed range of 'Points' (37..99) into that many
# equal-width intervals; the lowest edge is pushed slightly below the minimum
# (36.938 here) so that the minimum value itself lands in the first bin.
pd.cut(df['Points'], bins=2)

0    (36.938, 68.0]
1      (68.0, 99.0]
2    (36.938, 68.0]
3    (36.938, 68.0]
4      (68.0, 99.0]
5      (68.0, 99.0]
Name: Points, dtype: category
Categories (2, interval[float64]): [(36.938, 68.0] < (68.0, 99.0]]

In [16]:
# A list of edges defines the bins explicitly; intervals are right-closed,
# i.e. (0, 50] and (50, 100].
pd.cut(df['Points'], bins=[0, 50, 100])

0      (0, 50]
1    (50, 100]
2    (50, 100]
3      (0, 50]
4    (50, 100]
5    (50, 100]
Name: Points, dtype: category
Categories (2, interval[int64]): [(0, 50] < (50, 100]]

In [17]:
# `labels` names the bins in order: (0, 50] -> 'Fail', (50, 100] -> 'Pass'.
pd.cut(df['Points'], bins=[0, 50, 100], labels=['Fail', 'Pass'])

0    Fail
1    Pass
2    Pass
3    Fail
4    Pass
5    Pass
Name: Points, dtype: category
Categories (2, object): [Fail < Pass]

## Selecting From Pandas

### By integer location  

In [33]:
# Ingredient table: one row per ingredient with its Calories, Protein, Carbs and
# Fat values. Rows get the default integer index 0..5.
df = pd.DataFrame({
    'Food': [
        '2 cups cooked rice',
        '1 garlic',
        '1 onion',
        '1 tsp oil',
        '4 oz chicken',
        '1 tsp soy sauce',
    ],
    'Calories': [411, 4, 32, 40, 155, 1],
    'Protein': [8, 0, 1, 0, 29, 0],
    'Carbs': [89, 0, 8, 0, 0, 0],
    'Fat': [1, 0, 0, 5, 2, 0],
})

In [34]:
# Only 6 rows, so the whole frame is displayed rather than a .head() preview.
df

Unnamed: 0,Food,Calories,Protein,Carbs,Fat
0,2 cups cooked rice,411,8,89,1
1,1 garlic,4,0,0,0
2,1 onion,32,1,8,0
3,1 tsp oil,40,0,0,5
4,4 oz chicken,155,29,0,2
5,1 tsp soy sauce,1,0,0,0


In [None]:
# Single selections using iloc on a DataFrame
# Keep in mind that iloc positions are 0-based (the first row/column is position 0)

In [21]:
# Rows (only the cell's last expression is displayed, so the alternatives are commented out):
#df.iloc[0] # first row of data frame
#df.iloc[1] # second row of data frame
df.iloc[-1] # last row of data frame (negative positions count from the end)

Food        1 tsp soy sauce
Calories                  1
Protein                   0
Carbs                     0
Fat                       0
Name: 5, dtype: object

In [22]:
# Columns (the leading ":" selects every row; uncomment an alternative to try it):
df.iloc[:,0] # first column of data frame
#df.iloc[:,1] # second column of data frame
#df.iloc[:,-1] # last column of data frame

0    2 cups cooked rice
1              1 garlic
2               1 onion
3             1 tsp oil
4          4 oz chicken
5       1 tsp soy sauce
Name: Food, dtype: object

In [23]:
# Multiple row and column selections using iloc on a DataFrame.
# Positional slices exclude the stop position: 0:5 is positions 0..4, 1:4 is positions 1..3.
#df.iloc[0:5] # first five rows of dataframe
#df.iloc[:, 0:2] # first two columns of data frame with all rows
df.iloc[[0,2,4], [0,3]] # 1st, 3rd and 5th rows + 1st and 4th columns
#df.iloc[0:5, 1:4] # first 5 rows and 2nd, 3rd and 4th columns

Unnamed: 0,Food,Carbs
0,2 cups cooked rice,89
2,1 onion,8
4,4 oz chicken,0


### By label  

In [35]:
# .loc slices by label and includes BOTH endpoints: rows labelled 0..2 and
# columns 'Food' through 'Protein'.
df.loc[0:2, 'Food':'Protein']

Unnamed: 0,Food,Calories,Protein
0,2 cups cooked rice,411,8
1,1 garlic,4,0
2,1 onion,32,1


In [36]:
# Lists pick arbitrary labels; the result follows the order given in the lists
# (note 'Fat' comes before 'Calories').
df.loc[[0, 4], ['Food', 'Fat', 'Calories']]

Unnamed: 0,Food,Fat,Calories
0,2 cups cooked rice,1,411
4,4 oz chicken,2,155


Optionally, you can choose a column to serve as the index of the DataFrame. The values in this column then become the row labels. Remember that, by default, rows are labelled with integers (0, 1, 2, ...).

In [25]:
# Promote the 'Food' column to the row index so rows can be selected by name.
# Reassigning (instead of inplace=True) behind a guard keeps the cell safe to
# re-run: a second set_index("Food") would raise KeyError because 'Food' is no
# longer a column once it has become the index.
if "Food" in df.columns:
    df = df.set_index("Food")

In [26]:
# 'Food' is now the row index rather than a regular column.
df

Unnamed: 0_level_0,Calories,Protein,Carbs,Fat
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2 cups cooked rice,411,8,89,1
1 garlic,4,0,0,0
1 onion,32,1,8,0
1 tsp oil,40,0,0,5
4 oz chicken,155,29,0,2
1 tsp soy sauce,1,0,0,0


In [27]:
# With 'Food' as the index, a single label returns that row as a Series.
df.loc['1 onion']

Calories    32
Protein      1
Carbs        8
Fat          0
Name: 1 onion, dtype: int64

In [28]:
# A list of labels returns a DataFrame containing those rows, in the order listed.
df.loc[['1 onion','4 oz chicken']]

Unnamed: 0_level_0,Calories,Protein,Carbs,Fat
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 onion,32,1,8,0
4 oz chicken,155,29,0,2


In [29]:
df.loc[:'4 oz chicken']   # label slice: the end label is INCLUDED (all rows up to and including chicken)

Unnamed: 0_level_0,Calories,Protein,Carbs,Fat
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2 cups cooked rice,411,8,89,1
1 garlic,4,0,0,0
1 onion,32,1,8,0
1 tsp oil,40,0,0,5
4 oz chicken,155,29,0,2


In [32]:
# Plain column selection; the resulting Series is indexed by 'Food'.
df['Protein']

Food
2 cups cooked rice     8
1 garlic               0
1 onion                1
1 tsp oil              0
4 oz chicken          29
1 tsp soy sauce        0
Name: Protein, dtype: int64

### Splitting into subsets (random)  

In [38]:
# Split the rows at random into two disjoint subsets (frac=0.6 of the rows vs.
# the rest); random_state makes the draw repeatable.
#
# An earlier cell moves 'Food' into the index. Restore it as a regular column
# (with the default integer index) so the subsets look the same whether or not
# that cell has been run, instead of depending on cell execution order.
df_foods = df.reset_index() if df.index.name == "Food" else df
df_foods_1 = df_foods.sample(frac=0.6, random_state=999)
# Everything that was not drawn above; dropping by label relies on unique index labels.
df_foods_2 = df_foods.drop(df_foods_1.index)

In [39]:
# The randomly sampled subset (frac=0.6 of 6 rows gives 4 rows), in sampled order.
df_foods_1

Unnamed: 0,Food,Calories,Protein,Carbs,Fat
2,1 onion,32,1,8,0
5,1 tsp soy sauce,1,0,0,0
3,1 tsp oil,40,0,0,5
1,1 garlic,4,0,0,0


In [40]:
# The remaining rows that were not sampled.
df_foods_2

Unnamed: 0,Food,Calories,Protein,Carbs,Fat
0,2 cups cooked rice,411,8,89,1
4,4 oz chicken,155,29,0,2
