# Pandas Crash Course

In [2]:
import numpy as np
import pandas as pd

## Basics

### Creating a DataFrame

In [7]:
# Raw values for the table: five rows, two values per row.
data = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 10],
])
# One label per array column, in left-to-right order.
columns = ['temperature', 'activity']

# Wrap the array in a DataFrame so each column can be addressed by its label.
dataframe = pd.DataFrame(data, columns=columns)
print(dataframe)

   temperature  activity
0            1         2
1            3         4
2            5         6
3            7         8
4            9        10


### Adding a New Column to a DataFrame

In [8]:
# Assigning to a column label that does not exist yet appends a new column.
# Here every 'activity' value is shifted up by 2, element by element.
dataframe['adjusted'] = dataframe['activity'].add(2)
print(dataframe)

   temperature  activity  adjusted
0            1         2         4
1            3         4         6
2            5         6         8
3            7         8        10
4            9        10        12


### Specifying a Subset of a DataFrame

In [12]:
# Positional slice of the first three rows; the stop position (3) is excluded.
print('Rows 0, 1, and 2:')
print(dataframe.iloc[:3])

Rows 0, 1, and 2:
   temperature  activity  adjusted
0            1         2         4
1            3         4         6
2            5         6         8


In [13]:
# iloc with a single integer selects one row by position; it prints as a
# Series whose entries are labelled with the DataFrame's column names.
print('Row 2:', dataframe.iloc[2], sep='\n')

Row 2:
temperature    5
activity       6
adjusted       8
Name: 2, dtype: int64


In [14]:
# Positional slice: starts at row 1 and stops before row 4.
print('Rows 1, 2, and 3:')
print(dataframe.iloc[1:4])

Rows 1, 2, and 3:
   temperature  activity  adjusted
1            3         4         6
2            5         6         8
3            7         8        10


In [15]:
# Select every row (':') of the single column labelled 'temperature'.
print('Temperature column')
print(dataframe.loc[:, 'temperature'])

Temperature column
0    1
1    3
2    5
3    7
4    9
Name: temperature, dtype: int64


## DataFrame Practice

Do the following:

1. Create a 3x4 pandas DataFrame in which the columns are named `Eleanor`,
   `Chidi`, `Tahani`, and `Jason`. Populate each of the 12 cells in the
   DataFrame with a random integer between 0 and 100, inclusive.
1. Output the following:
   - The entire DataFrame.
   - The value in the cell of row 1 (rows are numbered from 0) of the Eleanor column.
1. Create a fifth column named `Janet`, which is populated with the row-by-row
   sums of `Tahani` and `Jason`.

In [27]:
# Column labels requested by the exercise.
columns = ['Eleanor', 'Chidi', 'Tahani', 'Jason']

# randint excludes its upper bound, so high=101 yields integers from 0 to 100.
data = np.random.randint(low=0, high=101, size=(3, 4))

dataframe = pd.DataFrame(data=data, columns=columns)

# Show the whole table, then the single cell at row 1 of the 'Eleanor' column.
print(dataframe)
print(dataframe.at[1, 'Eleanor'])

# Fifth column: row-by-row sum of the 'Tahani' and 'Jason' columns.
dataframe['Janet'] = dataframe['Tahani'].add(dataframe['Jason'])
print(dataframe)

   Eleanor  Chidi  Tahani  Jason
0       82     55       3     77
1       25     41      48     64
2       68     47      67     15
25
   Eleanor  Chidi  Tahani  Jason  Janet
0       82     55       3     77     80
1       25     41      48     64    112
2       68     47      67     15     82


Pandas provides two different ways to duplicate a DataFrame:

- **Referencing**: If you assign a DataFrame to a new variable, both names refer
  to the same underlying object, so any change made through one name is
  reflected in the other.
- **Copying**: If you call the `pd.DataFrame.copy` method, you create a true
  independent copy. Changes to the original DataFrame or to the copy will not be
  reflected in the other.

In [32]:
# NOTE: this cell changes `dataframe` in place (+5 twice), so running it again
# starts from the already-updated value.

print('Copying by reference:')
# Plain assignment only binds a second name to the same DataFrame object.
reference = dataframe

print('\tStarting value of dataframe:', dataframe.at[1, 'Jason'])
print('\tStarting value of reference:', reference.at[1, 'Jason'], end='\n\n')

# Update the cell through the original name only.
dataframe.at[1, 'Jason'] += 5

print('\tUpdated value of dataframe:', dataframe.at[1, 'Jason'])
print('\tUpdated value of reference:', reference.at[1, 'Jason'], end='\n\n')

print('Copying by value')
# .copy() produces a separate DataFrame holding the current values.
copy = dataframe.copy()

print('\tStarting value of dataframe:', dataframe.at[1, 'Jason'])
print('\tStarting value of copy:', copy.at[1, 'Jason'], end='\n\n')

# Update the original again; only `dataframe` is written to here.
dataframe.at[1, 'Jason'] += 5

print('\tUpdated value of dataframe:', dataframe.at[1, 'Jason'])
print('\tUpdated value of copy:', copy.at[1, 'Jason'], end='\n\n')

Copying by reference:
	Starting value of dataframe: 79
	Starting value of reference: 79

	Updated value of dataframe: 84
	Updated value of reference: 84

Copying by value
	Starting value of dataframe: 84
	Starting value of copy: 84

	Updated value of dataframe: 89
	Updated value of copy: 84

