In [12]:
import numpy as np
import pandas as pd

# Pandas and Numpy Introduction

With the introduction to Python, logic, and programming finished, we now move on to the pandas and NumPy packages.

## Numpy and Pandas

Numpy and Pandas are the two biggest packages out there for data science.

### Numpy Arrays

In [None]:
# A plain Python list; the next cell passes it to np.array().
list1 = [1, 2, 3, 4]

In [14]:
# Build a one-dimensional NumPy array from the Python list defined above.
array1 = np.array(list1)
# Arrays print space-separated, without the commas a list would show.
print(array1)

[1 2 3 4]


Sometimes we want a two-dimensional array:

In [None]:
# Each inner list becomes one row, so two lists of three values
# produce an array with 2 rows and 3 columns.
list2 = [
    [1, 2, 3],
    [4, 5, 6],
]
array2 = np.array(list2)
print(array2)

[[1 2 3]
 [4 5 6]]


NumPy makes it easy to perform all kinds of operations on these arrays:

In [None]:
# Subtracting a scalar applies to every element at once, with no loop.
# np.subtract(a, b) is the function form of the `a - b` operator.
toyPrices = np.array([5, 8, 3, 6])
print(np.subtract(toyPrices, 2))

[3 6 1 4]


### Pandas `Series`

This is a data structure that is commonly used in Pandas and has some special considerations found at [the API reference](https://pandas.pydata.org/docs/reference/api/pandas.Series.html).

In [None]:
# Wrap a NumPy array of ages in a Series. No index is supplied, so pandas
# labels the entries with the default positions 0, 1, 2.
ages = np.array([13, 25, 19])
series1 = pd.Series(data=ages)
print(series1)

0    13
1    25
2    19
dtype: int64


In [None]:
# Same ages as before, but each entry is labelled with the matching person's
# name instead of a default integer position.
ages = np.array([13, 25, 19])
series1 = pd.Series(
    data=ages,
    index=['Emma', 'Swetha', 'Serajh'],
)
print(series1)

Emma      13
Swetha    25
Serajh    19
dtype: int64


### Pandas `DataFrame`

From Codecademy:

> Another important type of object in the pandas library is the DataFrame. This object is similar in form to a matrix as it consists of rows and columns. Both rows and columns can be indexed with integers or String names. One DataFrame can contain many different types of data types, but within a column, everything has to be the same data type. A column of a DataFrame is essentially a Series. All columns must have the same number of elements (rows).

The really key part here: we can make the `DataFrame` from all kinds of sources. Most likely, I would be dealing with CSVs and SQL-like databases.

In [None]:
# Build the frame record by record: every inner list is one row, and
# `columns` names the fields in the order they appear inside each row.
dataf = pd.DataFrame(
    data=[
        ['John Smith', '123 Main St', 34],
        ['Jane Doe', '456 Maple Ave', 28],
        ['Joe Schmo', '789 Broadway', 51],
    ],
    columns=['name', 'address', 'age'],
)

In [20]:
# print() shows the frame as plain text, with the default 0..2 row labels on the left.
print(dataf)

         name        address  age
0  John Smith    123 Main St   34
1    Jane Doe  456 Maple Ave   28
2   Joe Schmo   789 Broadway   51


In [21]:
# Use the 'name' column as the row labels. The result is only displayed, not
# assigned, so `dataf` itself keeps its original integer index.
dataf.set_index('name')

Unnamed: 0_level_0,address,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1
John Smith,123 Main St,34
Jane Doe,456 Maple Ave,28
Joe Schmo,789 Broadway,51


In [22]:
# Any column can serve as the index; here 'age' becomes the row labels.
# `dataf` is again left unchanged because the result is not assigned back.
dataf.set_index('age')

Unnamed: 0_level_0,name,address
age,Unnamed: 1_level_1,Unnamed: 2_level_1
34,John Smith,123 Main St
28,Jane Doe,456 Maple Ave
51,Joe Schmo,789 Broadway


## Resource

A great resource for checking out some interesting parts of [pandas](https://pandas.pydata.org/docs/user_guide/10min.html).

## Summary

In [None]:
import pandas as pd

# One row per month, with a figure for each of the four clinics.
df = pd.DataFrame(
    data=[
        ['January', 100, 100, 23, 100],
        ['February', 51, 45, 145, 45],
        ['March', 81, 96, 65, 96],
        ['April', 80, 80, 54, 180],
        ['May', 51, 54, 54, 154],
        ['June', 112, 109, 79, 129],
    ],
    columns=[
        'month',
        'clinic_east',
        'clinic_north',
        'clinic_south',
        'clinic_west',
    ],
)

# Label-based selection of rows 1, 3 and 5; the subset keeps those labels.
df2 = df.loc[[1, 3, 5]]
print(df2)

# Taken while df2 still has labels 1/3/5, so they are preserved in a new
# 'index' column next to a fresh 0..2 index.
df3 = df2.reset_index()

# drop=True throws the old labels away instead of keeping them as a column.
# Reassigning the result gives the same frame as inplace=True.
df2 = df2.reset_index(drop=True)
print(df2)
print(df3)

      month  clinic_east  clinic_north  clinic_south  clinic_west
1  February           51            45           145           45
3     April           80            80            54          180
5      June          112           109            79          129
      month  clinic_east  clinic_north  clinic_south  clinic_west
0  February           51            45           145           45
1     April           80            80            54          180
2      June          112           109            79          129
   index     month  clinic_east  clinic_north  clinic_south  clinic_west
0      1  February           51            45           145           45
1      3     April           80            80            54          180
2      5      June          112           109            79          129


In [25]:
import pandas as pd

# Load the orders table from the CSV file and preview the first five rows.
orders = pd.read_csv('shoefly.csv')
print(orders.head())

# Select a single column by attribute access.
emails = orders.email

# Combine row conditions with & (and). Each comparison needs its own
# parentheses because & binds more tightly than ==.
frances_palmer = orders[(orders.first_name == 'Frances')
                        & (orders.last_name == 'Palmer')]

# Match any of several values by chaining | (or) ...
comfy_shoes = orders[(orders.shoe_type == 'clogs')
                     | (orders.shoe_type == 'boots')
                     | (orders.shoe_type == 'ballet flats')]

# ... or, equivalently and more compactly, with isin(). The labels must match
# the data exactly: the value is 'ballet flats' (with a space); the previous
# 'ballet_flats' matched nothing and silently dropped those orders.
comfy_shoes = orders[orders.shoe_type.isin(['clogs', 'boots', 'ballet flats'])]

      id first_name last_name                         email     shoe_type  \
0  54791    Rebecca   Lindsay  RebeccaLindsay57@hotmail.com         clogs   
1  53450      Emily     Joyce        EmilyJoyce25@gmail.com  ballet flats   
2  91987      Joyce    Waller        Joyce.Waller@gmail.com       sandals   
3  14437     Justin  Erickson   Justin.Erickson@outlook.com         clogs   
4  79357     Andrew     Banks              AB4318@gmail.com         boots   

  shoe_material shoe_color  
0  faux-leather      black  
1  faux-leather       navy  
2        fabric      black  
3  faux-leather        red  
4       leather      brown  
