# Pandas and Numpy Features for Data Cleaning and Manipulation
### This notebook will give an introduction to manipulating data with Pandas and Numpy

In [2]:
#Importing and naming the packages
import pandas as pd
import numpy as np

# Lists

In [None]:
#Lists - an ordered, changeable series of data (Integer, Float, Objects, etc.)
#Here the list holds two strings
a = ['Hello','World']
print(a)

#### Lists can be changed in place using Python's built-in list methods.

In [None]:
#Via appending a single item to the end (the list is modified in place)
a.append('!')
print(a)

In [None]:
#Or extending with multiple items at once; each item is added to the end, in order
a.extend(['I','Am','A','List','.'])
print(a)

In [None]:
#You can also remove an item by value (only the first matching entry is removed)
a.remove('World')
print(a)

In [None]:
#Or put one back in at a chosen position - insert(index, item)
#Note: this inserts the string 'Hello' at index 1
a.insert(1,'Hello')
print(a)

In [None]:
#Count the number of entries equal to a given value
a.count('Hello')

In [None]:
#Organize by Value - sort() reorders the list in place and returns None
a.sort()
print(a)

In [None]:
#Switch the order around - reverse() also works in place
a.reverse()
print(a)

In [None]:
#Grab a single item - pop(0) removes the first item from the list and returns it
a.pop(0)

In [None]:
#Or clear the entire thing, leaving an empty list
a.clear()
print(a)

### Practice
#### Create an empty list. Add a set of five numbers to it. Then, iterate through that list and multiply each number by 2. Finally, sort the values and print them.

In [None]:
#Start from an empty list and add five numbers to it
a = []
a.extend([1,2,3,4,5])

#Iterate through the list, doubling each number in place.
#enumerate gives both the position and the value, so no manual indexing is needed.
for i, value in enumerate(a):
    a[i] = value*2

#Sort the values (in place) and print them
a.sort()
print(a)

# Numpy
### Numpy arrays are similar. They are also an ordered series of items.

In [None]:
#They can be created directly from Lists
#np.array copies the list's values into a one-dimensional array
a = np.array([1, 2, 3, 4, 5])
print(a)

In [None]:
#And have multiple dimensions
#A list of lists becomes a 2 x 2 array: each inner list is one row
a = np.array([[1,2],[3,4]])
print(a)

In [5]:
#And have many more features
#Creating zeros (10 entries, stored as floats)
a = np.zeros(10, dtype=float)
print(a,': zeros\n')

#Creating ones (the tuple is the shape: 3 rows by 3 columns)
b = np.ones((3, 3))
print(b,': ones\n')

#For any other number
c = np.full((3, 3), 2.92)
print(c,': 2.92s\n')

#For a range of numbers (the stop value is excluded, so this yields 1 through 10)
d = np.arange(1, 11)
print(d,': 1 - 11\n')

#For evenly spaced between two numbers
#The third argument is the number of points and must be an integer;
#passing a float such as np.pi raises a TypeError in current NumPy.
e = np.linspace(0, np.pi*2, 3)
print(e,': linearly spaced\n')

#For Random Numbers (seeding makes the random values below repeatable)
np.random.seed(123)
f = np.random.random(5)
print(f,': random numbers\n')

#For Normal Random (drawn from the standard normal distribution)
g = np.random.randn(5)
print(g,': normal random\n')

#Random integers between 0 and 10 (0 included, 10 excluded)
h = np.random.randint(0, 10, size=5)
print(h,': integer random')

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] : zeros

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]] : ones

[[2.92 2.92 2.92]
 [2.92 2.92 2.92]
 [2.92 2.92 2.92]] : 2.92s

[ 1  2  3  4  5  6  7  8  9 10] : 1 - 11

[0.         3.14159265 6.28318531] : linearly spaced

[0.69646919 0.28613933 0.22685145 0.55131477 0.71946897] : random numbers

[ 0.32210607 -0.05151772 -0.20420096  1.97934843 -1.61930007] : normal random

[9 3 4 0 0] : integer random




In [None]:
#You can also apply computations over entire arrays
#The arithmetic and np.sin are applied to every element at once, without an explicit loop
a = np.array([1, 2, 3, 4, 5])
b = np.sin(a*np.pi/2)
print(b)

### Practice
#### Create an array of the numbers 1 - 100 and apply a modulus of 3 to the numbers. Print the results.

In [None]:
#np.arange excludes its stop value, so the stop must be 101 to include 100
remainders = np.arange(1, 101) % 3
print(remainders)

# Pandas
### Pandas also has its own approach to list-like data with a label (automatically or manually applied) and a value

In [None]:
#Series from List (labels 0, 1, 2, ... are applied automatically)
a = pd.Series([1, 3, 5, 7, 10])
print(a)

#Series of Single Value (the value is repeated for every label in the index)
b = pd.Series(1, index=['A', 'B', 'C'])
print(b)

#Series from Dictionary (keys become the labels, values become the data)
c = pd.Series({1: 'A', 2: 'B', 3: 'C'})
print(c)

#Series of Random Integers (five values from 0 up to, but not including, 10)
d = pd.Series(np.random.randint(10, size=5))
print(d)

In [None]:
#Series can be calculated over without looping - the operation is applied to every value
b = a*2
print(b)

c = b%3
print(c)

In [None]:
#And also allow for quick checking of the labels in the Series
#Note: 'in' tests the index labels ('A', 'B'), not the stored values (1, 2)
a = pd.Series({'A':1,'B':2})
print('C' in a)
print('A' in a)

### More often though, your data will work better as a DataFrame. This is essentially the way Pandas represents tabular data, such as you would see in SQL or Excel.

In [None]:
#DataFrame from a List of Dictionaries (one dictionary per row; the keys become the column names)
df = pd.DataFrame([{'A': i, 'B': 2*i} for i in range(3)]) 
print(df)

#DataFrame from Series (the Series labels become the row index)
population_series = pd.Series({'California': 38332521,
                        'Texas': 26448193,
                        'New York': 19651127,
                        'Florida': 19552860})
population_df = pd.DataFrame(population_series, columns=['Population'])
print(population_df)

#DataFrame from Dictionary of Series (one column per key, rows aligned on the Series labels)
#'Illinois' only appears in area_series, so its Population entry is missing (NaN)
area_series = pd.Series({'California': 423967,
                  'Texas': 695662,
                  'New York': 141297,
                  'Florida': 170312,
                  'Illinois': 149995})
area_df = pd.DataFrame({'Population': population_series,
                  'area': area_series})
print(area_df)

In [None]:
#You can also very quickly create new columns and formulas
#Assigning to a column name that does not exist yet adds it; the formula runs on every row
df['C'] = df['B']*2
print(df)

In [None]:
#As well as dropping them (axis=1 means columns, axis=0 means rows)
#drop returns a new DataFrame, so the result is assigned back to df
df = df.drop('C', axis=1)
print(df)

### Through these main features of Pandas, you can create nearly anything you want. Let's work with some larger data.

Data can be imported by reading a CSV or just about any filetype. For example:

*data = pd.read_csv('~/Downloads/data.csv')*

This would read a CSV file named data.csv from your Downloads folder on a Mac.

In our case, we will import some data from another package.

In [None]:
from sklearn import datasets

#Load the iris dataset; .data holds the measurements and .feature_names the column labels
iris = datasets.load_iris()
data = iris.data
feature_names = iris.feature_names

#Wrap the measurements in a DataFrame with named columns and preview the first five rows
data = pd.DataFrame(data,columns=feature_names)
data.head(5)

### Let's learn a little more about this data

In [None]:
#This will give quick summary statistics about our columns
#(count, mean, standard deviation, minimum, quartiles and maximum for each numeric column)
data.describe()

### Let's work on selecting data out of these 150 points

In [None]:
#Grabbing a row. Syntax: 'From:To(:Step)' Steps optional, with empty values being non-limited
#'To' is excluded, so [:1] is just the first row
row = data[:1]
print(row)

#The first five rows (positions 0 through 4)
rows = data[0:5]
print("\n",rows)

#No 'To' value, so every row from position 0 onwards
all_rows = data[0:]
print("\n",len(all_rows))

#A step of 2 keeps every other row
half_rows = data[0::2]
print("\n",len(half_rows))

In [None]:
#More complicated operations can also be performed
#A step of -1 walks through the rows from last to first
reverse_rows = data[::-1]
#The first five reversed rows hold the same rows as the last five original rows, in opposite order
print(reverse_rows.head(5))
print(all_rows.tail(5))

### Data can also be treated more generally by using loc and iloc methods.
### First, in a Series

In [None]:
#Explicit Indexing - values are looked up by the labels supplied in 'index'
a = pd.Series([0.25, 0.5, 0.75, 1.0],
            index=['A', 'B', 'C', 'D'])
print(a)
#Square brackets and .loc both select by label here
print(a['A'])
print(a.loc['B'])

In [None]:
#Implicit Indexing - with no index supplied, labels 0, 1, 2, ... are created automatically
a = pd.Series([0.25, 0.5, 0.75, 1.0])
print(a)
#a[0] looks up the label 0, which here matches the first position
print(a[0])
#.iloc always selects by position, whatever the labels are
print(a.iloc[1])

### Now in a DataFrame

In [None]:
#Choosing Data using loc, using the same Syntax as above.
#The row labels are odd numbers, so labels and positions differ
a = pd.DataFrame(np.random.randint(10, size=10),
                index=[1,3,5,9,11,13,15,17,19,21])

#Using Explicit Index - .loc slices by label and includes the end label (rows labelled 1, 3, 5)
print("Explicit\n",a.loc[:5])

#Using Implicit Index - .iloc slices by position and excludes the end (the first five rows)
print("\nImplicit\n",a.iloc[:5])

### How about selecting data in higher dimensions?

In [None]:
#For the Iris data
#A single slice inside .iloc selects rows by position
rows = data.iloc[:5]
print('First five rows:\n',rows,'\n')

#With two slices the first selects rows and the second selects columns
col_rows = data.iloc[:5,:2]
print('First five rows, first two columns:\n',col_rows)

In [None]:
#Getting Exact Cells
#.iloc positions start at 0, so [2,2] is the third row and the third column
cell = data.iloc[2,2]
print('Third row, third column:\n',cell)

### So how can you change this data?
#### Numpy

In [None]:
#Numpy
#A 4 x 3 array of random integers from 0 up to, but not including, 20
a = np.random.randint(20, size=(4,3))
print(a)

In [None]:
#Numpy links back to the original when you subset your data - this will change the original
#Slicing returns a view onto the same memory rather than a copy
a_part = a[:2,:2]
a_part[0,0] = 2
#The top-left value of a is now 2 as well
print(a)

#### Pandas

In [None]:
#Pandas can behave differently. Whether a plain subset is linked to the original depends on
#the pandas version and settings, so take an explicit copy when the original must stay unchanged.
a = pd.DataFrame(a)
a_part = a.iloc[:2,:2].copy()
#Use .iloc to set a single cell by position; on a DataFrame, a_part[0,0] = 4 would
#instead add a new column labelled (0, 0) rather than change the top-left value.
a_part.iloc[0,0] = 4
#The original is untouched because a_part is an independent copy
print(a)

In [None]:
#But you can still explicitly go about changing these values by avoiding subsetting the data
#Assigning through .iloc on the DataFrame itself writes to the original
a.iloc[0,0] = 4
print(a)

### How then can you check the shape of your data to help locate points?

In [None]:
#Number of dimensions (a DataFrame has 2: rows and columns)
print("Iris Dimensions: ", data.ndim,"\n")

#(Rows,Columns)
print("Iris Shape: ", data.shape,"\n")

#Number of Elements (rows multiplied by columns)
print("Iris Size: ", data.size,"\n")

#Data Types (one entry per column)
print("Iris Types:\n", data.dtypes)

### Lastly, you can manipulate your data by adding Boolean restrictions.

#### Say you want to know how many of the petals are above average for length.

In [None]:
#Start by grabbing out the data (all rows of the column at position 2, the petal length)
lengths = pd.DataFrame(data.iloc[:,2])
#Note: this head(5) result is not shown, since it is neither printed nor the cell's last expression
lengths.head(5)
#Print the first row of the one-column DataFrame
print(lengths.iloc[0])

In [None]:
#Instead of:
#Counting the above-average values one at a time with an explicit loop.
above = 0
#Pull the single column out as a Series so each element is a plain number;
#calling float() on a one-element Series is deprecated in recent pandas versions.
petal_lengths = lengths.iloc[:,0]
mean = float(petal_lengths.mean())
for length in petal_lengths:
    if length > mean:
        above += 1
above

In [None]:
#You can:
#Compare the whole column to its mean at once, keep the rows where that is True,
#and count them with shape[0] (the number of rows)
data[data['petal length (cm)'] > data['petal length (cm)'].mean()].shape[0]

In [None]:
#And then you can add more restrictions:
#Combine conditions with & (and); each condition needs its own parentheses
#because & binds more tightly than the comparison operators
data[(data['petal length (cm)'] > 1) & (data['petal length (cm)'] < 2)].shape[0]

## Some Practice:
### Tell me how many observations have above average petal length and width.

### Tell me what the average petal length is for those observations with petal width above average.