# Data Analysis
*Reference Notebook for Data Analysis in Python*

## Package Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Python Basics Review

- Strings
- Lists
- Dicts
- Date/Time
- Control Structures
- Functions
- Lambda Functions
- Classes

### Strings

In [49]:
# Sample sentence used to demonstrate basic string handling
my_string = (
    'This string is my string, it is not your string, '
    'and it has commas, so deal with it.'
)
print(my_string)

This string is my string, it is not your string, and it has commas, so deal with it.


In [50]:
# Repeat: multiplying a string by an integer joins that many copies end to end
print(2 * my_string)

This string is my string, it is not your string, and it has commas, so deal with it.This string is my string, it is not your string, and it has commas, so deal with it.


In [51]:
# Membership: `in` reports whether a substring occurs (the match is case-sensitive)
my_string_test_A = 'A' in my_string
my_string_test_i = 'is' in my_string

# Conversion: a bool has to be turned into text with str() before concatenation
print("Is 'A' in my_string? " + str(my_string_test_A))
print("Is 'is' in my_string? " + str(my_string_test_i))

Is 'A' in my_string? False
Is 'is' in my_string? True


In [52]:
# Concatenation: `+` builds a new string; my_string itself is left unchanged
your_string = my_string + ' Not anymore!'
print(your_string)

This string is my string, it is not your string, and it has commas, so deal with it. Not anymore!


In [53]:
# Strings are zero-indexed; a slice stops just before its end index
short_string = my_string[0:24]  # characters at positions 0 through 23
print(my_string[0])
print(short_string)

T
This string is my string


In [54]:
# Various functions: each call returns a new value; short_string is not modified
print(
    short_string.upper(),
    short_string.lower(),
    short_string.count('i'),
    short_string.replace('i', 'u'),
    sep='\n'
)

THIS STRING IS MY STRING
this string is my string
4
Thus strung us my strung


In [55]:
# Formatting with variables: str.format() substitutes its argument for the {} placeholder
my_var = 'variables'
print('I can insert {} into strings.'.format(my_var))

I can insert variables into strings.


### Lists

In [90]:
# lists: an ordered, mutable sequence (here, five short name strings)
users = ['val', 'bob', 'mia', 'ron', 'ned']
print(users)

['val', 'bob', 'mia', 'ron', 'ned']


In [92]:
# Get first element (indexing starts at 0)
first_user = users[0]
print(first_user)

# Get second element
second_user = users[1]
print(second_user)

# Get last element (negative indexes count back from the end)
last_user = users[-1]
print(last_user)

# First three (an omitted start means "from the beginning"; index 3 is excluded)
print(users[:3])

# Middle three (indexes 1, 2 and 3 of this five-item list)
print(users[1:4])

# Last three (an omitted end means "through the final element")
print(users[-3:])

val
bob
ned
['val', 'bob', 'mia']
['bob', 'mia', 'ron']
['mia', 'ron', 'ned']


In [70]:
# Modify: assigning to an index overwrites that element in place
users[0], users[-2] = 'valerie', 'ronald'
print(users)

['valerie', 'bob', 'mia', 'ronald', 'ned']


In [71]:
# Append: add a single element to the end of the list (modifies it in place)
users.append('amy')
print(users)

['valerie', 'bob', 'mia', 'ronald', 'ned', 'amy']


In [72]:
# Insert: place an element at the given index, shifting later elements right
users.insert(0, 'joe')
users.insert(3, 'bea')  # index 3 is counted after 'joe' has already been inserted
print(users)

['joe', 'valerie', 'bob', 'bea', 'mia', 'ronald', 'ned', 'amy']


In [73]:
# Delete (by position): `del` removes the element at the given index
del users[-1]
print(users)

# Delete (by value): remove() deletes the first element equal to its argument
users.remove('mia')
print(users)

['joe', 'valerie', 'bob', 'bea', 'mia', 'ronald', 'ned']
['joe', 'valerie', 'bob', 'bea', 'ronald', 'ned']


In [74]:
# Popping: pop() removes an element and returns it; with no argument it takes the last one
most_recent_user = users.pop()
print(most_recent_user)

# pop(0) removes and returns the first element instead
first_user = users.pop(0)
print(first_user)

ned
joe


In [76]:
# Length: len() reports how many elements the list currently holds
print(users)
num_users = len(users)
print('There are {} total users.'.format(num_users))

['valerie', 'bob', 'bea', 'ronald']
There are 4 total users.


In [80]:
# Sorting - modifying list: sort() reorders the list in place
print(users)
users.sort()
print(users)
users.sort(reverse=True)  # in place again, this time in descending order
print(users)

# Sorting - copying list: sorted() returns a new sorted list and leaves users as is
print(sorted(users))

# Reverse: reverse() flips the current element order in place (it does not sort)
users.reverse()
print(users)

['valerie', 'bob', 'bea', 'ronald']
['bea', 'bob', 'ronald', 'valerie']
['valerie', 'ronald', 'bob', 'bea']
['bea', 'bob', 'ronald', 'valerie']
['bea', 'bob', 'ronald', 'valerie']


In [81]:
# Looping: visit each element of the list in order, binding it to `user`
for user in users:
    print(user)

bea
bob
ronald
valerie


In [87]:
# Range: range(stop) counts from 0 up to, but not including, stop
for number in range(5):
    print(number)

print()  # blank line between the two demonstrations

# range(start, stop) begins at start and still excludes stop
for number in range(1, 6):
    print(number)

print()

# list() materializes the range into an actual list
numbers = list(range(1, 11))
print(numbers)

0
1
2
3
4

1
2
3
4
5

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [94]:
# Comprehensions: build a list from an expression evaluated for every item
squares = [n * n for n in range(1, 11)]
print(squares)

names = ['kai', 'abe', 'ada', 'gus', 'zoe']
upper_names = [each.upper() for each in names]

# The source list is untouched; the comprehension produced a separate list
print(names)
print(upper_names)

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
['kai', 'abe', 'ada', 'gus', 'zoe']
['KAI', 'ABE', 'ADA', 'GUS', 'ZOE']


In [66]:
# Copying: plain assignment shares a list, while slicing and copy() duplicate it
bikes = ['trek', 'redline', 'giant']

# NOT a copy - also_bikes is a second name for the same list object
also_bikes = bikes

# Copies - each is a new list object holding the same elements
new_bikes = bikes[:]
copy_bikes = bikes.copy() # Python 3 (list.copy() does not exist in Python 2)

# Before any change, all four names print the same contents
print('bikes: ' + str(bikes))
print('also_bikes: ' + str(also_bikes))
print('new_bikes: ' + str(new_bikes))
print('copy_bikes: ' + str(copy_bikes))

# Appending through the alias also changes bikes; the two copies are unaffected
also_bikes.append('bmx')
print('bikes: ' + str(bikes))
print('also_bikes: ' + str(also_bikes))
print('new_bikes: ' + str(new_bikes))
print('copy_bikes: ' + str(copy_bikes))

bikes: ['trek', 'redline', 'giant']
also_bikes: ['trek', 'redline', 'giant']
new_bikes: ['trek', 'redline', 'giant']
copy_bikes: ['trek', 'redline', 'giant']
bikes: ['trek', 'redline', 'giant', 'bmx']
also_bikes: ['trek', 'redline', 'giant', 'bmx']
new_bikes: ['trek', 'redline', 'giant']
copy_bikes: ['trek', 'redline', 'giant']


## Data Wrangling

### Data Sources

- CSV
- Excel
- JSON
- Database (SQL)
- Database (NoSQL)
- Web (HTML)
- API

### Data Cleaning

- Column DataTypes
- Missing Values
- Replacing Values
- Duplicate Values
- Categorical Variables
- Unit Conversion

### Data Manipulation

- Merging DataFrames
- Indexes
- Grouping
- Pivoting
- Melting
- Stacking
- Unstacking
- Filtering

## Data Visualization

- Pseudocolor Plots
- Scatterplots
- Distributions/Regressions
- ECDF
- Bar Plot
- Histogram
- Time-series Plot

## Inferential Statistics

- Summary Statistics (Mean, Median, Mode)
- Percentiles, Outliers
- Variance, Standard Deviation
- Covariance, Pearson Correlation Coefficient
- Binomial Distribution
- Poisson Distribution
- PDF
- CDF
- Linear Regression
- Confidence Intervals
- Hypothesis Testing
- Statistical Power