# Data Analysis
*Reference Notebook for Data Analysis in Python*

## Package Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Wrangling

### Data Sources

- CSV
- Excel
- JSON
- Database (SQL)
- Database (NoSQL)
- Web (HTML)
- API
- Python Dictionary
- Python List
- NumPy Array
- Pandas Series

In [2]:
# Filenames
# Relative paths to the CSV, Excel and JSON copies of the same dataset
csv_filename, excel_filename, json_filename = (
    'data/2020_CFS.csv',
    'data/2020_CFS.xlsx',
    'data/2020_CFS.json',
)

In [3]:
# Importing CSV
# The common read_csv options are written out explicitly so this cell doubles
# as a quick reference; the last expression previews the first rows.
data = pd.read_csv(
    filepath_or_buffer=csv_filename,
    sep=',',
    header=0,
    names=None,  # array-like for column names if no headers in data
    index_col=None,
    usecols=None,  # list-like or callable to return subset of columns
    dtype=None,  # type-name or dict of column: type
    skiprows=None,
    nrows=None,
    na_values=None,  # additional strings to recognize as NA/NaN
    # on_bad_lines (pandas >= 1.3) replaces error_bad_lines / warn_bad_lines,
    # which were removed in pandas 2.0:
    #   'error' -> raise on a malformed line (same as the old defaults)
    #   'warn'  -> skip the line and emit a warning
    #   'skip'  -> skip the line silently
    on_bad_lines='error',
)
data.head()

Unnamed: 0,NOPD_Item,Type,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0000120,94F,FIREWORKS,1A,103,DISTURBANCE (OTHER),1C,3677228,550814,01/01/2020 12:00:34 AM,...,,01/01/2020 06:53:08 AM,NAT,Necessary Action Taken,N,3N01,001XX Blk Riviera Ave,70122,3,POINT (-90.0808922 30.0086791)
1,A0000220,21,COMPLAINT OTHER,1J,21,COMPLAINT OTHER,1J,3668710,533007,01/01/2020 12:00:42 AM,...,01/01/2020 12:00:42 AM,01/01/2020 01:37:16 AM,NAT,Necessary Action Taken,Y,2U04,034XX Broadway St,70125,2,POINT (-90.10840522 29.95996774)
2,A0000320,94F,FIREWORKS,1A,94F,FIREWORKS,2J,3674930,533982,01/01/2020 12:01:05 AM,...,01/01/2020 02:08:17 AM,01/01/2020 02:34:36 AM,NAT,Necessary Action Taken,N,1H02,026XX Banks St,70119,1,POINT (-90.08872937 29.96246347)
3,A0000420,94,DISCHARGING FIREARM,2D,94,DISCHARGING FIREARM,2D,3681805,536653,01/01/2020 12:02:50 AM,...,01/01/2020 12:09:13 AM,01/01/2020 12:13:45 AM,GOA,GONE ON ARRIVAL,N,1A01,Kerlerec St & N Robertson St,70116,1,POINT (-90.0669267 29.96960271)
4,A0000520,94F,FIREWORKS,1A,94F,FIREWORKS,2J,3668697,542174,01/01/2020 12:03:46 AM,...,,01/01/2020 12:42:13 AM,NAT,Necessary Action Taken,N,3I01,053XX Memphis St,70124,3,POINT (-90.10813674 29.98517428)


In [29]:
# Importing Excel
# NOTE: an Excel engine must be installed; for .xlsx files current pandas
# uses openpyxl (xlrd >= 2.0 only reads legacy .xls workbooks)
data = pd.read_excel(
    excel_filename,
    sheet_name=0,  # first sheet; accepts str, int, list, or None (read all sheets)
)
data.head()

Unnamed: 0,NOPD_Item,Type,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0000120,94F,FIREWORKS,1A,103,DISTURBANCE (OTHER),1C,3677228,550814,2020-01-01 00:00:34,...,NaT,2020-01-01 06:53:08,NAT,Necessary Action Taken,N,3N01,001XX Blk Riviera Ave,70122,3,POINT (-90.0808922 30.0086791)
1,A0000220,21,COMPLAINT OTHER,1J,21,COMPLAINT OTHER,1J,3668710,533007,2020-01-01 00:00:42,...,2020-01-01 00:00:42,2020-01-01 01:37:16,NAT,Necessary Action Taken,Y,2U04,034XX Broadway St,70125,2,POINT (-90.10840522 29.95996774)
2,A0000320,94F,FIREWORKS,1A,94F,FIREWORKS,2J,3674930,533982,2020-01-01 00:01:05,...,2020-01-01 02:08:17,2020-01-01 02:34:36,NAT,Necessary Action Taken,N,1H02,026XX Banks St,70119,1,POINT (-90.08872937 29.96246347)
3,A0000420,94,DISCHARGING FIREARM,2D,94,DISCHARGING FIREARM,2D,3681805,536653,2020-01-01 00:02:50,...,2020-01-01 00:09:13,2020-01-01 00:13:45,GOA,GONE ON ARRIVAL,N,1A01,Kerlerec St & N Robertson St,70116,1,POINT (-90.0669267 29.96960271)
4,A0000520,94F,FIREWORKS,1A,94F,FIREWORKS,2J,3668697,542174,2020-01-01 00:03:46,...,NaT,2020-01-01 00:42:13,NAT,Necessary Action Taken,N,3I01,053XX Memphis St,70124,3,POINT (-90.10813674 29.98517428)


In [5]:
# Importing JSON
# Load the JSON copy into a DataFrame, then preview its first five rows
data = pd.read_json(path_or_buf=json_filename)
data.head(n=5)

Unnamed: 0,NOPD_Item,Type,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0000120,94F,FIREWORKS,1A,103,DISTURBANCE (OTHER),1C,3677228,550814,01/01/2020 12:00:34 AM,...,,01/01/2020 06:53:08 AM,NAT,Necessary Action Taken,N,3N01,001XX Blk Riviera Ave,70122,3,POINT (-90.0808922 30.0086791)
1,A0000220,21,COMPLAINT OTHER,1J,21,COMPLAINT OTHER,1J,3668710,533007,01/01/2020 12:00:42 AM,...,01/01/2020 12:00:42 AM,01/01/2020 01:37:16 AM,NAT,Necessary Action Taken,Y,2U04,034XX Broadway St,70125,2,POINT (-90.10840522 29.95996774)
2,A0000320,94F,FIREWORKS,1A,94F,FIREWORKS,2J,3674930,533982,01/01/2020 12:01:05 AM,...,01/01/2020 02:08:17 AM,01/01/2020 02:34:36 AM,NAT,Necessary Action Taken,N,1H02,026XX Banks St,70119,1,POINT (-90.08872937 29.96246347)
3,A0000420,94,DISCHARGING FIREARM,2D,94,DISCHARGING FIREARM,2D,3681805,536653,01/01/2020 12:02:50 AM,...,01/01/2020 12:09:13 AM,01/01/2020 12:13:45 AM,GOA,GONE ON ARRIVAL,N,1A01,Kerlerec St & N Robertson St,70116,1,POINT (-90.0669267 29.96960271)
4,A0000520,94F,FIREWORKS,1A,94F,FIREWORKS,2J,3668697,542174,01/01/2020 12:03:46 AM,...,,01/01/2020 12:42:13 AM,NAT,Necessary Action Taken,N,3I01,053XX Memphis St,70124,3,POINT (-90.10813674 29.98517428)


### Data Cleaning

- Column DataTypes
- Missing Values
- Replacing Values
- Duplicate Values
- Categorical Variables
- Unit Conversion

### Data Manipulation

- Merging DataFrames
- Indexes
- Grouping
- Pivoting
- Melting
- Stacking
- Unstacking
- Filtering

## Data Visualization

- Pseudocolor Plots
- Scatterplots
- Distributions/Regressions
- ECDF
- Bar Plot
- Histogram
- Time-series Plot

## Inferential Statistics

- Summary Statistics (Mean, Median, Mode)
- Percentiles, Outliers
- Variance, Standard Deviation
- Covariance, Pearson Correlation Coefficient
- Binomial Distribution
- Poisson Distribution
- PDF
- CDF
- Linear Regression
- Confidence Intervals
- Hypothesis Testing
- Statistical Power