# ENVIRONMENT

In [4]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2, whiten
from sklearn.cluster import KMeans
from sklearn import metrics
import math
import acquire

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# Modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# ACQUIRE

In [5]:
# Each CSV filename is handed to the project-local `acquire` module, whose
# implementation is not part of this notebook.
# NOTE(review): presumably read_data returns a pandas DataFrame (df2 is used
# as one in the PREPARE section) - confirm against acquire.py.
df1 = acquire.read_data('data08.csv')
df2 = acquire.read_data('data10.csv')
# NOTE(review): presumably a combination of the two files - confirm how
# acquire.join_data merges them before relying on df3.
df3 = acquire.join_data('data08.csv', 'data10.csv')

# PREPARE

In [9]:
def missing_values_col(df):
    """
    Build a per-column report of missing data.

    Three kinds of "missing" are counted separately for every column:
    real nulls (``isnull``), blank strings (``''`` or ``' '``) and the
    literal strings ``'nan'`` / ``'NaN'``.  Each count is paired with its
    share of the total number of rows, expressed as a percentage.

    Returns a DataFrame indexed by column name with the columns
    num_missing, missing_percentage, num_empty, empty_percentage,
    nan_count and nan_percentage (in that order).
    """
    total_rows = df.shape[0]

    def share_of_rows(counts):
        # Turn a per-column count into a percentage of all rows.
        return (counts / total_rows) * 100

    nulls = df.isnull().sum()
    blanks = pd.Series(((df == ' ') | (df == '')).sum())
    nan_text = pd.Series(((df == 'nan') | (df == 'NaN')).sum())

    report = pd.DataFrame({
        'num_missing': nulls,
        'missing_percentage': share_of_rows(nulls),
        'num_empty': blanks,
        'empty_percentage': share_of_rows(blanks),
        'nan_count': nan_text,
        'nan_percentage': share_of_rows(nan_text),
    })
    return report


def missing_values_row(df):
    """
    Build a per-row report of null values.

    Returns a DataFrame sharing ``df``'s index with two columns:
    ``num_missing`` (how many cells in the row are null) and
    ``percentage`` (that count as a percentage of the number of columns).
    """
    n_columns = df.shape[1]
    nulls_per_row = df.isnull().sum(axis=1)

    report = pd.DataFrame({'num_missing': nulls_per_row})
    report['percentage'] = (nulls_per_row / n_columns) * 100
    return report


def handle_missing_threshold(df, prop_required_column = .3, prop_required_row = .9):
    """
    Drop sparse columns, then sparse rows, from ``df`` in place.

    A column is kept only if it holds at least
    ``round(prop_required_column * number_of_rows)`` non-null values.
    After that, a row is kept only if it holds at least
    ``round(prop_required_row * number_of_remaining_columns)`` non-null
    values.

    The frame passed in is modified (``inplace=True``) and that same
    object is returned.
    """
    # Column pass: minimum count of non-null cells a column must have.
    min_filled_per_column = int(round(prop_required_column * df.shape[0], 0))
    df.dropna(axis=1, thresh=min_filled_per_column, inplace=True)

    # Row pass: evaluated after the column pass, so the requirement is
    # relative to the columns that are still present.
    min_filled_per_row = int(round(prop_required_row * df.shape[1], 0))
    df.dropna(axis=0, thresh=min_filled_per_row, inplace=True)

    return df


def count_vals(column, frame=None):
    """
    Return the value counts (NaN included) for one column.

    Parameters
    ----------
    column : label
        Name of the column to tabulate.
    frame : DataFrame, optional
        Frame to read the column from.  When omitted, the notebook-level
        ``df`` is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    Series
        ``value_counts(dropna=False)`` of the selected column.
    """
    if frame is None:
        # Backward-compatible fallback: the global ``df`` is looked up at
        # call time, so it must already be defined when this path is taken.
        frame = df
    return frame[column].value_counts(dropna=False)

def summarize_data(df):
    """
    Print a plain-text overview of ``df`` to stdout.

    Sections appear in this order, each introduced by its header:
    HEAD, TAIL, SHAPE, DESCRIPTION (``describe()``), INFORMATION
    (``info()``) and VALUE COUNTS.

    In the VALUE COUNTS section every column is tabulated with NaN counted
    as a value.  Columns whose dtype is int64 or float64 and that have more
    than 10 distinct values are grouped into 10 bins; all other columns are
    listed value by value.

    Returns None.
    """
    print(f'HEAD\n{df.head()}', end='\n\n')
    print(f'TAIL\n{df.tail()}', end='\n\n')
    print(f'SHAPE: {df.shape}', end='\n\n')
    print(f'DESCRIPTION\n{df.describe()}', end='\n\n')

    # The header is printed before info() so that it labels the output that
    # follows, the same way every other section does.
    print('INFORMATION')
    df.info()
    print()

    print('VALUE COUNTS', end='\n\n')
    for col in df.columns:
        # unique() keeps NaN, so a missing value counts as one distinct value.
        n_distinct = df[col].unique().shape[0]
        print(f'{col}:')
        if df[col].dtype in ['int64', 'float64'] and n_distinct > 10:
            # High-cardinality numeric column: summarise as 10 bins in
            # interval order instead of listing every value.
            print(df[col].value_counts(bins=10, sort=False, dropna=False))
        else:
            print(df[col].value_counts(dropna=False))
        print('\n')


def plot_hist(df):
    """
    Draw a histogram for each variable of ``df`` via ``df.hist``.

    The figure is 24 x 20 and every histogram uses 20 bins.
    Nothing is returned.
    """
    hist_options = {'figsize': (24, 20), 'bins': 20}
    df.hist(**hist_options)

In [10]:
# Keep only the 25 columns used from here on.  DataFrame.filter(items=...)
# keeps the labels it finds and silently skips any that are absent, so a
# misspelled name shows up as a missing column rather than an error.
df = df2.filter(
    items=[
        'INCIDENT', 'FORCED', 'USOBJ', 'THREAT', 'SLAP',
        'PUNCH', 'BEATING', 'TWEAPON', 'UWEAPON', 'FSEXONLY',
        'MISCARG', 'RESTRAIN', 'CHOKED', 'RDRUNK', 'NDRUNK',
        'BOTHDRUN', 'POT', 'RPOT', 'NPOT', 'DRUGS',
        'RDRUGS', 'NDRUGS', 'BOTHDRUG', 'SUBSTANC', 'SEVEREST',
    ],
    axis='columns',
)

In [11]:
# Sanity check on the selection: 25 column names were requested, so the
# second element of the shape should be 25.
df.shape

(4978, 25)

In [12]:
# Print the text overview of the selected columns: head, tail, shape,
# describe(), info() and per-column value counts.
summarize_data(df)

HEAD
   INCIDENT  FORCED  USOBJ  THREAT  SLAP  PUNCH  BEATING  TWEAPON  UWEAPON  \
0         2       0      2       0     0      1        0        0        0   
1         1       0      2       0     1      0        0        0        0   
2         4       0      1       0     0      0        0        1        0   
3         1       0      2       0     1      0        0        0        0   
4         3       0      2       0     0      0        1        0        0   

   FSEXONLY    ...     BOTHDRUN  POT  RPOT  NPOT  DRUGS  RDRUGS  NDRUGS  \
0         0    ...            0    0     0     0      0       0       0   
1         0    ...            0    0     0     0      0       0       0   
2         0    ...            0    0     0     0      0       0       0   
3         0    ...            0    0     0     0      0       0       0   
4         0    ...            0    0     0     0      0       0       0   

   BOTHDRUG  SUBSTANC  SEVEREST  
0         0         1         2  
1      