# Guns in USA

https://www.atf.gov/resource-center/data-statistics

In [2]:
import pandas as pd
import numpy as np
import sidetable

<font color="cyan"> Data source: https://fivethirtyeight.com/features/gun-deaths-about/ </font>

<font color="cyan"> Loading the data</font>

In [3]:
# Read the gun-deaths CSV, using its first column as the row index.
df = pd.read_csv(filepath_or_buffer="full_data.csv", index_col=0)

<font color="cyan"> Exploring the data: I want to check the shape of our DataFrame (`df`), its unique values, possible typos, and the data types.</font>

In [4]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

Unnamed: 0,year,month,intent,police,sex,age,race,hispanic,place,education
1,2012,1,Suicide,0,M,34.0,Asian/Pacific Islander,100,Home,BA+
2,2012,1,Suicide,0,F,21.0,White,100,Street,Some college
3,2012,1,Suicide,0,M,60.0,White,100,Other specified,BA+
4,2012,2,Suicide,0,M,64.0,White,100,Home,BA+
5,2012,2,Suicide,0,M,31.0,White,100,Other specified,HS/GED


In [5]:
# (number of rows, number of columns) of the frame as loaded, before any cleaning.
df.shape

(100798, 10)

In [6]:
# Data type pandas inferred for each column when parsing the CSV.
df.dtypes

year           int64
month          int64
intent        object
police         int64
sex           object
age          float64
race          object
hispanic       int64
place         object
education     object
dtype: object

<font color="cyan"> All the types look correct.</font>

<font color="cyan"> Exploring nulls</font>

In [7]:
# Percentage of missing values per column (null count * 100 / number of rows),
# kept as a one-column DataFrame indexed by column name.
percent_missing = df.isna().sum().mul(100).div(len(df)).to_frame()
percent_missing

Unnamed: 0,0
year,0.0
month,0.0
intent,0.000992
police,0.0
sex,0.0
age,0.017857
race,0.0
hispanic,0.0
place,1.373043
education,1.410742


<font color= "cian">

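<font color="cyan"> No column has more than about 1.4% of nulls (`place` and `education` are the worst), so I consider the loss acceptable and drop every row that contains a null. This removes 2,783 of the 100,798 rows (about 2.8%).</font>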

In [8]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis=0, how="any", inplace=True)

<font color="cyan"> Are there duplicates?</font>

In [9]:
# Percentage of rows that exactly repeat an earlier row.
100 * df.duplicated().sum() / len(df)

39.190940162220066

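<font color="cyan"> About 39% of the rows are duplicates, but this is expected given the type of data: each record is described only by a handful of mostly categorical attributes, so distinct records can legitimately share the same values. I decide to keep them.</font>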

<font color="cyan"> The columns of our dataset</font>

In [10]:
# List the column labels of the frame.
df.columns

Index(['year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic',
       'place', 'education'],
      dtype='object')

<font color="cyan"> Unique values of each column (including the different categories or types of gun deaths)</font>

In [17]:
# Walk over every column and print its distinct values, to spot typos or
# unexpected categories.
for col, series in df.items():
    print(
        "\n In the variable ----------> ",
        col,
        "----- > These are the unique values:  ",
        series.unique(),
    )


 In the variable ---------->  year ----- > These are the unique values:   [2012 2013 2014]

 In the variable ---------->  month ----- > These are the unique values:   [ 1  2  3  4  5  6  7  8  9 10 11 12]

 In the variable ---------->  intent ----- > These are the unique values:   ['Suicide' 'Undetermined' 'Accidental' 'Homicide']

 In the variable ---------->  police ----- > These are the unique values:   [0 1]

 In the variable ---------->  sex ----- > These are the unique values:   ['M' 'F']

 In the variable ---------->  age ----- > These are the unique values:   [ 34.  21.  60.  64.  31.  17.  48.  41.  50.  30.  43.  27.  55.  53.
  51.  65.  52.  47.  19.  22.  56.  38.  25.  26.  23.  87.  28.  39.
  59.  61.  12.  40.  57.  20.  62.  71.  44.  29.  24.   9.  77.  18.
  36.  49.  13.  33.  90.  63.  46.  32.  84.  68.  89.  58.  78.  66.
  81.  54.  45.  37.  35.  42.  67.  69.  72.  82.  75.  73.  15.  83.
  74.  86.  16.  88.  80.  79.  91.  76.  14.  95.  96.  70.   7.  85.

<font color="red"> TODO: investigate what each column means and what its values represent (for example, what do the codes in `hispanic` stand for?).</font>

<font color="cyan"> Summary statistics of the numeric columns</font>

In [18]:
# Summary statistics with the quartiles spelled out explicitly
# (these are the same percentiles describe() reports by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,year,month,police,age,hispanic
count,98015.0,98015.0,98015.0,98015.0,98015.0
mean,2013.000561,6.56778,0.000194,43.900209,112.42065
std,0.816136,3.406984,0.013922,19.522908,49.320698
min,2012.0,1.0,0.0,0.0,100.0
25%,2012.0,4.0,0.0,27.0,100.0
50%,2013.0,7.0,0.0,42.0,100.0
75%,2014.0,9.0,0.0,58.0,100.0
max,2014.0,12.0,1.0,107.0,998.0


<font color="cyan"> Here we see that the minimum value in `age` is 0, which looks like an illogical value. We have to look into this.</font>

In [12]:
# Count, number of unique values, most frequent value and its frequency
# for the object-typed columns.
df.describe(include=["object"])

Unnamed: 0,intent,sex,race,place,education
count,98015,98015,98015,98015,98015
unique,4,2,5,10,4
top,Suicide,M,White,Home,HS/GED
freq,62291,83835,64598,59622,42258


<font color="cyan"> How many gun deaths are there by race?</font>

In [13]:
# Number of rows per race, most frequent first.
df["race"].value_counts(sort=True, ascending=False)

White                             64598
Black                             22675
Hispanic                           8603
Asian/Pacific Islander             1261
Native American/Native Alaskan      878
Name: race, dtype: int64

In [14]:
# Frequency table for `race` via the sidetable `stb` accessor: counts and
# percentages together with their cumulative totals.
df.stb.freq(["race"])

Unnamed: 0,race,count,percent,cumulative_count,cumulative_percent
0,White,64598,65.906239,64598,65.906239
1,Black,22675,23.134214,87273,89.040453
2,Hispanic,8603,8.777228,95876,97.817681
3,Asian/Pacific Islander,1261,1.286538,97137,99.104219
4,Native American/Native Alaskan,878,0.895781,98015,100.0


<font color= "green"> Mean age by race

In [15]:
# Average age within each race group, rounded to two decimals.
mean_race = df.groupby(["race"])["age"].mean().round(2)
mean_race

race
Asian/Pacific Islander            38.61
Black                             31.06
Hispanic                          33.14
Native American/Native Alaskan    36.49
White                             50.04
Name: age, dtype: float64

<font color="cyan"> Trying colors</font>