## Clean and normalize "crime.csv" dataset

> Review the data and perform basic cleaning before merging it with the other datasets

### Import some useful libraries

In [1]:
# pandas does all the loading and cleaning below; numpy is imported but is
# not referenced by name in the cells shown in this notebook.
import pandas as pd
import numpy as np
# Marker line so the cell output confirms the imports ran without error.
print("import completed")

import completed


### Import the data from csv file

In [2]:
# Load the raw crime table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/Crime.csv")

### Review data 

**1. Review first ten rows**

In [3]:
# Plain-text preview of the first ten rows.
print(df.iloc[:10])

                      MSA ViolentCrime  Murder   Rape  Robbery  \
0      Abilene, TX M.S.A.        412.5     5.3   56.0     78.4   
1        Akron, OH M.S.A.        238.4     5.1   38.2     75.2   
2       Albany, GA M.S.A.        667.9     7.8   30.4    157.9   
3       Albany, OR M.S.A.        114.3     2.5   28.2     20.7   
4  Albuquerque, NM M.S.A.        792.6     6.1   63.8    206.7   
5   Alexandria, LA M.S.A.        936.4     4.5   35.5    120.1   
6      Altoona, PA M.S.A.        216.5     0.8   28.7     25.5   
7     Amarillo, TX M.S.A.        538.7     3.4   72.0    107.1   
8         Ames, IA M.S.A.        158.5     1.0   47.2     19.9   
9    Anchorage, AK M.S.A.      1,039.5     8.2  165.6    199.1   

   AggravatedAssault PropertyCrime Burglary    Theft  MotorVehicleTheft State  \
0              272.8       3,609.0    852.0  2,493.6              263.4    TX   
1              119.8       2,552.4    575.3  1,853.0              124.1    OH   
2              471.8       3,8

**2. Review numeric columns**

In [4]:
# Print the (rows, columns) tuple, then summary statistics for the columns
# pandas parsed as numeric, each on its own line.
print(df.shape, df.describe(), sep="\n")

(378, 12)
           Murder        Rape     Robbery  AggravatedAssault  \
count  378.000000  378.000000  378.000000         377.000000   
mean     4.574868   41.306614   77.702646         241.319098   
std      3.383652   19.506386   53.977181         137.267225   
min      0.000000    2.700000    2.300000          25.500000   
25%      2.200000   29.000000   36.625000         144.200000   
50%      3.800000   37.550000   67.350000         215.700000   
75%      6.275000   51.550000  106.050000         304.900000   
max     20.600000  165.600000  351.900000         981.300000   

       MotorVehicleTheft  
count         378.000000  
mean          191.081746  
std           134.532918  
min            15.700000  
25%            97.550000  
50%           154.450000  
75%           249.650000  
max           718.600000  


**3. Review basic info for all columns**

In [5]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MSA                378 non-null    object 
 1   ViolentCrime       377 non-null    object 
 2   Murder             378 non-null    float64
 3   Rape               378 non-null    float64
 4   Robbery            378 non-null    float64
 5   AggravatedAssault  377 non-null    float64
 6   PropertyCrime      372 non-null    object 
 7   Burglary           374 non-null    object 
 8   Theft              375 non-null    object 
 9   MotorVehicleTheft  378 non-null    float64
 10  State              378 non-null    object 
 11  City               373 non-null    object 
dtypes: float64(5), object(7)
memory usage: 35.6+ KB
None


**4. Check for duplicate rows**

In [6]:
# Flag every row that is an exact repeat of an earlier row, then print only
# the flagged rows (an empty frame means there are no duplicates).
is_duplicate_row = df.duplicated()
duplicate_rows = df.loc[is_duplicate_row]
print(duplicate_rows)

Empty DataFrame
Columns: [MSA, ViolentCrime, Murder, Rape, Robbery, AggravatedAssault, PropertyCrime, Burglary, Theft, MotorVehicleTheft, State, City]
Index: []


**5. Check for missing values**

- find the number of null values in each column

In [7]:
# Count the missing entries per column (sum down the rows of the null mask).
print(df.isnull().sum(axis="index"))

MSA                  0
ViolentCrime         1
Murder               0
Rape                 0
Robbery              0
AggravatedAssault    1
PropertyCrime        6
Burglary             4
Theft                3
MotorVehicleTheft    0
State                0
City                 5
dtype: int64


- review the rows that contain at least one null value

In [8]:
# Boolean mask: True for each row with a null in any column.
has_null = df.isnull().any(axis="columns")
# Last expression of the cell, so the matching rows are displayed.
df.loc[has_null]

Unnamed: 0,MSA,ViolentCrime,Murder,Rape,Robbery,AggravatedAssault,PropertyCrime,Burglary,Theft,MotorVehicleTheft,State,City
21,"Barnstable Town, MA M.S.A.",518.7,1.4,49.4,28.9,439.1,1722.5,475.9,1170.7,75.9,MA,
37,"Boulder, CO M.S.A.3",,0.9,67.4,22.6,,2134.8,331.0,1689.0,114.7,CO,Boulder
91,"Denver-Aurora-Lakewood, CO M.S.A.3, 4",362.3,3.8,63.6,84.8,210.0,,,,388.3,CO,Denver
126,"Glens Falls, NY M.S.A.4",170.5,0.8,67.9,9.5,92.4,,173.7,,21.3,NY,Glens Falls
146,"Hinesville, GA M.S.A.",304.4,1.2,10.7,60.9,231.6,2270.3,490.6,1656.8,122.9,GA,
149,"Houston-The Woodlands-Sugar Land, TX M.S.A.3",566.6,6.9,37.3,228.9,293.5,,596.8,,367.2,TX,Houston
244,"Ogden-Clearfield, UT M.S.A.2, 3",157.6,2.3,52.1,25.5,77.6,,,1682.2,145.5,UT,Ogden
273,"Punta Gorda, FL M.S.A.",202.2,0.6,18.1,17.5,166.1,1509.5,242.4,1202.9,64.1,FL,
292,"Salt Lake City, UT M.S.A.",392.2,2.6,71.6,91.5,226.5,,,3574.9,565.2,UT,Salt Lake City
310,"Sebring, FL M.S.A.",341.7,5.1,38.5,56.8,241.3,2739.4,633.6,1994.2,111.5,FL,


- use online research to fill the null values in the "City" column manually (only a few rows need it); the same cell also fills the missing crime-rate values for rows 91 and 149

In [9]:
# Manual corrections keyed by row label -> {column name: value}.
# Columns are addressed by name rather than by position so the fixes keep
# landing on the intended column even if the CSV column order changes.
# Row labels equal row positions here because the frame still carries the
# default RangeIndex produced by read_csv.
# NOTE(review): the crime-rate values for rows 91 and 149 are presumably from
# the same manual research as the city names; confirm the source.
manual_fixes = {
    21: {"City": "Barnstable"},
    146: {"City": "Hinesville"},
    273: {"City": "Punta Gorda"},
    310: {"City": "Sebring"},
    311: {"City": "Sheboygan"},
    91: {"PropertyCrime": 2800.6, "Burglary": 449.45, "Theft": 1973.45},
    149: {"PropertyCrime": 3065.95, "Theft": 2083.85},
}
for row_label, column_values in manual_fixes.items():
    for column_name, value in column_values.items():
        df.loc[row_label, column_name] = value

**6. Review "State" column**

- all unique values should match the two-letter state abbreviations; note that in the output below the Puerto Rico rows hold the full MSA name instead of an abbreviation

In [10]:
# List the distinct values of the State column in order of first appearance.
state_values = df["State"].unique()
print(state_values)

['TX' 'OH' 'GA' 'OR' 'NM' 'LA' 'PA' 'IA' 'AK' 'MI' 'AL' 'WI' 'NC' 'NJ'
 'CA' 'MD' 'ME' 'MA' 'WV' 'WA' 'MT' 'NY' 'ND' 'IL' 'IN' 'ID' 'CO' 'KY'
 'CT' 'VT' 'FL' 'MO' 'NV' 'WY' 'SC' 'VA' 'TN' 'DE' 'MN' 'OK' 'AZ' 'AR'
 'NE' 'MS' 'HI' 'UT' 'NH' 'KS' 'RI' 'SD' 'DC'
 'Aguadilla-Isabela, Puerto Rico M.S.A.' 'Arecibo, Puerto Rico M.S.A.'
 'Guayama, Puerto Rico M.S.A.' 'Mayaguez, Puerto Rico M.S.A.'
 'Ponce, Puerto Rico M.S.A.' 'San German, Puerto Rico M.S.A.'
 'San Juan-Carolina-Caguas, Puerto Rico M.S.A.']


In [11]:
# Display the frame after the manual fills; the notebook shows only the
# first and last few rows, eliding the middle.
df

Unnamed: 0,MSA,ViolentCrime,Murder,Rape,Robbery,AggravatedAssault,PropertyCrime,Burglary,Theft,MotorVehicleTheft,State,City
0,"Abilene, TX M.S.A.",412.5,5.3,56.0,78.4,272.8,3609.0,852.0,2493.6,263.4,TX,Abilene
1,"Akron, OH M.S.A.",238.4,5.1,38.2,75.2,119.8,2552.4,575.3,1853.0,124.1,OH,Akron
2,"Albany, GA M.S.A.",667.9,7.8,30.4,157.9,471.8,3894.1,1099.6,2652.8,141.7,GA,Albany
3,"Albany, OR M.S.A.",114.3,2.5,28.2,20.7,63.0,3208.4,484.6,2476.1,247.7,OR,Albany
4,"Albuquerque, NM M.S.A.",792.6,6.1,63.8,206.7,516.0,4607.8,883.4,3047.6,676.9,NM,Albuquerque
...,...,...,...,...,...,...,...,...,...,...,...,...
373,"Guayama, Puerto Rico M.S.A.",251.6,11.4,6.3,74.6,159.3,823.2,265.5,531.1,26.6,"Guayama, Puerto Rico M.S.A.",Guayama
374,"Mayaguez, Puerto Rico M.S.A.",237.5,11.5,5.2,82.3,138.6,1320.0,377.1,861.6,81.3,"Mayaguez, Puerto Rico M.S.A.",Mayaguez
375,"Ponce, Puerto Rico M.S.A.",231.4,18.0,5.0,66.2,142.2,885.0,214.4,632.4,38.1,"Ponce, Puerto Rico M.S.A.",Ponce
376,"San German, Puerto Rico M.S.A.",92.1,5.4,4.6,16.1,66.0,420.0,168.9,226.5,24.6,"San German, Puerto Rico M.S.A.",San German


**7. Use the same text format across all tables**

- Check the other datasets and change the city-name format to match them (remove spaces and periods)

In [12]:
# Remove every space and every period from the city names.
# This is vectorized over the whole column, so it no longer depends on the
# frame having exactly 378 rows. regex=False makes "." a literal period
# rather than the regex wildcard.
df["City"] = (
    df["City"]
    .str.replace(" ", "", regex=False)
    .str.replace(".", "", regex=False)
)

- strip the thousands separators from the numeric columns that were read as text, convert them to float, and drop the rows that still contain null values

In [13]:
# These columns were read as text because some values contain thousands
# separators (e.g. "1,039.5").
comma_formatted_columns = ["ViolentCrime", "PropertyCrime", "Burglary", "Theft"]
for column_name in comma_formatted_columns:
    df[column_name] = (
        df[column_name]
        # Cast to str first: the column mixes strings with the floats that
        # were filled in manually, and .str methods turn non-strings into NaN.
        # Missing values become the text "nan", which float64 parses back to NaN.
        .astype("str")
        .str.replace(",", "", regex=False)
        .astype("float64")
    )

# Drop every row that still has a missing value, then renumber the index from 0.
df = df.dropna(axis=0, how="any").reset_index(drop=True)

### Calculate the 50th percentile of total crime and keep the rows below it

In [14]:
# ViolentCrime and PropertyCrime are already aggregates of the sub-category
# columns (row 0: 5.3 + 56.0 + 78.4 + 272.8 = 412.5 and
# 852.0 + 2493.6 + 263.4 = 3609.0), so adding the sub-categories on top of
# them would count every offense twice. The total is the two aggregates only.
df["TotalCrime"] = df["ViolentCrime"] + df["PropertyCrime"]

# Median (50th percentile) of the total; keep only rows strictly below it.
median_total_crime = df["TotalCrime"].quantile(0.5)
df = df.loc[df["TotalCrime"] < median_total_crime]

### Export data

In [15]:
# Write the cleaned, filtered frame without the index column, then print a
# marker so the cell output confirms the export finished.
df.to_csv("./dataset/crime.csv", index=False)
print("Export completed")

Export completed
