In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")


In [2]:
# Grab and process the raw data.
data_path = ("known_offenses_nyc2013.csv")


In [3]:
data_raw = pd.read_csv(data_path)

In [4]:
data_raw

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Unnamed: 13
0,Adams Village,1861,0,0.000,,0,0,0,12,2,10,0,0.000,
1,Addison Town and Village,2577,3,0.000,,0,0,3,24,3,20,1,0.000,
2,Akron Village,2846,3,0.000,,0,0,3,16,1,15,0,0.000,
3,Albany,97956,791,8.000,,30,227,526,4090,705,3243,142,,
4,Albion Village,6388,23,0.000,,3,4,16,223,53,165,5,,
5,Alfred Village,4089,5,0.000,,0,3,2,46,10,36,0,,
6,Allegany Village,1781,3,0.000,,0,0,3,10,0,10,0,0.000,
7,Amherst Town,118296,107,1.000,,7,31,68,2118,204,1882,32,3.000,
8,Amityville Village,9519,9,0.000,,2,4,3,210,16,188,6,1.000,
9,Amsterdam,18182,30,0.000,,0,12,18,405,99,291,15,0.000,


In [5]:
data_raw.columns

Index(['City', 'Population', 'Violent\rcrime',
       'Murder and\rnonnegligent\rmanslaughter',
       'Rape\r(revised\rdefinition)1', 'Rape\r(legacy\rdefinition)2',
       'Robbery', 'Aggravated\rassault', 'Property\rcrime', 'Burglary',
       'Larceny-\rtheft', 'Motor\rvehicle\rtheft', 'Arson3', 'Unnamed: 13'],
      dtype='object')

In [6]:
data_raw.columns

Index(['City', 'Population', 'Violent\rcrime',
       'Murder and\rnonnegligent\rmanslaughter',
       'Rape\r(revised\rdefinition)1', 'Rape\r(legacy\rdefinition)2',
       'Robbery', 'Aggravated\rassault', 'Property\rcrime', 'Burglary',
       'Larceny-\rtheft', 'Motor\rvehicle\rtheft', 'Arson3', 'Unnamed: 13'],
      dtype='object')

In [7]:
data_raw.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Unnamed: 13
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0,
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0,
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0,
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,,


In [8]:
# Create a new df with variables city, population, murder, robbery
# data.iloc[:, 0:2] # first two columns of data frame with all rows
mv_data = data_raw.iloc[:,0:7]

In [9]:
mv_data.columns

Index(['City', 'Population', 'Violent\rcrime',
       'Murder and\rnonnegligent\rmanslaughter',
       'Rape\r(revised\rdefinition)1', 'Rape\r(legacy\rdefinition)2',
       'Robbery'],
      dtype='object')

In [10]:
mv_data

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery
0,Adams Village,1861,0,0.000,,0,0
1,Addison Town and Village,2577,3,0.000,,0,0
2,Akron Village,2846,3,0.000,,0,0
3,Albany,97956,791,8.000,,30,227
4,Albion Village,6388,23,0.000,,3,4
5,Alfred Village,4089,5,0.000,,0,3
6,Allegany Village,1781,3,0.000,,0,0
7,Amherst Town,118296,107,1.000,,7,31
8,Amityville Village,9519,9,0.000,,2,4
9,Amsterdam,18182,30,0.000,,0,12


In [11]:
# And now, the murder and rape columns are back and identical
mv_data

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery
0,Adams Village,1861,0,0.000,,0,0
1,Addison Town and Village,2577,3,0.000,,0,0
2,Akron Village,2846,3,0.000,,0,0
3,Albany,97956,791,8.000,,30,227
4,Albion Village,6388,23,0.000,,3,4
5,Alfred Village,4089,5,0.000,,0,3
6,Allegany Village,1781,3,0.000,,0,0
7,Amherst Town,118296,107,1.000,,7,31
8,Amityville Village,9519,9,0.000,,2,4
9,Amsterdam,18182,30,0.000,,0,12


In [12]:
# a = '1,000,000'
# int(a.replace(',', ''))
# (int(mv_data['Population'][0].replace(',','')))**2
square_it = (int(mv_data['Population'][5].replace(',','')))**2
print(type(square_it))
print("The sixth element: ", square_it)

<class 'int'>
The sixth element:  16719921


In [13]:
# Clean up null values
mv_data[mv_data.isnull().any(axis=1)]

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery
0,Adams Village,1861,0,0.000,,0,0
1,Addison Town and Village,2577,3,0.000,,0,0
2,Akron Village,2846,3,0.000,,0,0
3,Albany,97956,791,8.000,,30,227
4,Albion Village,6388,23,0.000,,3,4
5,Alfred Village,4089,5,0.000,,0,3
6,Allegany Village,1781,3,0.000,,0,0
7,Amherst Town,118296,107,1.000,,7,31
8,Amityville Village,9519,9,0.000,,2,4
9,Amsterdam,18182,30,0.000,,0,12


In [14]:
# Drops the Rape1 column, full of NaN values
mv_data.dropna(axis=1, how='all')

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (legacy definition)2,Robbery
0,Adams Village,1861,0,0.000,0,0
1,Addison Town and Village,2577,3,0.000,0,0
2,Akron Village,2846,3,0.000,0,0
3,Albany,97956,791,8.000,30,227
4,Albion Village,6388,23,0.000,3,4
5,Alfred Village,4089,5,0.000,0,3
6,Allegany Village,1781,3,0.000,0,0
7,Amherst Town,118296,107,1.000,7,31
8,Amityville Village,9519,9,0.000,2,4
9,Amsterdam,18182,30,0.000,0,12


In [15]:
# Get rid of any row w/ more than two NaN values
mv_data.dropna(thresh=2)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery
0,Adams Village,1861,0,0.000,,0,0
1,Addison Town and Village,2577,3,0.000,,0,0
2,Akron Village,2846,3,0.000,,0,0
3,Albany,97956,791,8.000,,30,227
4,Albion Village,6388,23,0.000,,3,4
5,Alfred Village,4089,5,0.000,,0,3
6,Allegany Village,1781,3,0.000,,0,0
7,Amherst Town,118296,107,1.000,,7,31
8,Amityville Village,9519,9,0.000,,2,4
9,Amsterdam,18182,30,0.000,,0,12


In [16]:
pull_commas = [pd.to_numeric(mv_data['Population'].replace(',',''),errors='ignore') for pop in mv_data['Population']]

In [17]:
pull_commas.count

<function list.count>

In [18]:
mv_data['Murder_Cat'] = np.where((mv_data['Murder and\rnonnegligent\rmanslaughter'] > 0),1,0)

In [19]:
# Convert mv_data['Robbery'] to number, then categorical
mv_data['Robbery_Num'] = mv_data['Robbery_Num'] = pd.to_numeric(mv_data['Robbery'].replace(',',''),errors='ignore')

In [20]:
mv_data['Robbery_Cat'] = np.where((mv_data['Murder and\rnonnegligent\rmanslaughter'] > 0),1,0)

In [21]:
# Still need to convert below to a number from string
square_it = (int(mv_data['Population'][5].replace(',','')))**2
print(square_it)

16719921


In [22]:
mv_data['Population_Num'] = pd.to_numeric(mv_data['Population'].replace(',',''),errors='coerce')

In [23]:
# pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
mv_data['Population_Num'] = pd.to_numeric(mv_data['Population'].str.replace(',',''))

In [24]:
mv_data['Pop_Squared']=mv_data['Population_Num']**2

In [25]:
mv_data

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Murder_Cat,Robbery_Num,Robbery_Cat,Population_Num,Pop_Squared
0,Adams Village,1861,0,0.000,,0,0,0,0,0,1861.000,3463321.000
1,Addison Town and Village,2577,3,0.000,,0,0,0,0,0,2577.000,6640929.000
2,Akron Village,2846,3,0.000,,0,0,0,0,0,2846.000,8099716.000
3,Albany,97956,791,8.000,,30,227,1,227,1,97956.000,9595377936.000
4,Albion Village,6388,23,0.000,,3,4,0,4,0,6388.000,40806544.000
5,Alfred Village,4089,5,0.000,,0,3,0,3,0,4089.000,16719921.000
6,Allegany Village,1781,3,0.000,,0,0,0,0,0,1781.000,3171961.000
7,Amherst Town,118296,107,1.000,,7,31,1,31,1,118296.000,13993943616.000
8,Amityville Village,9519,9,0.000,,2,4,0,4,0,9519.000,90611361.000
9,Amsterdam,18182,30,0.000,,0,12,0,12,0,18182.000,330585124.000


In [26]:
mv_data.columns

Index(['City', 'Population', 'Violent\rcrime',
       'Murder and\rnonnegligent\rmanslaughter',
       'Rape\r(revised\rdefinition)1', 'Rape\r(legacy\rdefinition)2',
       'Robbery', 'Murder_Cat', 'Robbery_Num', 'Robbery_Cat', 'Population_Num',
       'Pop_Squared'],
      dtype='object')

In [27]:
mv_finalversion = mv_data[:-3]

In [28]:
mv_finalversion

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Murder_Cat,Robbery_Num,Robbery_Cat,Population_Num,Pop_Squared
0,Adams Village,1861,0,0.000,,0,0,0,0,0,1861.000,3463321.000
1,Addison Town and Village,2577,3,0.000,,0,0,0,0,0,2577.000,6640929.000
2,Akron Village,2846,3,0.000,,0,0,0,0,0,2846.000,8099716.000
3,Albany,97956,791,8.000,,30,227,1,227,1,97956.000,9595377936.000
4,Albion Village,6388,23,0.000,,3,4,0,4,0,6388.000,40806544.000
5,Alfred Village,4089,5,0.000,,0,3,0,3,0,4089.000,16719921.000
6,Allegany Village,1781,3,0.000,,0,0,0,0,0,1781.000,3171961.000
7,Amherst Town,118296,107,1.000,,7,31,1,31,1,118296.000,13993943616.000
8,Amityville Village,9519,9,0.000,,2,4,0,4,0,9519.000,90611361.000
9,Amsterdam,18182,30,0.000,,0,12,0,12,0,18182.000,330585124.000
