# ~ ~ ~ ~ DFW PYTHONEERS~ ~ ~ ~ 
# Pandas Tutorial
### https://pandas.pydata.org/index.html



In [1]:
from pathlib import Path

from IPython.display import HTML

# Inline the two local stylesheets into a single <style> tag so the tables and
# notebook chrome are themed. Path.read_text() opens and closes each file,
# so no file handles are left dangling in the kernel.
css = ''.join(Path(name).read_text() for name in ('style-table.css', 'style-notebook.css'))
HTML('<style>{}</style>'.format(css))


In [2]:
import pandas as pd
pd.__version__

'0.23.4'

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Dataset: Stanford Open Policing Project

# https://openpolicing.stanford.edu/

## Metadata: https://github.com/5harad/openpolicing/blob/master/DATA-README.md


#                                                  TUTORIALS

In [4]:
%%time
# Read police.csv (expected in the notebook's working directory) into the
# DataFrame `df`, which the rest of the notebook works on.
# %%time reports how long the load took.
df= pd.read_csv('police.csv')

Wall time: 449 ms


In [5]:
# Preview the first five rows (df.tail() shows the last five instead)
df.head(n=5)

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [6]:
# (number of rows, number of columns)
df.shape
# df.shape[0] -> row count only
# df.shape[1] -> column count only

(91741, 15)

In [7]:
# Alternatively, len() on a DataFrame returns the number of rows
len(df)

91741

In [8]:
# Data type pandas inferred for each column; 'object' usually means strings
# or a mix of types
df.dtypes

stop_date              object
stop_time              object
county_name           float64
driver_gender          object
driver_age_raw        float64
driver_age            float64
driver_race            object
violation_raw          object
violation              object
search_conducted         bool
search_type            object
stop_outcome           object
is_arrested            object
stop_duration          object
drugs_related_stop       bool
dtype: object

In [9]:
# Column labels, returned as a pandas Index
df.columns

Index(['stop_date', 'stop_time', 'county_name', 'driver_gender',
       'driver_age_raw', 'driver_age', 'driver_race', 'violation_raw',
       'violation', 'search_conducted', 'search_type', 'stop_outcome',
       'is_arrested', 'stop_duration', 'drugs_related_stop'],
      dtype='object')

### Series, Chain & Math operation

In [10]:
# Selecting one column yields a Series; show its first five values
df.driver_age.head()

0    20.0
1    40.0
2    33.0
3    19.0
4    21.0
Name: driver_age, dtype: float64

In [11]:
# Keep the first five rows under a short name so the following examples can
# be checked by eye against a small table
h = df.head()
h

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [12]:
# Arithmetic on a Series is applied element by element
# (floor division works the same way: h['driver_age'] // 10)
h.driver_age.mul(1000)

0    20000.0
1    40000.0
2    33000.0
3    19000.0
4    21000.0
Name: driver_age, dtype: float64

### Filtering

In [13]:
# A comparison returns a boolean Series, one True/False per row
h['driver_age'].gt(20)

0    False
1     True
2     True
3    False
4     True
Name: driver_age, dtype: bool

In [14]:
# Keep rows whose age is strictly between 20 and 40; the two boolean Series
# are combined element-wise with &
# (single condition: h[h.driver_age > 20])
h.loc[h['driver_age'].gt(20) & h['driver_age'].lt(40)]

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [15]:
# Same filter as above, then order the surviving rows by ascending age
h.loc[h['driver_age'].gt(20) & h['driver_age'].lt(40)].sort_values(by='driver_age')

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


### Introducing the set_index(['column']) method

In [16]:
# Use stop_date as the row labels; set_index returns a new frame, so df
# itself is left unchanged
df.set_index('stop_date').head(5)

Unnamed: 0_level_0,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
stop_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [17]:
# Two columns give a two-level (Multi)Index; sort_index orders the rows by
# those labels. The values are still strings here, so the order is
# lexicographic, not chronological.
(
    df.set_index(['stop_date', 'stop_time'])
      .sort_index()
      .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
stop_date,stop_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1/1/2006,11:10,,M,1958.0,48.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1/1/2006,13:05,,M,1972.0,34.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1/1/2006,14:10,,M,1983.0,23.0,White,Registration Violation,Registration/plates,True,Incident to Arrest,Arrest Driver,True,16-30 Min,False
1/1/2006,17:18,,M,1974.0,32.0,White,Other Traffic Violation,Moving violation,False,,Warning,False,0-15 Min,False
1/1/2006,1:20,,M,1981.0,25.0,White,Other Traffic Violation,Moving violation,False,,Citation,False,0-15 Min,False


### Reset Index()

In [None]:
# reset_index('stop_time') needs an index level with that name, and df still
# has its default integer index (the set_index calls above were not assigned
# back). Build the (stop_date, stop_time) MultiIndex first, then move
# 'stop_time' back into a regular column, leaving stop_date as the only index.
df.set_index(['stop_date', 'stop_time']).reset_index('stop_time').head()

### Introducing loc function

In [None]:
# .loc is subscripted with square brackets: df.loc[row_labels, column_labels].
# With the default integer index the row labels are 0, 1, 2, ...
# Label slices include both endpoints, so 0:4 returns five rows.
df.loc[0:4, ['stop_date', 'driver_age', 'violation']]

In [None]:
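# Example of label-based lookup on a two-level index (labels are from a different dataset, not police.csv):
#df.loc['Sleuth', 1972]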

### Group by

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a
# copy, so in-place changes made through df are visible through c as well.
c = df
c.head(1)

In [None]:
# Bucket the rows by driver age, then add up is_arrested inside each bucket
# and list the totals in age order
grp_obj = c.groupby('driver_age')
grp_obj['is_arrested'].sum().sort_index()

In [None]:
# Group on two keys at once: one group per (gender, age) combination
grp_obj2 = c.groupby(by=['driver_gender', 'driver_age'])

In [None]:
# Several aggregations in one pass; each name in the list becomes a column
# of the result
grp_obj2['search_conducted'].agg(['sum', 'size', 'min', 'max', 'mean'])

### Finding out the total null values in the various columns

In [None]:
# Boolean missing-value mask summed column-wise: True counts as 1, so the
# result is the number of missing values per column
df.isna().sum()

In [None]:
# Why summing a boolean mask counts the True values: True compares equal to 1
True == 1

In [None]:
# Element-wise missing-value mask (True where a value is missing).
# Only the first rows are shown rather than the full frame.
df.isnull().head()

# TASKS/EXERCISES

### TASK: Remove the columns that contain only missing values

In [None]:
# Drop the county_name column (empty in every row previewed above) from df
# in place. errors='ignore' keeps the cell re-runnable: without it a second
# run raises KeyError because the column is already gone.
df.drop('county_name', axis = 'columns', inplace = True, errors = 'ignore')

In [None]:
# Alternatively, rebind df to the new frame that drop() returns.
# errors='ignore' is required here: the previous cell has already removed
# county_name, so a plain drop would raise KeyError.
df = df.drop('county_name', axis = 'columns', errors = 'ignore')
# Note: inplace=True generally gives no memory or speed advantage over
# reassignment, and reassignment keeps method chaining possible.

In [None]:
# Check the column count after dropping county_name
df.shape

In [None]:
# county_name should no longer be listed
df.columns

In [None]:
# Generic version: drop every column whose values are all missing, without
# having to name it. Returns a new frame; only its shape is shown here.
df.dropna(how='all', axis=1).shape

In [None]:
# Key takeaway:
#   pandas usually offers several ways to achieve the same result


 ### TASK: Do men or women speed more often?
 
 ### Columns we will use: driver_gender, violation

In [None]:
# Keep only the speeding stops, pick the gender column, then report each
# gender's share of those stops (normalize=True gives fractions, not counts)
df[df['violation'].eq('Speeding')]['driver_gender'].value_counts(normalize=True)

In [None]:
# Male drivers only: share of their stops falling under each violation
df.loc[df['driver_gender'].eq('M'), 'violation'].value_counts(normalize=True)

In [None]:
# Female drivers only: share of their stops falling under each violation
df.loc[df['driver_gender'].eq('F'), 'violation'].value_counts(normalize=True)

In [None]:
# Both genders in one expression: group by gender, then compute the
# violation shares within each group
df.groupby('driver_gender')['violation'].value_counts(normalize=True)

In [None]:
# Another way to do the speeding-by-gender calculation: .loc takes the row
# filter and the column label in a single step
df.loc[df['violation'] == 'Speeding', 'driver_gender'].value_counts(normalize = True)

### Task: Does gender affect who gets searched during a stop?

In [None]:
# Look at the columns again before choosing the ones needed for this task
df.head()

### Columns we will use: driver_gender, search_conducted

In [None]:
# Fraction of all stops with and without a search
df['search_conducted'].value_counts(normalize=True)

In [None]:
# search_conducted is boolean, so its mean within each gender is the share
# of that gender's stops that included a search
df.groupby('driver_gender')['search_conducted'].mean()

#### Note: value_counts() drops NaN by default (pass dropna=False to include it)

### Task: During a search, how often is the driver frisked?

In [None]:
# Count every search_type value; dropna=False also reports how many rows
# have no search type at all
df['search_type'].value_counts(dropna=False)

### Pandas String Method to apply across the Pandas data series

### Reference for the .str string methods: https://pandas.pydata.org/docs/user_guide/text.html

In [None]:
# New column: does the search_type text mention 'Protective Frisk'?
# Rows whose search_type is missing stay missing rather than becoming False.
df['frisk'] = df['search_type'].str.contains('Protective Frisk')
df['frisk'].value_counts(dropna=False)

In [None]:
# Each True counts as 1, so the sum is the number of searches that mention a frisk
df.frisk.sum()

In [None]:
# mean() skips missing (NaN) values by default, as most pandas reductions do,
# so this is the frisk rate among rows that have a search_type, not among all stops
df.frisk.mean()

### TASK: Which year had the fewest stops?

### Columns we will use: stop_date

In [None]:
# One row is enough to see how stop_date and stop_time are formatted
df.head(1)

In [None]:
# Join the date and time strings row by row, separated by a single space
combined_date_time = df['stop_date'].str.cat(df['stop_time'], sep=' ')

In [None]:
# Still plain strings at this point, e.g. '1/2/2005 1:55'
combined_date_time.head()

In [None]:
# Parse the combined strings into real timestamps and store them as a new
# column. No explicit format is passed, so pandas infers it from the data.
df['stop_date_time'] = pd.to_datetime(combined_date_time)

In [None]:
# Confirm the dtype pandas assigned to the new stop_date_time column
df.dtypes

### Introducing dt functions

In [None]:
# .dt exposes datetime parts; count the stops per calendar year.
# value_counts sorts from most to least frequent, so the year with the
# fewest stops is the last row.
df['stop_date_time'].dt.year.value_counts()

### TASK: How does drug activity change by time of the day?

### Columns we will use: stop_date_time, drugs_related_stop

In [None]:
# Group by the hour of the stop; the mean of the boolean drugs_related_stop
# column is the share of that hour's stops that were drug related
df.groupby(df['stop_date_time'].dt.hour)['drugs_related_stop'].mean()

In [None]:
# Same hourly aggregation, drawn as a line chart. Axis labels and a title are
# set so the figure can be read on its own; the trailing semicolon keeps the
# return value of ax.set() out of the cell output.
ax = df.groupby(df.stop_date_time.dt.hour).drugs_related_stop.mean().plot()
ax.set(xlabel='Hour of day (0-23)',
       ylabel='Share of stops that were drug related',
       title='Drug-related stop rate by hour of day');