<a href="https://colab.research.google.com/github/ilakesh/courtwatchNOLA/blob/gh-pages/cap_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NOLA Court Dockets Data Analysis

In [1]:
# Importing tools used for web scraping and data analysis
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import lxml as l
import datetime
import re
import logging
logger = logging.Logger('catch_all')

In [2]:
def text_extract(s):
  '''This extracts the text between fourth and fifth lines and creates a new entity in a second data frame with columns:
    1) mag id
    2) date
    3) text
    4) judge
    '''

### The main function which extracts all relevant information from each docket

In [3]:
def get_mag(soup):
    """WORKING REGEX TO GET MAG NUMBER"""
    mag_num = re.search(r'MAG#: (\d{6})', soup).group(1)
    return mag_num

In [4]:
def get_defendant_name(text):
    """Using split with new line as delimeter -- standard spacing on top half of docket"""
    defendant = text[9].strip()
    return defendant

In [5]:
def get_date_judge_chargestext(text):
    """Getting the date and judge for the proceeding and extracting the charges text"""
    # It is always two lines after the 4th equal sign line
    equalsignline = 0
    charges = []
    for (count,line) in enumerate(text):
        if line == "=" * 78:
            equalsignline +=1
        if equalsignline == 2:
            charges.append(text[count])
        elif equalsignline == 4:
            #print(mag_num, text[count+2]
            date, judge = re.sub('\*','',text[count+2]).split()
            break
    return date, judge, charges

In [6]:
def extract_charges_bond(charges):
    """Takes "charges" from above and extracts count, code, bond, and charge"""
    cnt = []
    code = []
    bond = []
    charge = []
    for i, line in enumerate(charges[3:-1]):
        #print("i and line:",i,line)
        if 'BOND:' in line:
            s = line.split()
            bond_ind = s.index("BOND:")
            cnt.append(s[0])
            code.append(" ".join(s[1:bond_ind]))
            bond.append(s[bond_ind +1])
        elif defendant in line:
            pass
        else:
            s = line.split()
            charge.append(" ".join(s[:]))
    return cnt, code, bond, charge

In [7]:
def create_docket_df_entry(mag_num, defendant, date, bond, judge, cnt, code, charge):
    # Duplicating the data that is the same for each charge on docket    
    mag_nums = [mag_num] * len(bond)
    defendants = [defendant] * len(bond)
    dates = [date] *len(bond)
    judges = [judge] * len(bond)
    #Create a dataframe using the extracted data
    docket = pd.DataFrame({"Mag Num": mag_nums,"Defendant": defendants, "Judge": judges, "Count": cnt, "Code": code,"Charge": charge, "Bond": bond, "Date": dates})
    return docket

### The following initializes the dataframe "dockets" and uses **get** request to scrape text from each docket.  Then the extract functions are called and the returned dataframe is added to the "dockets" dataframe

In [8]:
# The prefix of every url we are scraping from
base_url = 'https://www.opso.us/dcktmstr/555555.php?&domagn='

# A list corresponding to the magistrate numbers of each docket
#l = list(range(586900, 587000))
l = [586545]
soups = []
dockets = pd.DataFrame()
for num in l:
    r = requests.get(base_url + str(num))
    root = BeautifulSoup( r.content )
    soup = str(root.find('pre'))
    soup.__repr__()
    text = soup.split("\n")
    
    mag_num = get_mag(soup)
    defendant = get_defendant_name(text)
    date, judge, charges = get_date_judge_chargestext(text)
    cnt, code, bond, charge = extract_charges_bond(charges)
    
    dockets = dockets.append(create_docket_df_entry(mag_num, defendant, date, bond, judge, cnt, code, charge), ignore_index = True)
    except Exception as e:
        logger.error('Failed to upload to ftp: '+ str(e))
        print("Docket number {} is not working".format(str(num)))
display(dockets)

Unnamed: 0,Mag Num,Defendant,Judge,Count,Code,Charge,Bond,Date
0,586545,"SCOTT, REEVA N",DESALVOE,1,RS 14 56(A)(1),SIMPLE CRIMINAL DAMAGE TO PROPERTY,0.0,07/13/2020
1,586545,"SCOTT, REEVA N",DESALVOE,1,RS 14 34.9,BATTERY UPON DATING PARTNER,0.0,07/13/2020


In [9]:
## Initial cleaning of dataframe
# Removes "," from the bond value and typecasts bond to numeric
dockets["Bond"] = dockets["Bond"].str.replace(',','')
dockets["Bond"] = pd.to_numeric(dockets["Bond"])

In [10]:
display(dockets)

Unnamed: 0,Mag Num,Defendant,Judge,Count,Code,Charge,Bond,Date
0,586545,"SCOTT, REEVA N",DESALVOE,1,RS 14 56(A)(1),SIMPLE CRIMINAL DAMAGE TO PROPERTY,0.0,07/13/2020
1,586545,"SCOTT, REEVA N",DESALVOE,1,RS 14 34.9,BATTERY UPON DATING PARTNER,0.0,07/13/2020


In [11]:
dockets.groupby(["Charge"])["Bond"].mean()
dockets.groupby(["Charge"])["Bond"].count()
dockets.groupby(["Charge"])["Bond"].describe()
display(dockets)

Unnamed: 0,Mag Num,Defendant,Judge,Count,Code,Charge,Bond,Date
0,586545,"SCOTT, REEVA N",DESALVOE,1,RS 14 56(A)(1),SIMPLE CRIMINAL DAMAGE TO PROPERTY,0.0,07/13/2020
1,586545,"SCOTT, REEVA N",DESALVOE,1,RS 14 34.9,BATTERY UPON DATING PARTNER,0.0,07/13/2020


In [12]:
#dockets = pd.DataFrame({"Mag Num": mag_nums,"Defendant": defendants, "Judge": judges, "Count": cnt, "Code": code,"Charge": charge, "Bond": bond})

In [13]:
#display(dockets)

In [14]:
# # WORKING REGEX TO GET BOND
# bond = re.findall(r'BOND:\s*(.*)\n', soup)
# bond