In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile

In [None]:
# Read the two county-level CSV files into DataFrames.
states = pd.read_csv("data/county_and_state.csv")
populations = pd.read_csv("data/county_and_population.csv")

# Show both frames, states first.
display(states, populations)

In [None]:
# Join the two tables on their raw "County" strings.
pd.merge(states, populations, left_on="County", right_on="County")

In [None]:
def canonicalize_county(county_series):
  """Return a normalized copy of a Series of county names.

  Each name is lower-cased, then spaces and periods are removed, '&' is
  spelled out as 'and', and the words 'county' and 'parish' are dropped.

  Parameters
  ----------
  county_series : pandas.Series of str
      Raw county names.

  Returns
  -------
  pandas.Series
      The transformed names, aligned with the input index.
  """
  # regex=False everywhere: each pattern is a literal substring. This matters
  # most for '.', which would otherwise be a regex wildcard.
  return (county_series.str.lower()
          .str.replace(' ', '', regex=False)   # was '' -> '' (a no-op)
          .str.replace('&', 'and', regex=False)
          .str.replace('.', '', regex=False)
          .str.replace('county', '', regex=False)
          .str.replace('parish', '', regex=False)
          )

# Preview the canonicalized county names from each table, states first.
display(
    canonicalize_county(states["County"]),
    canonicalize_county(populations["County"]),
)

In [None]:
# Store the canonicalized name next to the raw one in each table.
states["Canonical County"] = states["County"].pipe(canonicalize_county)
populations["Canonical County"] = populations["County"].pipe(canonicalize_county)

# Show both updated frames, states first.
display(states, populations)

In [None]:
# Join the tables on the canonicalized county name instead of the raw string.
pd.merge(states, populations, on="Canonical County")

In [None]:
# Read the log file into a list with one string per line (newlines kept).
log_fname = 'data/log.txt'
with open(log_fname, 'r') as log_file:
  log_lines = list(log_file)
log_lines

In [None]:
# Characters at positions 20 through 30 of the first log line (fixed offsets).
log_lines[0][20:31]

In [None]:
# Keep the first log line under its own name and show it.
first = log_lines[0]
first

In [None]:
# Take the segment after the first "[" and cut it off at its first "]".
pertinent = first.split("[")[1].split(']')[0]
pertinent

In [None]:
# Split on '/' into exactly three pieces; the third still holds everything
# after the second slash.
day, month, rest = pertinent.split('/')

In [None]:
# Split the remainder on ':' into four pieces; `rest` is overwritten with the
# last piece, so this cell is not safe to run twice in a row.
year, hour, minute, rest = rest.split(':')

for label, value in (("Year:", year), ("Hour:", hour), ("Minute:", minute), ("Rest:", rest)):
  print(label, value)

In [None]:
# Split the last piece on a single space into two parts.
# (An empty separator, as written before, raises ValueError.)
seconds, time_zone = rest.split(' ')
day, month, year, hour, minute, seconds, time_zone

In [None]:
# Read the log file as a headerless, tab-separated table and keep column 0.
logs = pd.read_csv("data/log.txt", sep="\t", header=None)[0]

print("Original input!")
display(logs)

In [None]:
import re

# Sample sentence containing two SSN-shaped substrings.
text = "My social security number is 123-45-6789 bro, or actually maybe it’s 321-45-6789."

# Three digits, two digits, four digits, separated by hyphens.
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"

# All non-overlapping matches, in order of appearance.
re.compile(pattern).findall(text)

In [None]:
# One-column frame of sample strings: zero, one, or two SSN-shaped substrings per row.
df_ssn = pd.DataFrame({
    'SSN': [
        '987-65-4321',
        'forty',
        '123-45-6789 bro or 321-45-6789',
        '999-99-9999',
    ]
})
df_ssn

In [None]:
# SSN-shaped pattern: 3 digits, 2 digits, 4 digits, hyphen-separated.
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
# Apply the pattern to each entry of the SSN column via .str.findall.
df_ssn['SSN'].str.findall(pattern)

In [None]:
# Take the last element of each row's findall result.
df_ssn['SSN'].str.findall(pattern).str.get(-1)

In [None]:
text = """I will meet you at 08:30:00 pm tomorrow"""
# Raw string so that \d reaches the regex engine unchanged; in a normal string
# literal "\d" is an invalid escape sequence and triggers a warning.
# Three capture groups: two-digit hour, minute and second.
pattern = r".*(\d\d):(\d\d):(\d\d).*"
matches = re.findall(pattern, text)
matches

In [None]:
# Unpack the three captured groups of the first match and print each with its label.
hour, minute, second = matches[0]
for label, value in (("Hour:", hour), ("Minute: ", minute), ("Second: ", second)):
  print(label, value)

In [None]:
# Re-display the sample SSN frame.
df_ssn

In [None]:
# Same SSN shape as before, but with each of the three digit runs in its own
# capture group.
pattern_group_mult = r"([0-9]{3})-([0-9]{2})-([0-9]{4})"
# Apply .str.extract with the grouped pattern to the SSN column.
df_ssn['SSN'].str.extract(pattern_group_mult)

In [None]:
# Pull the SSN column out under its own name, then extract the capture groups.
ssns = df_ssn['SSN']

# Fixed typo: the variable is `ssns` (the previous `snss` raised NameError).
ssns.str.extract(pattern_group_mult)


In [None]:
# Apply .str.extractall with the grouped pattern to the SSN column.
df_ssn['SSN'].str.extractall(pattern_group_mult)


In [None]:
# Sample HTML fragment.
text = '<div><td valign="top">Moo</td></div>'
# A "<", then one or more characters that are not ">", then ">".
pattern = r"<[^>]+>"
# Replace every match with the empty string.
re.compile(pattern).sub('', text)

In [None]:
# One-column frame of sample HTML snippets.
df_html = pd.DataFrame({
    'Html': [
        '<div><td valign="top">Moo</td></div>',
        '<a href="https://ds100.org">Link</a>',
        '<b>Bold text</b>',
    ]
})

df_html

In [None]:
# Regex-replace every match of `pattern` with '' in the Html column and show
# the result as a one-column frame.
# NOTE(review): relies on `pattern` still holding the value set by the
# previously run cell (presumably the tag pattern) — confirm run order.
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()


In [None]:
line = log_lines[0]
display(line)

# Bracketed timestamp: [day/month/year:hour:minute:second zone], one capture
# group per field. The name must be lower-case `pattern`: the previous
# `Pattern` left the findall call using whatever `pattern` held before.
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0]
day, month, year, hour, minute, second, time_zone

In [None]:
# Wrap the raw log lines in a single-column frame named 'Log'.
df = pd.DataFrame({'Log': log_lines})
df

In [None]:
# Bracketed timestamp pattern with seven capture groups.
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
# Apply .str.findall with the pattern to every entry of the Log column.
df['Log'].str.findall(pattern)

In [None]:
# Apply .str.extractall with the same pattern to the Log column.
df['Log'].str.extractall(pattern)

In [None]:
# Load the violations file. header=0 marks the first row as the file's own
# header, and `names` replaces it with the short column names.
# (`head` is not a read_csv keyword and raised TypeError.)
vio = pd.read_csv('data/violations.csv', header=0, names=['bid', 'date', 'desc'])
desc = vio['desc']
vio.head()

In [None]:
# Count how often each distinct description string occurs.
counts= desc.value_counts()

# Shape of the counts Series.
counts.shape

In [None]:
# First ten entries of the counts Series.
counts.head(10)

In [None]:
# Entries at positions 50 through 59 of the counts Series.
counts.iloc[50:60]

In [None]:
# Build a cleaned description: drop a trailing "[...]" block (plus any
# whitespace before it), trim surrounding whitespace, and lower-case.
vio['clean_desc'] = (
    vio['desc']
    .str.replace(r'\s*\[.*\]$', '', regex=True)
    .str.strip()
    .str.lower()
)
vio.head()

In [None]:
# Shape of the value counts of the cleaned descriptions.
vio['clean_desc'].value_counts().shape

In [None]:
# First rows of the value counts of the cleaned descriptions.
vio['clean_desc'].value_counts().head()

In [None]:
# Add one boolean column per keyword family, each flagging whether the cleaned
# description contains any of the listed substrings ('|' is regex alternation).
# A single assign call keeps the columns in the order written and avoids the
# unbalanced parenthesis that made the previous chained version a syntax error.
with_features = vio.assign(
    is_unclean=vio['clean_desc'].str.contains('clean|sanit'),
    is_high_risk=vio['clean_desc'].str.contains('high risk'),
    is_vermin=vio['clean_desc'].str.contains('vermin'),
    is_surface=vio['clean_desc'].str.contains('wall|ceiling|floor|surface'),
    is_human=vio['clean_desc'].str.contains('hand|glove|hair|nail'),
    is_permit=vio['clean_desc'].str.contains('permit|certif'),
)
with_features.head()

In [None]:
# Sum the numeric columns within each (bid, date) group, then turn the group
# keys back into ordinary columns.
count_features = (
    with_features
    .groupby(['bid', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)

# Rows at positions 255 through 259.
count_features.iloc[255:260]


In [None]:
# Rows whose is_vermin total exceeds 1; show the first five.
# (The closing bracket of the boolean mask was missing, a syntax error.)
count_features[count_features['is_vermin'] > 1].head(5)


In [None]:

# Reshape wide -> long: one row per (bid, date, feature), with the value stored
# in num_vios.
violation_type_df = count_features.melt(
    id_vars=['bid', 'date'],
    var_name='feature',
    value_name='num_vios',
)

# Rows for bid 489 on date 20150728.
violation_type_df.loc[
    violation_type_df['bid'].eq(489) & violation_type_df['date'].eq(20150728)
]


In [None]:
# Load the first three columns of the inspections file, replacing the file's
# header row with short column names.
inspection_df = pd.read_csv(
    'data/inspections.csv',
    header=0,
    usecols=[0, 1, 2],
    names=['bid', 'score', 'date'],
)
inspection_df.head()

In [None]:
# Attach inspection columns to each long-format violation row by joining on
# the shared (bid, date) keys.
violation_type_and_scores = pd.merge(
    violation_type_df, inspection_df, on=['bid', 'date']
)

violation_type_and_scores.head(12)

In [None]:
# Box plots of score against num_vios, one panel per feature, two panels per row.
# The trailing semicolon suppresses the returned object's repr.
sns.catplot(
    data=violation_type_and_scores,
    x='num_vios',
    y='score',
    col='feature',
    col_wrap=2,
    kind='box',
);