In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as sm
from scipy import stats

# black magic
%matplotlib inline

We'll also need the `pytz` timezone package to convert timestamps from UTC to local time.

In [None]:
import pytz

## Start by Preparing the Temperature Data

In [None]:
# Load the raw weather observations.
df = pd.read_csv("weather_2013-2016")

# Drop rows whose temperature is the bogus -9999.0 placeholder.
df = df[df['TemperatureF'] != -9999.0]

# Treat missing precipitation as zero. The column is assigned back explicitly:
# an in-place fillna on `df["PrecipitationIn"]` acts on a temporary Series and
# does not reliably update `df` (it warns in pandas 2.x and is a no-op under
# Copy-on-Write).
df = df.assign(PrecipitationIn=df["PrecipitationIn"].fillna(0))

# Index the frame by the observation timestamp (still timezone-naive here).
df = df.set_index(pd.DatetimeIndex(df['DateUTC']))

**Let's convert from UTC to Central Time.**

This conversion must come after any resampling (see the alternative below), because resampling across DST boundaries is buggy.

In [None]:
# The index holds naive timestamps: label them as UTC first, then express the
# same instants in US/Central wall-clock time.
central_tz = pytz.timezone('US/Central')
ts_utc = df.index.tz_localize(pytz.UTC)
ts_central = ts_utc.tz_convert(central_tz)
df = df.set_index(ts_central)

N.B., faster, less readable resampling alternative:

```
df = df.set_index(pd.DatetimeIndex(df['DateUTC']).tz_localize(pytz.UTC))
df = df.resample('2H').mean()
df = df.set_index(df.index.tz_convert(pytz.timezone('US/Central')))
```

In [None]:
# Keep only observations taken at 18:00 or later (by the index's hour).
evening_temp = df[df.index.hour >= 18]

# Average each numeric column per calendar day.
# * numeric_only=True skips text columns such as the raw 'DateUTC' string,
#   which would otherwise make .mean() raise on pandas >= 2.0.
# * The group key is a DatetimeIndex (not an object array of datetime.date)
#   so the result's index has the same type as evening_crime_agg's index and
#   the later join aligns without relying on implicit coercion.
evening_temp_agg = (
    evening_temp
    .groupby(pd.DatetimeIndex(evening_temp.index.date))
    .mean(numeric_only=True)
    .rename_axis("Date")
)

# Persist the two columns of interest under human-readable headers.
evening_temp_agg[["TemperatureF", "PrecipitationIn"]]\
  .rename(columns = {"TemperatureF" : "Temperature [F]", "PrecipitationIn" : "Precipitation [In]"})\
  .to_csv("evening_temp_agg.csv")

In [None]:
# Line plot of the daily evening mean temperature against the date index.
evening_temp_agg.plot(
    y = "TemperatureF",
    use_index = True,
    title = 'Temperature [F]',
    legend = False,
)

## Now Prepare Crime on the Same Index

In [None]:
# Building the aggregate from the raw crime dump is a long operation, so the
# result is cached in evening_crime_agg.csv. The block runs only when that
# cache is missing (so a fresh checkout can still produce it), or when the
# flag below is set to True to force a rebuild.
REBUILD_CRIME_AGG = False

if REBUILD_CRIME_AGG or not os.path.exists("evening_crime_agg.csv"):

    # Parse 'Date' and use it as the index, named 'time' as before. The dict
    # form parse_dates={'time': ['Date']} is deprecated in recent pandas.
    crime_df = pd.read_csv("chicago_crime.csv", parse_dates = ['Date'], index_col = 'Date').rename_axis('time')
    crime_df = crime_df[crime_df['Primary Type'] != 'DECEPTIVE PRACTICE']

    # Count cases per calendar day, restricted to incidents at 18:00 or later.
    crime_cases = crime_df[['Case Number']]
    evening_crime = crime_cases[crime_cases.index.hour >= 18]
    evening_crime_agg = (
        evening_crime
        .groupby(pd.DatetimeIndex(evening_crime.index.date))
        .count()
        .rename_axis("Date")
    )

    evening_crime_agg.to_csv("evening_crime_agg.csv")


`# crime_df = crime_df.set_index(pd.DatetimeIndex(crime_df['Date']).tz_localize(pytz.timezone('US/Central')))`

In [None]:
# Read the cached per-day evening crime counts, indexed by the parsed 'Date' column.
evening_crime_agg = pd.read_csv(
    "evening_crime_agg.csv",
    parse_dates = ['Date'],
    index_col = 'Date',
)

## Now Merge Them

In [None]:
# Left-join the weather columns onto the crime counts (crime dates drive the
# result), then switch to presentation-friendly column names.
weather_cols = evening_temp_agg[['TemperatureF', 'PrecipitationIn']]
pretty_names = {
    'TemperatureF': 'Temperature [F]',
    'Case Number': 'Number of Crimes',
    'PrecipitationIn': 'Precipitation [In]',
}
merge_df = evening_crime_agg.join(weather_cols).rename(columns=pretty_names)

### Plot it.

In [None]:
# Overlay two regressions on one axes: all days (default colour) and the
# subset with non-zero precipitation (red).
nonzero_precip_days = merge_df[merge_df['Precipitation [In]'] != 0]

fig, ax = plt.subplots()
sns.regplot(x = 'Temperature [F]', y = "Number of Crimes", data = merge_df, ax = ax)
sns.regplot(x = 'Temperature [F]', y = "Number of Crimes", data = nonzero_precip_days, color = 'r', ax = ax)

In [None]:
# Two-sample t-test of evening crime counts: dry days vs. rainy days.
precip = merge_df['Precipitation [In]']
crimes = merge_df["Number of Crimes"]

# Days with no weather record have NaN precipitation after the left join.
# Since `NaN != 0` evaluates to True, a plain `!= 0` filter would count those
# days as rainy; require a real, non-zero measurement instead.
is_dry = precip == 0
is_rainy = precip.notna() & (precip != 0)

stats.ttest_ind(crimes[is_dry], crimes[is_rainy])

### Which Days of the Week are Worst?

In [None]:
# Tag each row with its weekday number taken from the date index.
merge_df['Day of Week'] = merge_df.index.dayofweek

# One kernel-density curve of the crime count per weekday, on shared axes.
# The loop variable is deliberately NOT called `df`: that name holds the
# weather DataFrame, and rebinding it here would break any later re-run of
# the cells that read it.
fig, ax = plt.subplots(figsize=(8,6))
for label, day_df in merge_df.groupby(merge_df.index.dayofweek):
    day_df.plot(kind = 'kde', sharex = True, x = 'Day of Week', y = 'Number of Crimes', ax=ax, label=label)

# Per-weekday means of every column, displayed as the cell output.
merge_df.groupby(merge_df.index.dayofweek).mean()

### Create Lagged Shift Column

In [None]:
# 3-row trailing mean of temperature (current row and the two before it).
temperature = merge_df['Temperature [F]']
trailing_mean = temperature.rolling(window = 3, center = False).mean()

# Mean of the three rows preceding / following the current row.
prev_three = trailing_mean.shift(1)
next_three = trailing_mean.shift(-3)

# Deviation of today's temperature from the preceding three rows.
merge_df['Lagged Shift [F]'] = temperature - prev_three

# Deviation of today's temperature from the average of the two surrounding windows.
merge_df['Split Window [F]'] = temperature - 0.5 * (next_three + prev_three)

# Discard rows with any missing value (including window edges).
merge_df = merge_df.dropna()
    
# merge_df = evening_crime_agg.join(evening_temp_agg[['Lagged Shift [F]', "PrecipitationIn"]])
# merge_df.rename(columns={'Case Number': 'Number of Crimes'}, inplace=True)
# merge_df.plot(kind='scatter', x = 'Lagged Shift [F]', y = 'Number of Crimes')

In [None]:
# For each temperature-shift feature, draw one figure with two overlaid
# regressions: all days, and days with non-zero precipitation.
nonzero_precip_days = merge_df[merge_df['Precipitation [In]'] != 0]

# Figure 1: split-window shift.
fig, ax = plt.subplots()
sns.regplot(x = 'Split Window [F]', y = "Number of Crimes", data = merge_df, ax = ax)
sns.regplot(x = 'Split Window [F]', y = "Number of Crimes", data = nonzero_precip_days, ax = ax)

# Figure 2: lagged shift.
fig, ax = plt.subplots()
sns.regplot(x = 'Lagged Shift [F]', y = "Number of Crimes", data = merge_df, ax = ax)
sns.regplot(x = 'Lagged Shift [F]', y = "Number of Crimes", data = nonzero_precip_days, ax = ax)

In [None]:
# The formula interface needs identifier-like column names, so shorten the
# two columns used in the model.
formula_names = {'Lagged Shift [F]' : 'ShiftF', 'Number of Crimes' : 'NCrimes'}
merge_df = merge_df.rename(columns=formula_names)

# Ordinary least squares: crime count explained by the lagged temperature shift.
ols_spec = sm.ols(formula = 'NCrimes ~ ShiftF', data = merge_df)
model = ols_spec.fit()
model.summary()

In [None]:
# Same regression via scipy; print the slope divided by its standard error.
shift_values = merge_df['ShiftF']
crime_counts = merge_df['NCrimes']
slope, intercept, r_value, p_value, std_err = stats.linregress(shift_values, crime_counts)
slope_over_stderr = slope/std_err
print(slope_over_stderr)

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal) of
# the two shift features and the crime count.
# `pd.tools.plotting` was removed from pandas; the function is now exposed as
# `pd.plotting.scatter_matrix`.
pd.plotting.scatter_matrix(merge_df[['ShiftF', 'Split Window [F]', 'NCrimes']])