# Crime Prediction using Tweets and KDE

In [None]:
%matplotlib inline

import os
import glob
import itertools
import functools
import collections

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm_notebook, tqdm

from utils.consts import START_DATE, END_DATE, \
                         RAW_CRIMES_DATA_PATH, PROCESSED_CRIMES_DATA_PATH, \
                         RAW_TWEETS_DATA_WILDCARD_PATH, PROCESSED_TWEETS_DATA_PATH, \
                         CSV_DATE_FORMART

from utils.lda import print_top_words_LDA, print_top_words_LDA, get_topic_top_words_LDA

from utils.surveillance import generate_all_data_surveillance_data, \
                               generate_one_step_datasets, generate_surveillance_data, calc_AUCs
    
from utils.visualization import plot_contour, plot_scatter, plot_imshow, plot_log_reg_coef, plot_surveillance_data


In [None]:
import warnings
# Suppress every warning for the rest of the notebook session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries used below — consider narrowing the filter when debugging.
warnings.simplefilter('ignore')

![title](./ANLP-Project-Pipeline.png)

## Research Time Frame

In [None]:
# Show the research time frame, using the bounds imported from utils.consts.
print(START_DATE, '--->', END_DATE)

## Data Sources & Preprocessing

### Chicago Crimes Incidents

In [None]:
# Run the crimes preprocessing script only when the processed file is missing,
# so re-running the notebook reuses the existing output instead of rebuilding it.
if not os.path.exists(PROCESSED_CRIMES_DATA_PATH):
    !python3 ./preprocess_crimes_data.py {RAW_CRIMES_DATA_PATH}  {PROCESSED_CRIMES_DATA_PATH}

In [None]:
# Load the preprocessed crime incidents.
crimes_data = pd.read_csv(PROCESSED_CRIMES_DATA_PATH)

# Parse the timestamp column with the project's CSV date format, then
# normalize to midnight so every incident is keyed by its calendar day.
crimes_data['timestamp'] = pd.to_datetime(
    crimes_data['timestamp'],
    format=CSV_DATE_FORMART,
).dt.normalize()

In [None]:
# Number of rows in the loaded crimes table.
len(crimes_data)

In [None]:
# Earliest and latest (day-normalized) timestamps present in the crimes table.
crimes_data['timestamp'].agg(['min', 'max'])

### Tweets

In [None]:
# Run the tweets preprocessing script only when the processed file is missing.
# `-W ignore` silences Python warnings inside the child interpreter.
if not os.path.exists(PROCESSED_TWEETS_DATA_PATH):
    !python3 -W ignore ./preprocess_tweets_data.py {RAW_TWEETS_DATA_WILDCARD_PATH} {PROCESSED_TWEETS_DATA_PATH}

In [None]:
import ast

# Load the preprocessed tweets and normalize timestamps to midnight so every
# tweet is keyed by its calendar day.
tweets_data = pd.read_csv(PROCESSED_TWEETS_DATA_PATH)
tweets_data['timestamp'] = pd.to_datetime(tweets_data['timestamp'], format=CSV_DATE_FORMART).dt.normalize()

# The 'tokens' column is read back from CSV as text and must be turned into
# Python objects again. ast.literal_eval only accepts Python literals, so a
# malformed or malicious cell cannot execute arbitrary code the way eval() would.
tweets_data['tokens'] = tweets_data['tokens'].apply(ast.literal_eval)

In [None]:
# Number of rows in the loaded tweets table.
len(tweets_data)

In [None]:
# Earliest and latest (day-normalized) timestamps present in the tweets table.
tweets_data['timestamp'].agg(['min', 'max'])

### Enriching Tweets with Sentiment Analysis

In [None]:
from utils.sentiment.sentiment import calculate_sentiment_tweet

In [None]:
# Score every tweet: first rebuild a space-separated text from its token
# list, then pass that text to the project's sentiment scorer.
tweets_data['sentiment'] = (
    tweets_data['tokens']
    .apply(' '.join)
    .apply(calculate_sentiment_tweet)
)

### One Month

In [None]:
# Build one train/evaluation pair starting at START_DATE.
# NOTE(review): 31 is presumably the window length in days (see the
# "one month" heading) — confirm against generate_one_step_datasets.
train_dataset, evaluation_dataset = generate_one_step_datasets(
    crimes_data,
    tweets_data,
    START_DATE,
    31,
)

## KDE

In [None]:
# Scatter-plot the latitude/longitude of the rows of X selected by Y.
# NOTE(review): presumably Y is a boolean mask marking crime rows — confirm.
plot_scatter(train_dataset['X'][train_dataset['Y']][['latitude', 'longitude']])

In [None]:
# Contour plot of the training set's 'KDE' entry.
plot_contour(train_dataset['KDE'])

### Sentiment

In [None]:
# Normalized histogram (with seaborn's default density curve) of the training
# set's 'SENTIMENT' values. Labels and title are set on the current axes
# before seaborn draws onto them; keep this order, since seaborn may set the
# x label itself.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the pinned version before upgrading.
plt.xlabel('sentiment value')
plt.ylabel('sentiment count')
plt.title('Sentiment Histogram & Distribution over Geo Documents')
sns.distplot(train_dataset['SENTIMENT'], norm_hist=True)

In [None]:
# plot_imshow(train_dataset, 'SENTIMENT')

## LDA

In [None]:
# Print the top words of the fitted LDA model's topics.
# NOTE(review): 5 is presumably the number of words shown per topic — confirm.
print_top_words_LDA(train_dataset['LDA']['model'], train_dataset['LDA']['vocabulary'], 5)

In [None]:
# Row indices of components_ with the five largest per-row standard
# deviations, listed in ascending order of that deviation.
# NOTE(review): assumes each row of components_ is one topic — confirm.
np.argsort(train_dataset['LDA']['model'].components_.std(axis=1))[-5:]


In [None]:
# Pick the components_ row with the largest standard deviation as the example.
exmple_topic_id = np.argsort(train_dataset['LDA']['model'].components_.std(axis=1))[-1]
print(exmple_topic_id)
# Build the name 'T' + zero-padded 3-digit id (e.g. 'T007').
# NOTE(review): presumably matches the topic feature columns in the dataset.
example_topic_column_name = 'T{:03}'.format(exmple_topic_id)

In [None]:
# Show the top words of the example topic chosen earlier in the notebook.
# NOTE(review): 15 is presumably the number of words returned — confirm.
get_topic_top_words_LDA(exmple_topic_id, train_dataset['LDA']['model'], train_dataset['LDA']['vocabulary'], 15)

In [None]:
# plot_imshow(train_dataset['X'][~train_dataset['Y']], example_topic_column_name)

## Prediction

In [None]:
# Produce the surveillance data and the per-model threat datasets from the
# train/evaluation pair built earlier in the notebook.
surveillance_data, threat_datasets = generate_surveillance_data(train_dataset,
                                                                evaluation_dataset)

### Logistic Regression Coefficients

In [None]:
# Plot the regression coefficients for the 'SENTIMENT' entry of threat_datasets.
plot_log_reg_coef(threat_datasets, 'SENTIMENT')


In [None]:
# Plot the regression coefficients for the 'LDA' entry of threat_datasets.
plot_log_reg_coef(threat_datasets, 'LDA')

In [None]:
# Plot the regression coefficients for the 'SENTIMENT+LDA' entry of threat_datasets.
plot_log_reg_coef(threat_datasets, 'SENTIMENT+LDA')



In [None]:
# Inspect the top words of topic 308.
# NOTE(review): 308 is hard-coded — presumably read off the coefficient plots
# of a specific run; it will point at a different topic whenever the LDA
# model is refitted. Verify before relying on this cell.
get_topic_top_words_LDA(308, train_dataset['LDA']['model'], train_dataset['LDA']['vocabulary'], 15)


### Threat Maps

In [None]:
# plot_imshow(threat_datasets['KDE']['df'], 'KDE')




In [None]:
# plot_imshow(threat_datasets['SENTIMENT']['df'], 'SENTIMENT')




In [None]:
# plot_imshow(threat_datasets['LDA']['df'], 'LDA')


In [None]:
# plot_imshow(threat_datasets['SENTIMENT+LDA']['df'], 'SENTIMENT+LDA')


## Surveillance Plot & AUC

In [None]:
# Turn each row into a cumulative fraction: running sum along axis 1 divided
# by that row's total. `[:, None]` adds a trailing axis so the division
# broadcasts row by row.
# NOTE(review): this indexing assumes the row totals are a NumPy array; a
# pandas Series rejects `[:, None]` in recent pandas versions — confirm the type.
normalized_surveillance_data = surveillance_data.cumsum(axis=1) / surveillance_data.sum(axis=1)[:, None]

# Plot the normalized curves and compute their AUCs, both labelled by the
# keys of threat_datasets.
plot_surveillance_data(normalized_surveillance_data, threat_datasets.keys())
calc_AUCs(normalized_surveillance_data, threat_datasets.keys())

## Full training

In [None]:
# Run the surveillance pipeline over all the data.
# NOTE(review): `if True:` looks like a manual on/off switch for this
# (presumably expensive) step; the cell after it needs both results.
if True:
    agg_surveillance_data, all_threat_datasets = generate_all_data_surveillance_data(crimes_data, tweets_data, 31)
    
    
    
    

In [None]:
# Plot the aggregated surveillance curves and compute their AUCs.
# all_threat_datasets is indexed by position, so the labels come from the keys
# of its first element. The original plot call wrote `.keys[0]()`, which
# subscripts the bound method and raises TypeError; both calls now use the
# same expression.
plot_surveillance_data(agg_surveillance_data, all_threat_datasets[0].keys())
calc_AUCs(agg_surveillance_data, all_threat_datasets[0].keys())