# Crime Prediction using Tweets and KDE

In [8]:
%matplotlib inline

import ast
import functools
import glob
import itertools
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm_notebook, tqdm


from utils.consts import START_DATE, END_DATE, KDE_BANDWITH, LDA_PARAMS, LDA_TOPICS, \
                         CSV_DATE_FORMART


from utils.geo import latlng2LDA_topics_chicago, latlng2LDA_sentment_chicago, \
                      generate_chicago_threat_grid_list, \
                      enrich_with_chicago_grid_200, \
                      CHICAGO_UTM_COORDS, FALSE_LABLE_DATASET_CELL_SIZE, \
                      N_CHICAGO_THREAT_GRID_LIST

In [9]:
# Crimes: input and output of ./preprocess_crimes_data.py.
RAW_CRIMES_DATA_PATH = os.path.join('data', 'raw', 'Crimes_-_2001_to_present.csv')
PROCESSED_CRIMES_DATA_PATH = os.path.join('data', 'processed', 'crime_data.csv')

# Tweets: input pattern and output of ./preprocess_tweets_data.py.
# The pattern is wrapped in literal double quotes because it is interpolated
# into a shell command line; presumably so that the script, not the shell,
# expands the wildcard.
RAW_TWEETS_DATA_WILDCARD_PATH = '"{}"'.format(os.path.join('data', 'raw', 'tweets', '*.json'))
PROCESSED_TWEETS_DATA_PATH = os.path.join('data', 'processed', 'tweets_data.csv')

![title](./ANLP-Project-Pipeline.png)

## Research Time Frame

In [10]:
# Show the first and last date of the research window (constants from utils.consts).
print(START_DATE, '--->', END_DATE)

2017-12-08 ---> 2018-02-19


## Data Sources & Preprocessing

### Chicago Crimes Incidents

In [11]:
# Run the crimes preprocessing script only when its output CSV does not exist yet,
# so re-running the notebook reuses the cached file.
if not os.path.exists(PROCESSED_CRIMES_DATA_PATH):
    !python3 ./preprocess_crimes_data.py {RAW_CRIMES_DATA_PATH}  {PROCESSED_CRIMES_DATA_PATH}

In [12]:
# Load the preprocessed crimes and truncate every timestamp to midnight,
# so that records are compared by calendar day.
crimes_data = (
    pd.read_csv(PROCESSED_CRIMES_DATA_PATH)
      .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'],
                                                     format=CSV_DATE_FORMART).dt.normalize())
)

In [13]:
# Number of crime records loaded.
len(crimes_data)

10902

In [14]:
# Earliest and latest crime date; compare with the research time frame printed above.
crimes_data['timestamp'].agg(['min', 'max'])

min   2017-12-08
max   2018-02-19
Name: timestamp, dtype: datetime64[ns]

### Tweets

In [15]:
# Run the tweets preprocessing script only when its output CSV does not exist yet.
# `-W ignore` silences Python warnings emitted by the script.
if not os.path.exists(PROCESSED_TWEETS_DATA_PATH):
    !python3 -W ignore ./preprocess_tweets_data.py {RAW_TWEETS_DATA_WILDCARD_PATH} {PROCESSED_TWEETS_DATA_PATH}

In [16]:
# Load the preprocessed tweets and truncate every timestamp to midnight,
# so that records are compared by calendar day.
tweets_data = pd.read_csv(PROCESSED_TWEETS_DATA_PATH)
tweets_data['timestamp'] = pd.to_datetime(tweets_data['timestamp'], format=CSV_DATE_FORMART).dt.normalize()
# The 'tokens' column is read back as text and parsed into Python objects
# (presumably lists of token strings - confirm against preprocess_tweets_data.py).
# ast.literal_eval accepts only Python literals, so unlike eval() it cannot
# execute code that ends up in the CSV.
tweets_data['tokens'] = tweets_data['tokens'].apply(ast.literal_eval)

In [17]:
# Number of tweets loaded.
len(tweets_data)

79634

In [18]:
# Earliest and latest tweet date; compare with the research time frame printed above.
tweets_data['timestamp'].agg(['min', 'max'])

min   2017-12-08
max   2018-02-19
Name: timestamp, dtype: datetime64[ns]

## Feature Extraction Functions

### KDE

In [19]:
# Public import path: `sklearn.neighbors.kde` is a private module that newer
# scikit-learn releases no longer expose, while this form works on old and new.
from sklearn.neighbors import KernelDensity

def train_KDE_model(train_df, bandwith=KDE_BANDWITH):
    '''
    Fit a Gaussian kernel density estimator on point locations.

    Input:
    train_df: data frame with 'latitude' and 'longitude' columns; they are
              passed to the estimator in that order.
    bandwith: kernel bandwidth forwarded to KernelDensity
              (defaults to KDE_BANDWITH from utils.consts).

    Output:
    The fitted KernelDensity model.
    '''
    # NOTE(review): scikit-learn defines the haversine metric on
    # (latitude, longitude) pairs expressed in radians. Confirm the units of
    # the columns passed here; if they are degrees, both this fit and every
    # later score_samples() call need the same conversion.
    kde = KernelDensity(bandwidth=bandwith,
                        metric='haversine',
                        kernel='gaussian',
                        algorithm='ball_tree')

    kde.fit(train_df[['latitude', 'longitude']])

    return kde

### TODO - visualization

### Sentiment Analysis

In [20]:
from utils.sentiment.sentiment import calculate_sentiment_tweet

Adding ngram features : ngram_range 2
Add bigram sentiment scores
Add unigram sentiment scores


In [21]:
# Score every tweet: its tokens are re-joined with single spaces and the
# resulting string is passed to calculate_sentiment_tweet.
tweets_data['sentiment'] = tweets_data['tokens'].apply(lambda x: calculate_sentiment_tweet(' '.join(x)))

### LDA

In [22]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def coalesce(token):
    '''
    Shorten every run of a repeated character to at most two occurrences,
    e.g. "paaaarty" -> "paarty". Runs of one or two characters are kept as is.
    '''
    return ''.join(
        ''.join(itertools.islice(run, 2))
        for _, run in itertools.groupby(token)
    )

def preprocess_tweet_for_LDA(raw_tokens):
    '''
    Normalize one already-tokenized document before it is counted for LDA.

    Input:
    raw_tokens: iterable of token strings (not a single raw string).

    Output:
    list of processed tokens, where
      - every token is lowercased
      - hashtags (#...) and mentions (@...) are dropped
      - any http:// or https:// token is replaced by "url"
      - runs of 3+ repeated chars are cut to 2 of them. eg paaaarty -> paarty
    '''

    processed_tokens = []
    for raw_token in raw_tokens:
        # Lowercase here: CountVectorizer does not apply its own lowercasing
        # when it is given a custom preprocessor, so without this step the
        # stop-word filter and the vocabulary would be case-sensitive.
        token = raw_token.lower()

        if token.startswith(("@", "#")):
            continue

        if token.startswith(("https://", "http://")):
            processed_tokens.append("url")
        else:
            processed_tokens.append(coalesce(token))

    return processed_tokens

def train_LDA_model(docs, params=LDA_PARAMS, preprocessor=preprocess_tweet_for_LDA):
    '''
    Fit an LDA topic model on tokenized documents.

    Input:
    docs: sequence of documents, each one a list of tokens. The vectorizer's
          tokenizer is the identity function, so whatever `preprocessor`
          returns is used directly as the token list.
    params: keyword arguments forwarded to LatentDirichletAllocation.
    preprocessor: callable applied to each document before counting.

    Output:
    (lda_model, doc_topics, vocabulary)
      lda_model:  the fitted LatentDirichletAllocation
      doc_topics: result of lda_model.transform on the training matrix
                  (one row per document)
      vocabulary: list of vocabulary terms in the vectorizer's column order
    '''

    vectorizer = CountVectorizer(stop_words="english",
                                 preprocessor=preprocessor,
                                 tokenizer=lambda x: x)

    lda_train_data = vectorizer.fit_transform(docs)

    lda_model = LatentDirichletAllocation(**params)

    lda_model.fit(lda_train_data)

    doc_topics = lda_model.transform(lda_train_data)

    # get_feature_names() no longer exists in newer scikit-learn releases;
    # prefer its replacement when available and keep the old call as fallback.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = list(vectorizer.get_feature_names_out())
    else:
        vocabulary = vectorizer.get_feature_names()

    return lda_model, doc_topics, vocabulary

In [23]:
def get_topic_top_words_LDA(topic_index, lda_model, vocabulary, n_top_words):
    '''
    Return the n_top_words vocabulary entries that carry the largest weights
    in row `topic_index` of lda_model.components_, heaviest first.
    '''
    weights = lda_model.components_[topic_index]
    heaviest_first = weights.argsort()[::-1]
    return [vocabulary[word_index] for word_index in heaviest_first[:n_top_words]]
            
def print_top_words_LDA(lda_model, vocabulary, n_top_words):
    '''
    Print one line per topic in the form "Topic #i: w1 | w2 | ...", listing
    the topic's n_top_words heaviest words, followed by one empty line.
    '''
    n_topics = len(lda_model.components_)
    for topic_index in range(n_topics):
        top_words = get_topic_top_words_LDA(topic_index, lda_model, vocabulary, n_top_words)
        print("Topic #%d: " % topic_index + " | ".join(top_words))
    print()
    
# print_top_words_LDA(tweets_lda_model, tweets_vocabulary, 5)    



## Functions for Generating Train & Evaluation Datasets

In [38]:
def generate_tweets_docs(tweets_data):
    '''
    Group tweets by grid cell.

    Input:
    tweets_data: data frame with 'latitude_index', 'longitude_index' and
                 'tokens' columns.

    Output:
    (tweet_docs, tweet_docs_groupby)
      tweet_docs:         Series indexed by (latitude_index, longitude_index),
                          sorted by that index; each value is the list of the
                          'tokens' entries of the tweets in that cell.
      tweet_docs_groupby: the underlying groupby object, so callers can
                          aggregate other columns over the same cells.
    '''
    # The keys must be a list: pandas treats a tuple as ONE (tuple-valued) key,
    # which raises KeyError on current versions.
    tweet_docs_groupby = tweets_data.groupby(['latitude_index', 'longitude_index'])
    tweet_docs = tweet_docs_groupby['tokens'].apply(list).sort_index()
    return tweet_docs, tweet_docs_groupby

In [39]:
def filter_dataset_by_date_window(dataset, start_end, end_date):
    '''
    Keep the rows whose 'timestamp' lies in [start_end, end_date], both ends
    included. Note that `start_end` is the window's START.
    '''
    inside_window = dataset['timestamp'].between(start_end, end_date)
    return dataset[inside_window]
    

def build_datasets_by_date_window(crimes_data, tweets_data, start_train_date, n_train_days):
    '''
    Split the data into one training window and the single day that follows it.

    The training window is [start_train_date, start_train_date + n_train_days],
    both ends included; the evaluation day is the day after the window ends.

    Output:
    (crimes_train_dataset, tweets_train_dataset, crimes_evaluation_dataset)
    '''
    window_start = pd.to_datetime(start_train_date)
    window_end = window_start + pd.DateOffset(n_train_days)
    day_after_window = window_end + pd.DateOffset(1)

    train_window = (window_start, window_end)

    return (filter_dataset_by_date_window(crimes_data, *train_window),
            filter_dataset_by_date_window(tweets_data, *train_window),
            filter_dataset_by_date_window(crimes_data, day_after_window, day_after_window))


def generate_one_step_train_dataset(crimes_dataset, tweets_dataset):
    '''
    Build the feature table and the fitted models for one training window.

    Input:
    crimes_dataset: crimes of the training window; its 'latitude' and
                    'longitude' columns are used.
    tweets_dataset: tweets of the training window; needs 'latitude_index',
                    'longitude_index', 'tokens' and 'sentiment' columns.

    Output:
    dict with the keys
      'X'         : data frame with 'latitude_index', 'longitude_index',
                    'KDE', 'SENTIMENT' and the LDA_TOPICS columns
      'Y'         : boolean 'crime' column (True for crime points, False for
                    the threat-grid points)
      'KDE'       : the fitted crimes KDE model
      'SENTIMENT' : mean tweet sentiment per grid cell
      'LDA'       : {'model': fitted LDA model, 'vocabulary': its vocabulary}
    '''
    crimes_kde_model = train_KDE_model(crimes_dataset)

    tweets_docs, tweet_docs_groupby = generate_tweets_docs(tweets_dataset)

    average_sentiment_docs = tweet_docs_groupby['sentiment'].agg('mean')

    latlng2LDA_tweet_sentiment_chicago = functools.partial(latlng2LDA_sentment_chicago,
                                                           average_sentiment_docs=average_sentiment_docs)

    # One LDA document per grid cell: the token lists of all tweets of the
    # cell, concatenated. chain.from_iterable flattens in linear time
    # (sum(list_of_lists, []) copies the accumulator on every step).
    cell_documents = (tweets_docs
                      .apply(lambda cell_tweets: list(itertools.chain.from_iterable(cell_tweets)))
                      .tolist())
    tweets_lda_model, doc_topics, tweets_vocabulary = train_LDA_model(cell_documents)

    latlng2LDA_tweet_topics_chicago = functools.partial(latlng2LDA_topics_chicago,
                                                        doc_topics=doc_topics,
                                                        docs=tweets_docs)

    # Positive examples come from the crimes passed in (the training window),
    # NOT from the notebook-level `crimes_data`: the full data set also holds
    # the evaluation day and would leak it into training.
    crime_points = enrich_with_chicago_grid_200(crimes_dataset[['latitude', 'longitude']]).assign(crime=True)
    grid_points = generate_chicago_threat_grid_list().assign(crime=False)
    train_dataset = pd.concat([crime_points, grid_points], axis=0)

    # .copy() makes the column assignments below write to an independent frame
    # instead of a slice (avoids pandas' SettingWithCopyWarning).
    train_dataset = train_dataset[['latitude', 'longitude', 'latitude_index', 'longitude_index', 'crime']].copy()

    # .values works on old and new pandas; .as_matrix() no longer exists.
    train_dataset['KDE'] = crimes_kde_model.score_samples(
        train_dataset[['latitude', 'longitude']].values
    )

    # NOTE(review): this assignment expects the helper to yield exactly one
    # value per row. A recorded run of this notebook failed with
    # "Wrong number of items passed 42, placement implies 1", presumably here;
    # verify what utils.geo.latlng2LDA_sentment_chicago returns.
    train_dataset['SENTIMENT'] = train_dataset.apply(lambda row: pd.Series(latlng2LDA_tweet_sentiment_chicago(
                                                                    row['latitude'],
                                                                    row['longitude'])),
                                                     axis=1)

    train_dataset[LDA_TOPICS] = train_dataset.apply(lambda row: pd.Series(latlng2LDA_tweet_topics_chicago(
                                                                    row['latitude'],
                                                                    row['longitude'])),
                                                    axis=1)

    features_cols = ['KDE', 'SENTIMENT'] + LDA_TOPICS

    train_dataset = {
                    'X'         : train_dataset[['latitude_index', 'longitude_index'] + features_cols],
                    'Y'         : train_dataset['crime'],
                    'KDE'       : crimes_kde_model,
                    'SENTIMENT' : average_sentiment_docs,
                    'LDA'       : {'model': tweets_lda_model, 'vocabulary': tweets_vocabulary}
    }

    return train_dataset


def generate_one_step_evaluation_dataset(crimes_evaluation_dataset):
    '''
    Pass the evaluation-day crimes through enrich_with_chicago_grid_200 and
    keep only the 'latitude_index' and 'longitude_index' columns of the result.
    '''
    grid_index_columns = ['latitude_index', 'longitude_index']
    return enrich_with_chicago_grid_200(crimes_evaluation_dataset)[grid_index_columns]


def generate_one_step_datasets(crimes_data, tweets_data, start_train_date, n_train_days):
    '''
    Produce the (train_dataset, evaluation_dataset) pair for the training
    window that starts at start_train_date and spans n_train_days.
    '''
    crimes_train, tweets_train, crimes_evaluation = build_datasets_by_date_window(
        crimes_data, tweets_data, start_train_date, n_train_days)

    return (generate_one_step_train_dataset(crimes_train, tweets_train),
            generate_one_step_evaluation_dataset(crimes_evaluation))


In [40]:
from sklearn.linear_model import LogisticRegression

def generate_threat_kde_dataset(train_dataset):
    '''
    Rank the non-crime rows of train_dataset['X'] (the rows where
    train_dataset['Y'] is False) by their 'KDE' value, highest first.

    Output:
    list of (latitude_index, longitude_index) tuples in ranking order.
    '''
    grid_index_columns = ['latitude_index', 'longitude_index']
    is_grid_cell = ~train_dataset['Y']

    ranked_kde = (train_dataset['X'][is_grid_cell]
                  [grid_index_columns + ['KDE']]
                  .set_index(grid_index_columns)['KDE']
                  .sort_values(ascending=False))

    return list(ranked_kde.index)

def generate_threat_logreg_dataset(train_dataset):
    '''
    Rank the non-crime rows of train_dataset['X'] by the log-probability of
    the crime class under a logistic regression fitted on all rows.

    Input:
    train_dataset: dict with 'X' (must contain 'latitude_index',
                   'longitude_index', 'KDE', 'SENTIMENT' and the LDA_TOPICS
                   columns) and 'Y' (boolean crime labels).

    Output:
    list of (latitude_index, longitude_index) tuples, most threatened first.
    '''
    # One feature list for both fit and predict: the model must be scored
    # with exactly the columns it was trained on, and the sentiment column
    # is named 'SENTIMENT' in the feature table.
    features_cols = ['KDE', 'SENTIMENT'] + LDA_TOPICS

    is_crime_count = train_dataset['Y'].value_counts()
    # NOTE(review): the negative/positive ratio is used as C, which is
    # scikit-learn's inverse regularization strength, not a class weight.
    # Confirm this is intended.
    logreg_C = is_crime_count[False] / is_crime_count[True]
    logreg = LogisticRegression(C=logreg_C)
    logreg.fit(train_dataset['X'][features_cols], train_dataset['Y'])

    # .copy() so that adding the 'logreg' column does not write into a slice
    # of train_dataset['X'].
    threat_grid_cells = train_dataset['X'][~train_dataset['Y']].copy()
    # Column 1 is the second class in logreg.classes_, i.e. True (crime).
    threat_grid_cells['logreg'] = logreg.predict_log_proba(threat_grid_cells[features_cols])[:, 1]

    logreg_values = threat_grid_cells[['latitude_index', 'longitude_index', 'logreg']]
    threat_logreg_df = logreg_values.set_index(['latitude_index', 'longitude_index'])['logreg']
    threat_logreg_df = threat_logreg_df.sort_values(ascending=False)

    return list(threat_logreg_df.index)

def generate_threat_datasets(train_dataset):
    '''
    Return the cell rankings of every threat model as a list:
    [KDE ranking, logistic-regression ranking].
    '''
    threat_model_rankers = (generate_threat_kde_dataset, generate_threat_logreg_dataset)
    return [rank_cells(train_dataset) for rank_cells in threat_model_rankers]

In [41]:
def generate_surveillance_data(train_dataset, evaluation_dataset):
    '''
    Count, for every ranking, how many evaluation crimes fall in each ranked cell.

    Output:
    array of shape (3, N_CHICAGO_THREAT_GRID_LIST). Entry [i, j] is the number
    of evaluation crimes in the j-th cell of ranking i, where
      row 0 = KDE ranking, row 1 = logistic-regression ranking,
      row 2 = gold ranking (cells ordered by their real crime count).
    Positions past the end of a ranking stay 0.
    '''
    counts_per_ranking = np.zeros((3, N_CHICAGO_THREAT_GRID_LIST))

    rankings = generate_threat_datasets(train_dataset)

    grid_index_columns = ['latitude_index', 'longitude_index']
    crime_counts = (evaluation_dataset
                    .groupby(grid_index_columns)
                    .size()
                    .sort_values(ascending=False))

    # The real crime occurrence, most affected cell first, is the gold ranking.
    rankings.append(list(crime_counts.index))

    for ranking_row, ranked_cells in enumerate(rankings):
        for rank, (lat_index, lng_index) in enumerate(ranked_cells):
            counts_per_ranking[ranking_row, rank] = crime_counts.get((lat_index, lng_index), 0)

    return counts_per_ranking
    
    
def generate_one_step_surveillance_data(crimes_data, tweets_data, start_train_date, n_train_days):
    '''
    Build the train/evaluation datasets for one training window and turn
    them into the surveillance array of that step.
    '''
    one_step_datasets = generate_one_step_datasets(crimes_data,
                                                   tweets_data,
                                                   start_train_date,
                                                   n_train_days)
    return generate_surveillance_data(*one_step_datasets)

    
def generate_all_data_surveillance_data(crimes_data, tweets_data, n_train_days, max_steps=3):
    '''
    Compute the surveillance array for successive training windows.

    Input:
    crimes_data, tweets_data: the full data sets.
    n_train_days: length parameter of each training window.
    max_steps: process at most this many start dates, beginning at
               START_DATE. The default of 3 keeps the quick-run behaviour
               this function always had; pass None to process every start
               date that leaves room for a window plus its evaluation day.

    Output:
    list of (start_train_date, surveillance_array) tuples, one per step.
    The arrays are returned as computed; no cumulative sum or normalization
    is applied here.
    '''
    # Drop the last n_train_days + 1 dates: a window starting there would
    # need training or evaluation days after END_DATE.
    start_dates = pd.date_range(START_DATE, END_DATE)[:-(n_train_days + 1)]
    if max_steps is not None:
        start_dates = start_dates[:max_steps]

    surveillance_data = []
    for start_train_date in tqdm(start_dates):
        step_data = generate_one_step_surveillance_data(crimes_data,
                                                        tweets_data,
                                                        start_train_date,
                                                        n_train_days)
        surveillance_data.append((start_train_date, step_data))

    return surveillance_data

def plot_surveillance_data(surveillance_data):
    '''Placeholder: plotting is not implemented yet, the body is a no-op.'''
    pass


In [43]:
# Run the pipeline for a single step: the training window starts at START_DATE
# and n_train_days is 2.
train_dataset, evaluation_dataset = generate_one_step_datasets(crimes_data, tweets_data, START_DATE, 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


ValueError: Wrong number of items passed 42, placement implies 1

In [None]:

# Experiment: retrain LDA with 500 topics instead of the configured number.
# NOTE(review): `tweets_docs` is not defined at notebook level in any cell
# shown here (it only exists as a local inside
# generate_one_step_train_dataset), so this cell fails on a fresh kernel.
# Define it first, presumably via generate_tweets_docs(...) - confirm on
# which tweets subset.
lda_params = LDA_PARAMS.copy()
lda_params['n_components'] = 500
tweets_lda_model, doc_topics, tweets_vocabulary = train_LDA_model((tweets_docs
                                                                   .apply(lambda r: sum(r, []))
                                                                   .tolist()),
                                                                 params=lda_params)


In [None]:
# One weight vector per topic of the fitted model. Iterate over the model's
# own components rather than LDA_PARAMS['n_components']: the model may have
# been trained with a different number of topics than the configured default.
tt = list(tweets_lda_model.components_)

In [None]:
# Inspect the word-weight vector of the topic at index 1.
tweets_lda_model.components_[1]

In [None]:
# Count the vectors in `tt` whose entries are all the same value
# (exactly one distinct element).
[len(set(x)) for x in tt].count(1)

In [None]:
# Top 6 words of every topic of the notebook-level LDA model.
print_top_words_LDA(tweets_lda_model, tweets_vocabulary, 6)

In [None]:
# Top 6 words of every topic of the LDA model stored in train_dataset.
print_top_words_LDA(train_dataset['LDA']['model'], train_dataset['LDA']['vocabulary'], 6)

In [None]:
# Summary statistics of the crimes' 'longitude_index' column.
# NOTE(review): assumes the processed crimes CSV already has this column - confirm.
crimes_data['longitude_index'].describe()

In [None]:
# Inline version of the logistic-regression ranking that keeps `logreg` and
# `threat_logreg_df` at notebook level for inspection. It uses the KDE and
# LDA topic features only (no sentiment column).
is_crime_count = train_dataset['Y'].value_counts()
logreg_C = is_crime_count[False] / is_crime_count[True]
logreg = LogisticRegression(C=logreg_C)
logreg.fit(train_dataset['X'][['KDE'] + LDA_TOPICS], train_dataset['Y'])

# .copy() so that adding the 'logreg' column does not write into a slice of
# train_dataset['X'] (avoids pandas' SettingWithCopyWarning).
threat_grid_cells = train_dataset['X'][~train_dataset['Y']].copy()
threat_grid_cells['logreg'] = logreg.predict_log_proba(threat_grid_cells[['KDE'] + LDA_TOPICS])[:, 1]

logreg_values = threat_grid_cells[['latitude_index', 'longitude_index', 'logreg']]
threat_logreg_df = logreg_values.set_index(['latitude_index', 'longitude_index'])['logreg']
threat_logreg_df = threat_logreg_df.sort_values(ascending=False)

In [None]:
# Plot every coefficient relative to the first one (the first fitted feature).
plt.plot(((logreg.coef_[0]/logreg.coef_[0][0])))

In [None]:
# Distribution of the fitted coefficients.
# NOTE(review): distplot is presumably deprecated in recent seaborn releases -
# confirm against the installed version before relying on this cell.
sns.distplot(logreg.coef_[0])

In [None]:
# Show the fitted coefficient vector (a stray "XXX" suffix made this cell a
# syntax error).
logreg.coef_[0]

In [None]:
# NOTE(review): scratch cell that depends on hidden kernel state. `q` is not
# defined in any earlier cell shown here, and the cell is not idempotent:
# each run rebinds `q` to an element of its previous value.
# Presumably `q` first held the list returned by
# generate_all_data_surveillance_data, making w[0][1] the first step's
# array - confirm.
w = q
q = w[0][1]

In [None]:
# Keep another reference to `w` (same object, no copy is made).
ww = w

In [None]:
# Sum of the element-wise difference between the first two entries of `ww`.
# NOTE(review): requires ww[0] and ww[1] to support subtraction (e.g. arrays);
# what `ww` holds depends on earlier scratch cells - confirm.
(ww[0]-ww[1]).sum()

In [None]:
# Plot, per row of `q`, the cumulative share of the row total reached along
# the columns, sampled at about 100 evenly spaced columns.
# NOTE(review): presumably `q` is one step's surveillance array with one row
# per ranking - confirm.

# At least 1: with fewer than 100 columns the integer division gives 0, and a
# slice step of 0 raises ValueError.
step = max(1, int(q.shape[1] / 100))
qq = (q.cumsum(axis=1) / q.sum(axis=1)[:, None])[:, ::step]

plt.plot(qq[0])
# qq is already subsampled; striding this row a second time would put it on a
# different x scale than the first curve.
plt.plot(qq[1])
#plt.plot(qq[2])
plt.xticks(range(0, 120, 20))
plt.yticks(np.arange(0, 1.2, 0.2), range(0, 120, 20))

In [None]:
# Show the third row of the subsampled cumulative-share array.
qq[2,:]