# VAST Mini Challenge 03

This notebook performs an exploratory analysis of the data from the _VAST Mini Challenge 03_ and uses data visualization to answer the three questions posed by the challenge.

In [2]:
import pandas as pd
import spacy as sp
from datetime import datetime
from nltk.corpus import stopwords
import plotly.express as px
import altair as alt

## Preprocessing
Preprocessing removes punctuation and stop words and lemmatizes the remaining tokens. After that, calls ('ccdata' type) and posts ('mbdata' type) are separated into different files.

In [3]:
# Load the spaCy pipeline that the preprocessing step uses for lemmatization.
#
# The model has to be installed before it can be loaded. Install one of them
# (or both, for good measure):
#  - Bigger, slower, but more accurate: python -m spacy download en_core_web_trf
#  - Smaller, faster, but less accurate: python -m spacy download en_core_web_sm
SPACY_MODEL = "en_core_web_trf"
nlp = sp.load(SPACY_MODEL)

In [6]:
# Loading and preprocessing the data sets.
#
# Time periods (one CSV file per period):
#  - Period one: 1700-1830
#  - Period two: 1831-2000
#  - Period three: 2001-2131
#
# Every message is lemmatized, and stop words and punctuation are dropped.
# The three processed frames are then concatenated and written to a single CSV.

periods = ("1700-1830", "1831-2000", "2001-2131")
data_frames = [pd.read_csv(f"original_csv/csv-{period}.csv") for period in periods]
stop_words = stopwords.words("english")

# NLTK's English stop words are all lower-case, so tokens must be lower-cased
# before the membership test; a set makes that test O(1) instead of a list scan.
stop_word_set = set(stop_words)

for index, data_frame in enumerate(data_frames):
    print(f"[{index + 1}/{len(data_frames)}] Processing data-frames...")
    data_frame_length = data_frame.shape[0]
    cleaned_messages = []

    # nlp.pipe streams the messages through the pipeline in batches, which is
    # considerably cheaper than one nlp() call per row.
    for row, document in enumerate(nlp.pipe(data_frame["message"])):
        print(f" |_ [{row + 1}/{data_frame_length}] Processing row...", end="\r")

        tokens = [
            token.lemma_ for token in document
            if token.text.lower() not in stop_word_set  # Remove stop words (case-insensitive)
            and not token.is_punct                      # Remove punctuation
        ]
        cleaned_messages.append(" ".join(tokens))

    # Replace the whole column at once instead of writing cell by cell; this
    # also does not rely on the frame having a 0..n-1 integer index.
    data_frame["message"] = cleaned_messages

    print(f" |_ [{data_frame_length}/{data_frame_length}] Processing completed.")

combined_csv = pd.concat(data_frames)
combined_csv.to_csv("processed/combined_csv.csv", index=False)

[1/3] Processing data-frames...
 |_ [1033/1033] Processing completed.
[2/3] Processing data-frames...
 |_ [1815/1815] Processing completed.
[3/3] Processing data-frames...
 |_ [1215/1215] Processing completed.


In [57]:
# Split the preprocessed data into posts ('mbdata') and calls/reports ('ccdata'),
# writing each subset to its own CSV file.
RAW_DATE_COLUMN = 'date(yyyyMMddHHmmss)'


def _to_readable_timestamp(raw_date):
    """Turn a yyyyMMddHHmmss value into a 'YYYY-MM-DD HH:MM:SS' string."""
    parsed = datetime.strptime(str(raw_date), '%Y%m%d%H%M%S')
    return parsed.strftime("%Y-%m-%d %H:%M:%S")


combined_csv = pd.read_csv("processed/combined_csv.csv")

# Replace the raw date column with a human-readable 'timestamp' column
combined_csv['timestamp'] = combined_csv[RAW_DATE_COLUMN].map(_to_readable_timestamp)
combined_csv = combined_csv.drop(columns=[RAW_DATE_COLUMN])

# Posts keep every remaining column
is_post = combined_csv['type'] == 'mbdata'
posts = combined_csv[is_post]
posts.to_csv("processed/posts.csv", index=False)

# Reports lose the author/coordinate columns, and the location header is
# renamed to strip its leading space
is_report = combined_csv['type'] == 'ccdata'
reports = combined_csv[is_report]
reports = reports.drop(columns=['author', 'longitude', 'latitude'])
reports = reports.rename(columns={' location' : 'location'})
reports.to_csv("processed/reports.csv", index=False)

In [174]:
# Line chart of how many reports fall into each 5-minute window
timestamps = pd.DatetimeIndex(reports['timestamp']).to_frame()
reports_per_window = timestamps.resample("5min").count()

fig = px.line(
    reports_per_window,
    y='timestamp',
    title='Reports over the evening',
)
fig.show()

In [173]:
# Line chart of how many posts fall into each 5-minute window
timestamps = pd.DatetimeIndex(posts['timestamp']).to_frame()
posts_per_window = timestamps.resample("5min").count()

fig = px.line(
    posts_per_window,
    y='timestamp',
    title='Posts over the evening',
)
fig.show()

In [165]:
# Count how many reports were made from each location (most frequent first)
reports['location'].value_counts()

N. Alexandrias St / N. Ithakis St    7
N. Achilleos St / N. Madeg St        7
1422 N. Souliou St                   3
547 N. Schaber Ave                   3
Vissaraki St / Rist Way              2
                                    ..
N. Estos St / N. Barabla St          1
5097 N. Asiant St                    1
436 Zefirou Ave                      1
5367 N. Averof St                    1
4754 N. Valmai St                    1
Name: location, Length: 152, dtype: int64

In [177]:
# Cross-tabulate the reports: one row per timestamp, one column per location
pd.crosstab(reports['timestamp'], reports['location'])

# Split into intervals to check which locations called the most during the night

location,N. Polvo St / Egeou Ave,101 Vatopediou Rd,1066 N. Lumen St,1122 N. Maskin St,1218 Ipsilantou Ave,1422 N. Souliou St,1439 N. Utanfor St,1492 N. Lumen St,1615 N. Omirou St,1919 N. Pilau St,...,Niovis St / N. Aveny St,Profitou Ilia St / Egeou Ave,Rist Way / Exadakitiou Way,Rist Way / N. Desafio St,Rist Way / Niovis St,S. Achilleos St / S. Utanfor St,S. Limnou St / S. Eleftherias St,S. Maskin St / S. Eleftherias St,S. Mikonou St / S. Achilleos St,Vissaraki St / Rist Way
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-23 17:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-23 17:02:10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-23 17:05:13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-23 17:11:02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-23 17:11:53,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-01-23 21:21:11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-23 21:22:00,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-23 21:25:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2014-01-23 21:27:12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
