Merge pull request #913 from atalyaalon/dev
Add new accidents around schools script and schools table
atalyaalon committed Jul 26, 2018
2 parents aabfa32 + b82a9c2 commit 91db07a
Showing 12 changed files with 3,667 additions and 9 deletions.
54 changes: 54 additions & 0 deletions alembic/versions/5ac16eaf11a_schools_table.py
@@ -0,0 +1,54 @@
"""schools_table
Revision ID: 5ac16eaf11a
Revises: 3680a8998648
Create Date: 2018-07-21 18:40:32.562699
"""

# revision identifiers, used by Alembic.
revision = '5ac16eaf11a'
down_revision = '3680a8998648'
branch_labels = None
depends_on = None

from alembic import op
import sqlalchemy as sa
import geoalchemy2


def upgrade():
    ### commands auto generated by Alembic - please adjust! ###
    op.create_table('schools',
    sa.Column('id', sa.BigInteger(), nullable=False),
    sa.Column('fcode_type', sa.Integer(), nullable=True),
    sa.Column('yishuv_symbol', sa.Integer(), nullable=True),
    sa.Column('yishuv_name', sa.Text(), nullable=True),
    sa.Column('school_name', sa.Text(), nullable=True),
    sa.Column('school_latin_name', sa.Text(), nullable=True),
    sa.Column('usg', sa.Integer(), nullable=True),
    sa.Column('usg_code', sa.Integer(), nullable=True),
    sa.Column('e_ord', sa.Float(), nullable=True),
    sa.Column('n_ord', sa.Float(), nullable=True),
    sa.Column('longitude', sa.Float(), nullable=True),
    sa.Column('latitude', sa.Float(), nullable=True),
    sa.Column('geom', geoalchemy2.types.Geometry(geometry_type='POINT', srid=4326), nullable=True),
    sa.Column('data_year', sa.Integer(), nullable=True),
    sa.Column('prdct_ver', sa.DateTime(), nullable=True),
    sa.Column('x', sa.Float(), nullable=True),
    sa.Column('y', sa.Float(), nullable=True),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_schools_geom'), 'schools', ['geom'], unique=False)
    op.create_index(op.f('ix_schools_id'), 'schools', ['id'], unique=False)
    op.create_index(op.f('ix_schools_yishuv_symbol'), 'schools', ['yishuv_symbol'], unique=False)
    ### end Alembic commands ###


def downgrade():
    ### commands auto generated by Alembic - please adjust! ###
    op.drop_index(op.f('ix_schools_yishuv_symbol'), table_name='schools')
    op.drop_index(op.f('ix_schools_id'), table_name='schools')
    op.drop_index(op.f('ix_schools_geom'), table_name='schools')
    op.drop_table('schools')
    ### end Alembic commands ###
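
The revision can be applied with the usual `alembic upgrade head`. A minimal programmatic sketch using Alembic's command API (assumes the project's alembic.ini is in the working directory):

from alembic import command
from alembic.config import Config

alembic_cfg = Config("alembic.ini")  # assumed configuration path
command.upgrade(alembic_cfg, "5ac16eaf11a")  # or "head"
# command.downgrade(alembic_cfg, "3680a8998648")  # roll back to the previous revision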
129 changes: 129 additions & 0 deletions anyway/accidents_around_schools.py
@@ -0,0 +1,129 @@
import math
import os

import pandas as pd
import sqlalchemy as sa
from sqlalchemy import or_
from flask_sqlalchemy import SQLAlchemy

from utilities import init_flask
from models import AccidentMarker, Involved, School
from constants import CONST

SUBTYPE_ACCIDENT_WITH_PEDESTRIAN = 1
LOCATION_ACCURACY_PRECISE = True
LOCATION_ACCURACY_PRECISE_INT = 1
INJURED_TYPE_PEDESTRIAN = 1
YISHUV_SYMBOL_NOT_EXIST = -1
CONTENT_ENCODING = 'utf-8'
ANYWAY_UI_FORMAT = "https://www.anyway.co.il/?zoom=17&start_date={start_date}&end_date={end_date}&lat={latitude}&lon={longitude}&show_fatal=1&show_severe=1&show_light=1&approx={location_approx}&accurate={location_accurate}&show_markers=1&show_discussions=&show_urban=3&show_intersection=3&show_lane=3&show_day=7&show_holiday=0&show_time=24&start_time=25&end_time=25&weather=0&road=0&separation=0&surface=0&acctype={acc_type}&controlmeasure=0&district=0&case_type=0"
DATE_INPUT_FORMAT = '%d-%m-%Y'
DATE_URL_FORMAT = '%Y-%m-%d'


app = init_flask()
db = SQLAlchemy(app)

def get_bounding_box(latitude, longitude, distance_in_km):
    """Return (lat_min, lon_min, lat_max, lon_max) around a point, distance_in_km in each direction."""
    latitude = math.radians(latitude)
    longitude = math.radians(longitude)

    radius = 6371  # mean Earth radius in km
    # Radius of the parallel at given latitude
    parallel_radius = radius * math.cos(latitude)

    lat_min = latitude - distance_in_km / radius
    lat_max = latitude + distance_in_km / radius
    lon_min = longitude - distance_in_km / parallel_radius
    lon_max = longitude + distance_in_km / parallel_radius
    rad2deg = math.degrees

    return rad2deg(lat_min), rad2deg(lon_min), rad2deg(lat_max), rad2deg(lon_max)

def acc_inv_query(longitude, latitude, distance, start_date, end_date, school):
    lat_min, lon_min, lat_max, lon_max = get_bounding_box(latitude, longitude, distance)
    baseX = lon_min
    baseY = lat_min
    distanceX = lon_max
    distanceY = lat_max
    pol_str = 'POLYGON(({0} {1},{0} {3},{2} {3},{2} {1},{0} {1}))'.format(baseX,
                                                                          baseY,
                                                                          distanceX,
                                                                          distanceY)

    query_obj = db.session.query(Involved, AccidentMarker) \
        .join(AccidentMarker, AccidentMarker.provider_and_id == Involved.provider_and_id) \
        .filter(AccidentMarker.geom.intersects(pol_str)) \
        .filter(Involved.injured_type == INJURED_TYPE_PEDESTRIAN) \
        .filter(AccidentMarker.provider_and_id == Involved.provider_and_id) \
        .filter(or_((AccidentMarker.provider_code == CONST.CBS_ACCIDENT_TYPE_1_CODE),
                    (AccidentMarker.provider_code == CONST.CBS_ACCIDENT_TYPE_3_CODE))) \
        .filter(AccidentMarker.created >= start_date) \
        .filter(AccidentMarker.created < end_date) \
        .filter(AccidentMarker.locationAccuracy == LOCATION_ACCURACY_PRECISE_INT) \
        .filter(AccidentMarker.yishuv_symbol != YISHUV_SYMBOL_NOT_EXIST)

    df = pd.read_sql_query(query_obj.with_labels().statement, query_obj.session.bind)

    if LOCATION_ACCURACY_PRECISE:
        location_accurate = 1
        location_approx = 0
    else:
        location_accurate = 1
        location_approx = 1
    ui_url = ANYWAY_UI_FORMAT.format(latitude=school['latitude'],
                                     longitude=school['longitude'],
                                     start_date=start_date.strftime(DATE_URL_FORMAT),
                                     end_date=end_date.strftime(DATE_URL_FORMAT),
                                     acc_type=SUBTYPE_ACCIDENT_WITH_PEDESTRIAN,
                                     location_accurate=location_accurate,
                                     location_approx=location_approx)
    df['anyway_link'] = ui_url
    df['school_id'] = school['id']
    df['school_name'] = school['school_name']
    df['school_yishuv_symbol'] = school['yishuv_symbol']
    df['school_yishuv_name'] = school['yishuv_name']
    df['school_longitude'] = school['longitude']
    df['school_latitude'] = school['latitude']
    return df


def main(start_date, end_date, distance, output_path):
    schools_query = sa.select([School])
    df_schools = pd.read_sql_query(schools_query, db.session.bind)
    df_total = pd.DataFrame()
    for idx, school in df_schools.iterrows():
        df_total = pd.concat([df_total,
                              acc_inv_query(longitude=school['longitude'],
                                            latitude=school['latitude'],
                                            distance=distance,
                                            start_date=start_date,
                                            end_date=end_date,
                                            school=school)],
                             axis=0)
    df_total.to_csv(os.path.join(output_path, 'df_total.csv'), encoding=CONTENT_ENCODING)

    df_total_involved_count = (df_total.groupby(['school_id', 'school_name', 'anyway_link', 'school_longitude', 'school_latitude', 'school_yishuv_symbol', 'school_yishuv_name'])
                               .size()
                               .reset_index(name='injured_count')
                               .sort_values('injured_count', ascending=False))
    df_total_involved_count.to_csv(os.path.join(output_path, 'df_total_involved_count.csv'), encoding=CONTENT_ENCODING, header=True)

    df_total_involved_by_injury = (df_total.groupby(['school_id', 'school_name', 'anyway_link', 'school_longitude', 'school_latitude', 'school_yishuv_symbol', 'school_yishuv_name', 'involved_injury_severity'])
                                   .size()
                                   .reset_index(name='injured_count')
                                   .sort_values('injured_count', ascending=False))
    df_total_involved_by_injury.to_csv(os.path.join(output_path, 'df_total_involved_by_injury.csv'), encoding=CONTENT_ENCODING, header=True)

    df_total_accident_count = (df_total.drop_duplicates(['school_id', 'school_name', 'anyway_link', 'school_longitude', 'school_latitude', 'school_yishuv_symbol', 'school_yishuv_name', 'provider_and_id'])
                               .groupby(['school_id', 'school_name', 'school_yishuv_symbol', 'school_yishuv_name', 'markers_severity'])
                               .size()
                               .reset_index(name='accidents_count')
                               .sort_values('accidents_count', ascending=False))
    df_total_accident_count.to_csv(os.path.join(output_path, 'df_total_accident_count.csv'), encoding=CONTENT_ENCODING, header=True)
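
The polygon passed to AccidentMarker.geom.intersects is a flat bounding box: a distance of d km corresponds to roughly d/R radians of latitude and d/(R*cos(latitude)) radians of longitude, with R the Earth radius. A standalone sketch of the same approximation, kept separate so it does not need the Flask app or database (coordinates are illustrative):

import math

def bounding_box(latitude, longitude, distance_in_km, radius_km=6371.0):
    # Same approximation as get_bounding_box above: offsets in radians, converted back to degrees.
    lat = math.radians(latitude)
    lon = math.radians(longitude)
    d_lat = distance_in_km / radius_km
    d_lon = distance_in_km / (radius_km * math.cos(lat))
    return (math.degrees(lat - d_lat), math.degrees(lon - d_lon),
            math.degrees(lat + d_lat), math.degrees(lon + d_lon))

print(bounding_box(32.08, 34.78, 0.5))  # ~0.5 km box around an illustrative point near Tel Aviv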
21 changes: 21 additions & 0 deletions anyway/models.py
@@ -878,3 +878,24 @@ class VehicleNoLocation(Base):
Index('accident_id_idx_vehicles_no_location', 'accident_id'),
Index('provider_and_id_idx_vehicles_no_location', 'provider_and_id', unique=False),
{})


class School(Base):
    __tablename__ = "schools"
    id = Column(BigInteger(), primary_key=True, index=True)
    fcode_type = Column(Integer(), nullable=True)
    yishuv_symbol = Column(Integer(), nullable=True, index=True)
    yishuv_name = Column(Text(), nullable=True)
    school_name = Column(Text(), nullable=True)
    school_latin_name = Column(Text(), nullable=True)
    usg = Column(Integer(), nullable=True)
    usg_code = Column(Integer(), nullable=True)
    e_ord = Column(Float(), nullable=True)
    n_ord = Column(Float(), nullable=True)
    longitude = Column(Float(), nullable=True)
    latitude = Column(Float(), nullable=True)
    geom = Column(Geometry('POINT', srid=4326), index=True)
    data_year = Column(Integer(), nullable=True)
    prdct_ver = Column(DateTime, default=None)
    x = Column(Float(), nullable=True)
    y = Column(Float(), nullable=True)
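
Once the schools table is populated, the new model can be queried like any other mapped class; a short sketch (the yishuv_symbol value 5000 is illustrative):

from flask_sqlalchemy import SQLAlchemy
from anyway.utilities import init_flask
from anyway.models import School

app = init_flask()
db = SQLAlchemy(app)

# Schools in a single settlement, with their WGS84 coordinates.
for school in db.session.query(School).filter(School.yishuv_symbol == 5000):
    print(school.id, school.school_name, school.latitude, school.longitude)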
9 changes: 1 addition & 8 deletions anyway/parsers/cbs.py
@@ -15,7 +15,7 @@
from ..models import AccidentMarker, Involved, Vehicle, AccidentsNoLocation, InvolvedNoLocation, VehicleNoLocation
from .. import models
from ..constants import CONST
from ..utilities import ItmToWGS84, init_flask, CsvReader, time_delta, decode_hebrew,ImporterUI,truncate_tables
from ..utilities import ItmToWGS84, init_flask, CsvReader, time_delta, decode_hebrew,ImporterUI,truncate_tables,chunks
from functools import partial
import logging

@@ -384,13 +384,6 @@ def get_files(directory):
        elif name in (ACCIDENTS, INVOLVED, VEHICLES):
            yield name, csv

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    try: xrange
    except NameError:
        xrange = range
    for i in xrange(0, len(l), n):
        yield l[i:i + n]


def import_to_datastore(directory, provider_code, batch_size):
73 changes: 73 additions & 0 deletions anyway/parsers/schools.py
@@ -0,0 +1,73 @@
import logging
from datetime import datetime

import pandas as pd
from flask_sqlalchemy import SQLAlchemy

from .. import school_fields
from ..models import School
from ..utilities import init_flask, time_delta, chunks

app = init_flask()
db = SQLAlchemy(app)

def get_data_value(value):
    """
    :returns: value for parameters which are not mandatory in an accident data
    OR -1 if the parameter value does not exist
    """
    return int(value) if value else -1


def get_schools(filepath):
    logging.info("\tReading schools data from '%s'..." % filepath)
    schools = []
    df = pd.read_csv(filepath)
    for idx, row in df.iterrows():
        longitude, latitude = float(row[school_fields.longitude]), float(row[school_fields.latitude])
        point_str = 'SRID=4326;POINT({0} {1})'.format(longitude, latitude)
        school = {
            "id": int(row[school_fields.id]),
            "fcode_type": int(row[school_fields.fcode_type]),
            "yishuv_symbol": int(row[school_fields.yishuv_symbol]),
            "yishuv_name": row[school_fields.yishuv_name],
            "school_name": row[school_fields.school_name],
            "school_latin_name": row[school_fields.school_latin_name],
            "usg": int(row[school_fields.usg]),
            "usg_code": int(row[school_fields.usg_code]),
            "e_ord": float(row[school_fields.e_ord]),
            "n_ord": float(row[school_fields.n_ord]),
            "longitude": longitude,
            "latitude": latitude,
            "geom": point_str,
            "data_year": get_data_value(row[school_fields.data_year]),
            "prdct_ver": None,
            "x": float(row[school_fields.x]),
            "y": float(row[school_fields.y]),
        }
        schools.append(school)

    return schools


def import_to_datastore(filepath, batch_size):
    try:
        assert batch_size > 0
        started = datetime.now()
        schools = get_schools(filepath)
        new_items = 0
        all_existing_schools_ids = set(map(lambda x: x[0],
                                           db.session.query(School.id).all()))
        schools = [school for school in schools if school['id'] not in all_existing_schools_ids]
        logging.info('inserting ' + str(len(schools)) + ' new schools')
        for schools_chunk in chunks(schools, batch_size):
            db.session.bulk_insert_mappings(School, schools_chunk)
            db.session.commit()
        new_items += len(schools)
        logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        return new_items
    except ValueError as e:
        logging.error("Failed to import schools from '{0}': {1}".format(filepath, e))
        return 0


def parse(filepath, batch_size):
    started = datetime.now()
    total = import_to_datastore(filepath, batch_size)
    logging.info("Total: {0} schools in {1}".format(total, time_delta(started)))
16 changes: 16 additions & 0 deletions anyway/school_fields.py
@@ -0,0 +1,16 @@
id = "UNIQ_ID"
fcode_type = "FCODE_TYPE"
yishuv_symbol = "SETL_CODE"
yishuv_name = "SETL_NAME"
school_name = "NAME"
school_latin_name = "LATIN_NAME"
usg = "USG_GROUP"
usg_code = "USG_CODE"
e_ord = "E_ORD"
n_ord = "N_ORD"
longitude = "LON"
latitude = "LAT"
data_year = "DATA_YEAR"
prdct_ver = "PRDCT_VER"
x = "X"
y = "Y"
File renamed without changes.
File renamed without changes.
File renamed without changes.
9 changes: 9 additions & 0 deletions anyway/utilities.py
@@ -163,3 +163,12 @@ def is_delete_all(self):
if confirm_delete_all.lower() == 'n':
self._delete_all = False
return self._delete_all


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    try: xrange
    except NameError:
        xrange = range
    for i in xrange(0, len(l), n):
        yield l[i:i + n]
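
chunks() is now shared by the CBS and schools importers; its behaviour in one line (values illustrative):

from anyway.utilities import chunks

print(list(chunks(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]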
35 changes: 34 additions & 1 deletion main.py
@@ -80,9 +80,16 @@ def united(light, username, password, lastmail):
@click.argument("filename")
def rsa(filename):
from anyway.parsers.rsa import parse

return parse(filename)

@process.command()
@click.argument("filepath")
@click.option('--batch_size', type=int, default=5000)
def schools(filepath, batch_size):
from anyway.parsers.schools import parse
return parse(filepath=filepath,
batch_size=batch_size)


@cli.command()
@click.argument('identifiers', nargs=-1)
@@ -117,6 +124,32 @@ def load_discussions(identifiers):
db.session.rollback()
logging.warn("Failed: " + identifier + ": " + e)

@cli.group()
def scripts():
    pass

def valid_date(date_string):
    DATE_INPUT_FORMAT = '%d-%m-%Y'
    from datetime import datetime
    try:
        return datetime.strptime(date_string, DATE_INPUT_FORMAT)
    except ValueError:
        msg = "Not a valid date: '{0}'.".format(date_string)
        raise click.BadParameter(msg)


@scripts.command()
@click.option('--start_date', default='01-01-2013', type=valid_date, help='The start date - format DD-MM-YYYY')
@click.option('--end_date', default='31-12-2017', type=valid_date, help='The end date - format DD-MM-YYYY')
@click.option('--distance', default=0.5, type=float, help='Distance in km around each school. Default is 0.5 (500 m)')
@click.option('--output_path', default='output', help='Directory for the output CSV files. Default is "output"')
def accidents_around_schools(start_date, end_date, distance, output_path):
    from anyway.accidents_around_schools import main
    return main(start_date=start_date,
                end_date=end_date,
                distance=distance,
                output_path=output_path)


if __name__ == '__main__':
    cli(sys.argv[1:])  # pylint: disable=too-many-function-args
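
Putting the two new commands together, a sketch of how they might be driven from Python via click's test runner (the CSV path and dates are illustrative, and the existing process group in main.py is assumed):

from click.testing import CliRunner
from main import cli

runner = CliRunner()
# Load the schools table from a CSV (path is illustrative).
runner.invoke(cli, ['process', 'schools', 'static/data/schools/schools.csv', '--batch_size', '5000'])
# Write the accidents-around-schools CSVs for 2013-2017, 500 m around each school.
runner.invoke(cli, ['scripts', 'accidents_around_schools',
                    '--start_date', '01-01-2013', '--end_date', '31-12-2017',
                    '--distance', '0.5', '--output_path', 'output'])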