In [1]:
import pandas as pd
import numpy as np

# Import from the public IPython.display module: importing these names from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject a CSS rule that sets the notebook's ".container" element to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Format data to get a uniform style

# Data from https://everytownresearch.org/school-shootings/

In [2]:
# Raw Everytown export: semicolon-separated, no header row, dates in the
# second column (position 1).
everytown_raw_columns = [
    "index", "date", "date_repeaded", "state", "city", "school", "type_school",
    "gun_fired", "injured", "death", "attempted_suicide", "suicide", "number_victims",
]
# Columns shared by every formatted table written by this notebook.
everytown_kept_columns = [
    "date", "state", "school", "type_school", "death", "suicide", "number_victims",
]

everytown = pd.read_csv(
    "./data/everytown.csv",
    sep=";",
    header=None,
    parse_dates=[1],
)
everytown.columns = everytown_raw_columns
everytown = everytown[everytown_kept_columns]

# Tab-separated output without the row index.
everytown.to_csv("./data/everytown_formatted.tsv", sep="\t", index=False)
everytown.head(2)


Unnamed: 0,date,state,school,type_school,death,suicide,number_victims
0,2013-01-08,FL,Apostolic Revival Center Christian School,K-12,1,0,1
1,2013-01-10,CA,Taft Union High School,K-12,0,0,0


In [3]:
# Updated Everytown export: semicolon-separated with a header row; the "Date"
# column is parsed as datetimes.
everytown_u = pd.read_csv("./data/everytown_updated.csv", sep=";", header=0, parse_dates=["Date"])

everytown_u.columns = ["date", "city", "school", "state", "type_school", "category", "attack", "lat", "lon"]

# None of the renamed columns carries victim information, so the three victim
# columns of the uniform layout are added as all-NaN placeholders.
# np.nan is used because the np.NaN alias no longer exists in NumPy >= 2.0.
for placeholder_column in ("death", "suicide", "number_victims"):
    everytown_u[placeholder_column] = np.nan

# Uniform column layout plus the coordinates.
everytown_u = everytown_u.loc[
    :,
    ["date", "state", "school", "type_school", "death", "suicide", "number_victims", "lat", "lon"],
]
everytown_u.to_csv("./data/everytown_u_formatted.tsv", index=False, sep="\t")
everytown_u.head(2)


Unnamed: 0,date,state,school,type_school,death,suicide,number_victims,lat,lon
0,2018-02-14,FL,Marjory Stoneman Douglas High School,K-12,,,,26.304503,-80.269457
1,2018-02-08,NY,The Metropolitan High School,K-12,,,,40.827733,-73.897018


In [13]:
# For self-excitation: export "lat lon number_victims date" rows, ordered by date.
everytown_u_se = everytown_u[["lat", "lon", "number_victims", "date"]]
everytown_u_se = everytown_u_se.sort_values(by="date")

# number_victims is all-NaN in everytown_u, so every event is given the value 1.
everytown_u_se["number_victims"] = 1

# Replace each date by the number of days elapsed since the earliest event:
# day gaps between consecutive events, with the first gap (NaN) set to 0,
# accumulated into a running total.
# (fillna replaces the former np.NaN-based call; that alias is gone in NumPy >= 2.0.)
everytown_u_se["date"] = everytown_u_se["date"].diff().dt.days.fillna(0).cumsum()

# Space-separated, no header row and no index column.
everytown_u_se.to_csv("./data/everytown_u_self_excitation.tsv", sep=" ", index=False, header=False)
everytown_u_se.head(2)

Unnamed: 0,lat,lon,number_victims,date
289,26.640191,-81.834505,1,0.0
288,35.146303,-119.460509,1,2.0


# Data from Jim&Alyssa: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5314897/

In [18]:
# Maps the raw school-type labels onto the two categories used in the
# formatted tables ("K-12" and "College").
# The missing (NaN) school type is "stewart country schools" (a K-12 school),
# hence NaN -> "K-12".
# np.nan is used as the key because the np.NaN alias no longer exists in NumPy >= 2.0.
d_type2school = {
    "High": "K-12",
    "Middle": "K-12",
    "college": "College",
    "Elementary": "K-12",
    np.nan: "K-12",
}

In [19]:
# Raw table: semicolon-separated, no header row, dates in the first column.
alyssa = pd.read_csv("./data/ShootMiami2.csv", sep=";", header=None, parse_dates=[0])
alyssa.columns = ["date","_1","_2","_3","year","type_shooing","number_victims","school","type_school","city","state","urbanity",
                 "relationship_with_attacker","gender","age","suicide","motives","population_school","lat","lon","_4","_5","_6","_7","_8"]

# The raw table has no "death" column, so it is derived *before* the column
# selection below: selecting a missing label with .loc raises KeyError on
# current pandas. death is a 0/1 flag, 1 when number_victims > 0.
alyssa["death"] = (alyssa["number_victims"]>0).astype(int)

# Collapse the raw school types into "K-12" / "College" (NaN included).
alyssa["type_school"] = alyssa["type_school"].replace(d_type2school)

# Uniform column layout; "suicide" is kept exactly as read (no integer cast).
alyssa = alyssa.loc[:,["date","state","school","type_school","death","suicide","number_victims"]]
alyssa.to_csv("./data/alyssa_formatted.tsv",index=False,sep="\t")
alyssa.head(2)

Unnamed: 0,date,state,school,type_school,death,suicide,number_victims
0,1990-08-27,NV,Eldorado High,K-12,1,0.0,1
1,1991-10-09,NY,James Monroe,K-12,1,0.0,1


In [20]:
# Updated table: semicolon-separated, no header row, dates in the first column.
alyssa_u = pd.read_csv("./data/ShootMiami2_updated.csv",sep=";",header=None,parse_dates=[0])
alyssa_u.columns = ["date","_1","_2","_3","year","type_shooing","number_victims","school","type_school","city","state","urbanity",
                 "relationship_with_attacker","gender","age","suicide","lat","lon"]

# The raw table has no "death" column, so it is derived *before* the column
# selection below: selecting a missing label with .loc raises KeyError on
# current pandas. death is a 0/1 flag, 1 when number_victims > 0.
alyssa_u["death"] = (alyssa_u["number_victims"]>0).astype(int)

# Collapse the raw school types into "K-12" / "College" (NaN included).
alyssa_u["type_school"] = alyssa_u["type_school"].replace(d_type2school)

# Uniform column layout plus coordinates; "suicide" is kept exactly as read
# (no integer cast).
alyssa_u = alyssa_u.loc[:,["date","state","school","type_school","death","suicide","number_victims","lat","lon"]]
alyssa_u.to_csv("./data/alyssa_u_formatted.tsv",index=False,sep="\t")
alyssa_u.head(2)

Unnamed: 0,date,state,school,type_school,death,suicide,number_victims,lat,lon
0,1990-08-27,NV,Eldorado High,K-12,1,0.0,1,38.732681,-120.81188
1,1991-10-09,NY,James Monroe,K-12,1,0.0,1,43.146905,-77.598189


In [22]:
# For self-excitation: export "lat lon number_victims date" rows, ordered by date.
alyssa_u_se = alyssa_u[["lat", "lon", "number_victims", "date"]]
alyssa_u_se = alyssa_u_se.sort_values(by="date")

# Replace each date by the number of days elapsed since the earliest event:
# day gaps between consecutive events, with the first gap (NaN) set to 0,
# accumulated into a running total.
# (fillna replaces the former np.NaN-based call; that alias is gone in NumPy >= 2.0.)
alyssa_u_se["date"] = alyssa_u_se["date"].diff().dt.days.fillna(0).cumsum()

# Space-separated, no header row and no index column.
alyssa_u_se.to_csv("./data/alyssa_u_self_excitation.tsv", sep=" ", index=False, header=False)
alyssa_u_se.tail(2)

Unnamed: 0,lat,lon,number_victims,date
256,36.857377,-88.401604,2,10011.0
257,26.310777,-80.253225,17,10033.0


# Data from USA Today on mass shootings (http://www.gannett-cdn.com/GDContent/mass-killings/index.html)

In [24]:
# Raw USA Today table: tab-separated, no header row, dates in the first column.
usatoday = pd.read_csv("./data/USAToday.csv", sep="\t", header=None, parse_dates=[0])
usatoday.columns = ["date", "city", "state", "type_attack", "motives", "number_victims"]

# Keep only rows whose attack type contains "hooting" (the leading letter is
# left out of the pattern, so both "Shooting" and "shooting" match).
# na=False treats rows with a missing attack type as non-matching; without it
# the mask would contain NaN and .loc would refuse it.
is_shooting = usatoday["type_attack"].str.contains("hooting", na=False)
usatoday = usatoday.loc[is_shooting]

usatoday.to_csv("./data/usatoday_formatted.tsv", index=False, sep="\t")
usatoday.head(2)

Unnamed: 0,date,city,state,type_attack,motives,number_victims
1,2006-01-30,Goleta,Calif.,Shooting,Public Killing,7
2,2006-02-21,Mesa,Ariz.,Shooting,Other,5


In [30]:
# NOTE(review): this file is not written by any cell visible here; presumably
# it is the formatted USA Today table with coordinates added elsewhere -
# TODO confirm where it comes from.
usatoday_u_columns = [
    "date", "city", "state", "type_attack", "motives", "number_victims", "lat", "lon",
]

# Tab-separated with a header row; the "date" column is parsed as datetimes.
usatoday_u = pd.read_csv(
    "./data/usatoday_u_formatted.tsv",
    sep="\t",
    header=0,
    parse_dates=["date"],
)
usatoday_u.columns = usatoday_u_columns
usatoday_u.head(2)

Unnamed: 0,date,city,state,type_attack,motives,number_victims,lat,lon
0,2006-01-30,Goleta,Calif.,Shooting,Public Killing,7,34.435829,-119.827639
1,2006-02-21,Mesa,Ariz.,Shooting,Other,5,33.415184,-111.831472


In [31]:
# For self-excitation: export "lat lon number_victims date" rows, ordered by date.
usatoday_se = usatoday_u[["lat", "lon", "number_victims", "date"]]
usatoday_se = usatoday_se.sort_values(by="date")

# Replace each date by the number of days elapsed since the earliest event:
# day gaps between consecutive events, with the first gap (NaN) set to 0,
# accumulated into a running total.
# (fillna replaces the former np.NaN-based call; that alias is gone in NumPy >= 2.0.)
usatoday_se["date"] = usatoday_se["date"].diff().dt.days.fillna(0).cumsum()

# Space-separated, no header row and no index column.
usatoday_se.to_csv("./data/usatoday_self_excitation.tsv", sep=" ", index=False, header=False)
usatoday_se.tail(2)

Unnamed: 0,lat,lon,number_victims,date
211,33.322662,-80.413704,4,3453.0
212,35.04563,-85.30968,5,3454.0


# Format tweet count

In [1]:
# Print one raw per-day count file as-is; format_tweets() turns each such file into one TSV row.
!cat "./data/datav2/01.01.10.tgzSS.txt"

3099166
28
10
10
1813


In [41]:

def format_tweets(data_dir="./data/datav2/", out_path="./data/all_tweets_formatted.tsv"):
    """Merge the per-day count files found in ``data_dir`` into one TSV file.

    Each input file holds one number per line. The numbers are written, in
    file order and as floats, after a date derived from the file name.
    Two file-name layouts are recognised:

    * names containing ``.tgz`` (e.g. ``01.01.10.tgzSS.txt``): the last 10
      characters are dropped and the remaining stem is interpreted as
      ``DD.MM.YY`` and rewritten as ``20YY-MM-DD``;
    * names containing ``.gz``: the last 15 characters are dropped and the
      stem is used as the date unchanged. It has to carry the year in
      characters 0-3, the month in characters 5-6 and the day in its last
      two characters.

    Any other file name is printed and skipped.

    NOTE(review): the column labels are attached purely by line position. In
    the sample file shown in the notebook the first value is by far the
    largest, yet it lands under ``shooting`` while ``all`` receives a small
    one - confirm that the header order matches the files.

    Parameters
    ----------
    data_dir : str
        Directory holding the per-day count files.
    out_path : str
        Destination TSV; overwritten on every call.

    Raises
    ------
    ValueError
        If a derived date is not a valid calendar date or a non-blank line is
        not a number.
    IndexError
        If a file provides fewer than five values.
    """
    import os
    from datetime import datetime

    # The output is opened once and rewritten from scratch.
    with open(out_path, "w") as fout:
        fout.write("date\tshooting\tschool_shooting\tall\tmass_shooting\tmass_murder\n")

        # os.listdir() returns names in arbitrary order; sorting makes the
        # row order reproducible between runs.
        for fileT in sorted(os.listdir(data_dir)):
            if ".tgz" in fileT:
                stem = fileT[:-10]
                name = "20" + stem[-2:] + "-" + stem[3:5] + "-" + stem[0:2]
            elif ".gz" in fileT:
                name = fileT[:-15]
            else:
                print(fileT)
                continue

            # Built only to validate the date: raises ValueError when the name
            # does not describe a real calendar day.
            datetime(year=int(name[0:4]), month=int(name[5:7]), day=int(name[-2:]))

            with open(os.path.join(data_dir, fileT)) as f:
                # Blank lines (e.g. a trailing newline) are ignored rather than
                # crashing float().
                values = [float(line) for line in f if line.strip()]

            fout.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(name, *values))
    


In [43]:
# Writes ./data/all_tweets_formatted.tsv; files matching neither naming pattern are printed and skipped.
format_tweets()

tempFiles.txt
