##### Importing required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import datetime
import altair as alt

##### Reading in the data with only the required columns

In [2]:

URL = 'https://raw.githubusercontent.com/UIUC-iSchool-DataViz/is445_bcubcg_fall2022/main/data/ufo-scrubbed-geocoded-time-standardized-00.csv'
ufo_sightings = ufos = pd.read_csv(URL, names = ["date", "city", "state", "country",
                                                                   "shape", "duration_seconds", "duration",
                                                                   "comment", "report_date", 
                                                                   "latitude", "longitude"],
                                   usecols = ["date", "state", "country", "duration_seconds"],
                                   parse_dates = ["date"])
ufo_sightings.head(3)

Unnamed: 0,date,state,country,duration_seconds
0,1949-10-10 20:30:00,tx,us,2700.0
1,1949-10-10 21:00:00,tx,,7200.0
2,1955-10-10 17:00:00,,gb,20.0


In [3]:
ufo_sightings.country.unique()

array(['us', nan, 'gb', 'ca', 'au', 'de'], dtype=object)

##### Subsetting UFO sigtings data for United States

In [4]:
ufo_usa_sightings = ufo_sightings[ufo_sightings.country == 'us']
ufo_usa_sightings.shape

(65114, 4)

### Quality check and Cleanup 
    of the subsetted data

In [5]:
ufo_usa_sightings.state.unique()

array(['tx', 'hi', 'tn', 'ct', 'al', 'fl', 'ca', 'nc', 'ny', 'ky', 'mi',
       'ma', 'ks', 'sc', 'wa', 'co', 'nh', 'wi', 'me', 'ga', 'pa', 'il',
       'ar', 'mo', 'oh', 'in', 'az', 'mn', 'nv', 'ne', 'or', 'ia', 'va',
       'id', 'nm', 'nj', 'wv', 'ok', 'ri', 'vt', 'la', 'pr', 'ak', 'ms',
       'ut', 'md', 'mt', 'wy', 'sd', 'de', 'nd', 'dc'], dtype=object)

In [6]:
ufo_usa_sightings.loc[:,'state_code'] = ufo_usa_sightings.loc[:,'state'].str.upper()
ufo_usa_sightings.drop(columns = 'state', inplace = True)
ufo_usa_sightings.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,date,country,duration_seconds,state_code
0,1949-10-10 20:30:00,us,2700.0,TX
3,1956-10-10 21:00:00,us,20.0,TX
4,1960-10-10 20:00:00,us,900.0,HI


#####  Removing outlier data based on ufo sightings duration in seconds

In [7]:
 ufo_usa_sightings.duration_seconds.quantile(0.98)/3600

2.0

In [8]:
ufo_usa_sightings = ufo_usa_sightings[ufo_usa_sightings.duration_seconds/3600 <=2]
ufo_usa_sightings = ufo_usa_sightings[ufo_usa_sightings.date.dt.year >= 1950]

In [9]:
ufo_usa_sightings['decade'] = (ufo_usa_sightings['date'].dt.year // 10)*10
ufo_usa_sightings['decade'] = ufo_usa_sightings['decade'].apply(str) + 's'
ufo_usa_sightings.head(3)

Unnamed: 0,date,country,duration_seconds,state_code,decade
3,1956-10-10 21:00:00,us,20.0,TX,1950s
4,1960-10-10 20:00:00,us,900.0,HI,1960s
5,1961-10-10 19:00:00,us,300.0,TN,1960s


##### Summary of the duration the UFO was sighted for

In [10]:
ufo_usa_sightings['duration_seconds'].describe()

count    63819.000000
mean       602.257520
std       1162.713139
min          0.010000
25%         30.000000
50%        180.000000
75%        600.000000
max       7200.000000
Name: duration_seconds, dtype: float64

*There has been around 64,000 UFO sightings since the first one. On an avergae, a UFO sighting lasts for about 10 minutes*

### Data Prep for the Altair Plot

In [11]:
ufo_usa_sightings['date_new'] = ufo_usa_sightings['date'].apply(lambda x: datetime.datetime(year = x.year, 
                                                                                            month = x.month,
                                                                                            day = 1))
ufo_usa_sightings.head(3)

Unnamed: 0,date,country,duration_seconds,state_code,decade,date_new
3,1956-10-10 21:00:00,us,20.0,TX,1950s,1956-10-01
4,1960-10-10 20:00:00,us,900.0,HI,1960s,1960-10-01
5,1961-10-10 19:00:00,us,300.0,TN,1960s,1961-10-01


In [12]:
ufo_usa_sightings_count = ufo_usa_sightings.groupby(['decade','date_new'], as_index = False)['date'].count()
ufo_usa_sightings_count.rename(columns={'date':'n_sightings'}, inplace=True)

In [13]:
ufo_usa_sightings_count

Unnamed: 0,decade,date_new,n_sightings
0,1950s,1950-01-01,1
1,1950s,1950-06-01,12
2,1950s,1950-07-01,4
3,1950s,1950-08-01,2
4,1950s,1950-10-01,1
...,...,...,...
718,2010s,2014-01-01,562
719,2010s,2014-02-01,425
720,2010s,2014-03-01,372
721,2010s,2014-04-01,476


### Altair Plot

In [14]:
click = alt.selection_multi(encodings=['color'])

bar = alt.Chart(ufo_usa_sightings_count).mark_bar().encode(
    alt.X("decade", axis=alt.Axis(title='Decade')),
    alt.Y("sum(n_sightings):Q", axis=alt.Axis(title='# Sightings')),
    color=alt.condition(click, 'decade', alt.value('lightgray'))
).properties(
    width=300
).add_selection(
    click
)



lines = alt.Chart(ufo_usa_sightings_count).mark_line().encode(
    alt.X("date_new:T",axis=alt.Axis(title='Date')),
    alt.Y("sum(n_sightings):Q", axis=alt.Axis(title='# Sightings')),
    alt.Color("count()")
).properties(
    width=300
).transform_filter(
    click
)



In [15]:
final_vis = bar.properties(height = 250, width=300) | lines.properties(height = 250, width=300)

In [16]:
final_vis

##### Saving the visualization as a json specifications file.

In [17]:
final_vis.save('ufo_vis.json')