Ideas from Kaggle site:
- What areas of the country are most likely to have UFO sightings?
- Are there any trends in UFO sightings over time? Do they tend to be clustered or seasonal?
- Do clusters of UFO sightings correlate with landmarks, such as airports or government research centers?
- What are the most common UFO descriptions?

New idea:
- Add weather, population... about the sight?
- Military base, airport near the sight?
- 

Some new data links:
- https://www.kaggle.com/sogun3/uspollution

In [2]:
%matplotlib inline

import warnings
import pandas as pd
import numpy as np
import seaborn as sns

### Reading data
- location not found or blank (0.8146%) 
- erroneous or blank time (8.0237%)

In [4]:
# There are some rows with an extra comma that gave reading error
# For now we skip them ~ 300 rows
# TODO: parse warning text and fix them
df = pd.read_csv("../data/complete.csv", error_bad_lines=False, warn_bad_lines=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


#### Not useful columns

In [5]:
# date posted seem not useful
df.drop(columns=["date posted"], inplace=True)

# this column is the same as duration (seconds)
df.drop(columns=["duration (hours/min)"], inplace=True)

# Save comment to seperate variable for NLP
comments = df.loc[df.comments.notna(), "comments"]
df.drop(columns=["comments"], inplace=True)

#### Casting column types

In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88679 entries, 0 to 88678
Data columns (total 8 columns):
datetime              88679 non-null object
city                  88679 non-null object
state                 81270 non-null object
country               76314 non-null object
shape                 85757 non-null object
duration (seconds)    88677 non-null object
latitude              88679 non-null object
longitude             88679 non-null float64
dtypes: float64(1), object(7)
memory usage: 5.4+ MB


In [6]:
# Column has wrong value in latitude
df = df[df.latitude != '33q.200088']
df["latitude"] = df.latitude.astype(float)

In [7]:
df["duration"] = df["duration (seconds)"].str.replace("`", "")
df["duration"] = df["duration"].astype(float)
df.drop(columns=["duration (seconds)"], inplace=True)

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88678 entries, 0 to 88678
Data columns (total 8 columns):
datetime     88678 non-null object
city         88678 non-null object
state        81269 non-null object
country      76314 non-null object
shape        85756 non-null object
latitude     88678 non-null float64
longitude    88678 non-null float64
duration     65534 non-null float64
dtypes: float64(3), object(5)
memory usage: 6.1+ MB


In [163]:
df.head()

Unnamed: 0,datetime,city,state,country,shape,latitude,longitude,duration
0,10/10/1949 20:30,san marcos,tx,us,cylinder,29.883056,-97.941111,2700.0
1,10/10/1949 21:00,lackland afb,tx,,light,29.38421,-98.581082,7200.0
2,10/10/1955 17:00,chester (uk/england),,gb,circle,53.2,-2.916667,20.0
3,10/10/1956 21:00,edna,tx,us,circle,28.978333,-96.645833,20.0
4,10/10/1960 20:00,kaneohe,hi,us,light,21.418056,-157.803611,900.0


#### Fill NAs

In [25]:
# TODO:
# Infer state from city
# Infer country from state
# Infer duration second from duration hour?
copy_df = df

# Country from state (states as a list)
us_states = np.asarray(['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
                        'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me',
                        'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm',
                        'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
                        'ut', 'va', 'vt', 'wa', 'wi', 'wv', 'wy'])
# NaN countries from state
copy_df['country'] = copy_df.apply(lambda row: 'us' if pd.isnull(row['country']) and row['state'] in us_states else row['country'], axis=1)
copy_df.head()



Unnamed: 0,datetime,city,state,country,shape,latitude,longitude,duration
0,10/10/1949 20:30,san marcos,tx,us,cylinder,29.883056,-97.941111,2700.0
1,10/10/1949 21:00,lackland afb,tx,us,light,29.38421,-98.581082,7200.0
2,10/10/1955 17:00,chester (uk/england),,gb,circle,53.2,-2.916667,20.0
3,10/10/1956 21:00,edna,tx,us,circle,28.978333,-96.645833,20.0
4,10/10/1960 20:00,kaneohe,hi,us,light,21.418056,-157.803611,900.0


In [43]:
# Convert datetime column into datetime objects. Time to separate column.
# Some missing or erroneus dates, so using errors='coerce'
copy_df['datetime'] = pd.to_datetime(copy_df['datetime'], errors='coerce')
#copy_df.insert(1, 'time', copy_df['datetime'].dt.time) INSERTED!!!
copy_df.head()

Unnamed: 0,datetime,year,date,time,city,state,country,shape,latitude,longitude,duration
0,1949-10-10 20:30:00,1949.0,1949-10-10,20:30:00,san marcos,tx,us,cylinder,29.883056,-97.941111,2700.0
1,1949-10-10 21:00:00,1949.0,1949-10-10,21:00:00,lackland afb,tx,us,light,29.38421,-98.581082,7200.0
2,1955-10-10 17:00:00,1955.0,1955-10-10,17:00:00,chester (uk/england),,gb,circle,53.2,-2.916667,20.0
3,1956-10-10 21:00:00,1956.0,1956-10-10,21:00:00,edna,tx,us,circle,28.978333,-96.645833,20.0
4,1960-10-10 20:00:00,1960.0,1960-10-10,20:00:00,kaneohe,hi,us,light,21.418056,-157.803611,900.0


In [29]:
# Date to separate column
copy_df.insert(1, 'date', copy_df['datetime'].dt.date)
copy_df.head()

Unnamed: 0,datetime,date,time,city,state,country,shape,latitude,longitude,duration
0,1949-10-10 20:30:00,1949-10-10,20:30:00,san marcos,tx,us,cylinder,29.883056,-97.941111,2700.0
1,1949-10-10 21:00:00,1949-10-10,21:00:00,lackland afb,tx,us,light,29.38421,-98.581082,7200.0
2,1955-10-10 17:00:00,1955-10-10,17:00:00,chester (uk/england),,gb,circle,53.2,-2.916667,20.0
3,1956-10-10 21:00:00,1956-10-10,21:00:00,edna,tx,us,circle,28.978333,-96.645833,20.0
4,1960-10-10 20:00:00,1960-10-10,20:00:00,kaneohe,hi,us,light,21.418056,-157.803611,900.0


In [45]:
# Year to separate column. Have the sightings increased during years??+
#copy_df.insert(1, 'year', copy_df['datetime'].dt.year) INSERTED!!!!
copy_df['year'] = copy_df['year'].fillna(0).astype(int)
copy_df.head()

Unnamed: 0,datetime,year,date,time,city,state,country,shape,latitude,longitude,duration
0,1949-10-10 20:30:00,1949,1949-10-10,20:30:00,san marcos,tx,us,cylinder,29.883056,-97.941111,2700.0
1,1949-10-10 21:00:00,1949,1949-10-10,21:00:00,lackland afb,tx,us,light,29.38421,-98.581082,7200.0
2,1955-10-10 17:00:00,1955,1955-10-10,17:00:00,chester (uk/england),,gb,circle,53.2,-2.916667,20.0
3,1956-10-10 21:00:00,1956,1956-10-10,21:00:00,edna,tx,us,circle,28.978333,-96.645833,20.0
4,1960-10-10 20:00:00,1960,1960-10-10,20:00:00,kaneohe,hi,us,light,21.418056,-157.803611,900.0


In [50]:
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88678 entries, 0 to 88678
Data columns (total 11 columns):
datetime     87458 non-null datetime64[ns]
year         88678 non-null int64
date         87458 non-null object
time         87458 non-null object
city         88678 non-null object
state        81269 non-null object
country      83276 non-null object
shape        85756 non-null object
latitude     88678 non-null float64
longitude    88678 non-null float64
duration     65534 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 8.1+ MB
