# **Unidentified Flying Analytics:**
---
### Project Overview

From the description: What areas of the country are most likely to have UFO sightings? Are there any trends in UFO sightings over time? Do they tend to be clustered or seasonal? Do clusters of UFO sightings correlate with landmarks, such as airports or government research centers? What are the most common UFO descriptions?

---
### Datasets

https://www.kaggle.com/NUFORC/ufo-sightings

### Import dependencies

In [2]:
!pip install plotly



In [1]:
import pandas as pd
import requests
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import *
import zipfile
import geopandas as gpd
import glob
import plotly.express as px

### Get UFO sightings file

In [19]:
# Local copy of the raw sightings CSV.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
file_path = 'C:\\Users\\12039\\Documents\\UFO_sightings\\original_scrubbed.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that
# this read otherwise triggers. The affected columns are still cleaned
# explicitly with pd.to_numeric in later cells.
ufo_data = pd.read_csv(file_path, low_memory=False)
ufo_data.head()

  ufo_data = pd.read_csv(file_path)


Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [20]:
# Inspect the dtypes pandas inferred on load, to see which columns still need
# explicit conversion (dates and numeric columns read as object).
ufo_data.dtypes

datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object

### Clean UFO sightings file

In [21]:
# Find timestamps written with an hour of "24:00". This is a look at the raw
# values BEFORE any replacement is applied.
raw_timestamps = ufo_data['datetime']
has_hour_24 = raw_timestamps.str.contains('24:00')
problematic_dates = raw_timestamps.loc[has_hour_24]

# Show the affected rows (index + raw timestamp string).
print(problematic_dates)

388      10/11/2006 24:00
693       10/1/2001 24:00
962       10/1/2012 24:00
1067     10/12/2003 24:00
1221     10/12/2013 24:00
               ...       
79136      9/4/2005 24:00
79137      9/4/2005 24:00
79740      9/7/2002 24:00
79759      9/7/2004 24:00
79942      9/8/2003 24:00
Name: datetime, Length: 694, dtype: object


In [22]:
# "24:00" denotes midnight at the END of the given day, i.e. 00:00 of the
# following day. Rewriting it to "00:00" on the same date would move those
# sightings a full day earlier, so remember which rows were affected and add
# one day back after parsing.
rolls_to_next_day = ufo_data['datetime'].str.contains('24:00', regex=False, na=False)

# Literal (non-regex) replacement so the timestamp becomes parseable.
parseable_timestamps = ufo_data['datetime'].str.replace('24:00', '00:00', regex=False)

# Convert the column to datetime; errors='coerce' turns anything that still
# cannot be parsed into NaT instead of raising.
ufo_data['datetime'] = (
    pd.to_datetime(parseable_timestamps, errors='coerce')
    + pd.to_timedelta(rolls_to_next_day.astype(int), unit='D')
)

In [23]:
# Parse the report's posting date into datetime64. No errors='coerce' here,
# so an unparseable value raises instead of silently becoming NaT.
ufo_data['date posted'] = pd.to_datetime(ufo_data['date posted'])

In [24]:
# The duration column was read as object; convert it to numbers.
# errors='coerce' replaces entries that are not valid numbers with NaN.
ufo_data['duration (seconds)'] = pd.to_numeric(ufo_data['duration (seconds)'], errors='coerce')

In [25]:
# Derive the duration in minutes from the numeric seconds column.
# Note the new column name has no space before the parenthesis, unlike
# 'duration (seconds)'.
ufo_data['duration(minutes)'] = ufo_data['duration (seconds)']/60

In [26]:
# Latitude was read as object; convert it to float.
# errors='coerce' replaces entries that are not valid numbers with NaN.
ufo_data['latitude'] = pd.to_numeric(ufo_data['latitude'], errors='coerce')

In [27]:
# Re-check dtypes after the conversions above: the date columns should now be
# datetime64 and the numeric columns float64.
ufo_data.dtypes

datetime                datetime64[ns]
city                            object
state                           object
country                         object
shape                           object
duration (seconds)             float64
duration (hours/min)            object
comments                        object
date posted             datetime64[ns]
latitude                       float64
longitude                      float64
duration(minutes)              float64
dtype: object

In [28]:
# List the distinct reported shapes (NaN is included when a shape is missing).
ufo_data['shape'].unique()

array(['cylinder', 'light', 'circle', 'sphere', 'disk', 'fireball',
       'unknown', 'oval', 'other', 'cigar', 'rectangle', 'chevron',
       'triangle', 'formation', nan, 'delta', 'changing', 'egg',
       'diamond', 'flash', 'teardrop', 'cone', 'cross', 'pyramid',
       'round', 'crescent', 'flare', 'hexagon', 'dome', 'changed'],
      dtype=object)

In [12]:
# List the distinct country codes (NaN is included when the country is missing).
ufo_data['country'].unique()

array(['us', nan, 'gb', 'ca', 'au', 'de'], dtype=object)

In [13]:
# List the distinct city strings; some carry a parenthesised region/country suffix.
ufo_data['city'].unique()

array(['san marcos', 'lackland afb', 'chester (uk/england)', ...,
       'calmar (canada)', 'aleksandrow (poland)', 'hamstead (hollyridge)'],
      dtype=object)

In [29]:
# List the distinct state/province codes as stored in the raw data (lower case).
ufo_data['state'].unique()

array(['tx', nan, 'hi', 'tn', 'ct', 'al', 'fl', 'ca', 'nc', 'ny', 'ky',
       'mi', 'ma', 'ks', 'sc', 'wa', 'ab', 'co', 'nh', 'wi', 'me', 'ga',
       'pa', 'il', 'ar', 'on', 'mo', 'oh', 'in', 'az', 'mn', 'nv', 'nf',
       'ne', 'or', 'bc', 'ia', 'va', 'id', 'nm', 'nj', 'mb', 'wv', 'ok',
       'ri', 'nb', 'vt', 'la', 'pr', 'ak', 'ms', 'ut', 'md', 'mt', 'sk',
       'wy', 'sd', 'pq', 'ns', 'qc', 'de', 'nd', 'dc', 'nt', 'sa', 'yt',
       'yk', 'pe'], dtype=object)

In [33]:
# Normalise state/province codes to upper case; missing values stay NaN.
ufo_data['state'] = ufo_data['state'].str.upper()

In [None]:
# Write the cleaned frame to the current working directory, without the
# row index as a column.
ufo_data.to_csv('ufo_data_cleaned.csv', index=False)

In [34]:
# Load the cleaned sightings from the copy hosted on GitHub.
# NOTE(review): this reads a remote file, not the local
# 'ufo_data_cleaned.csv' written by to_csv - presumably an uploaded copy of
# it; confirm the two are in sync.
# NOTE(review): no parse_dates is passed, so 'datetime' and 'date posted'
# come back as strings rather than datetime64 after the CSV round trip.
ufo_data_cleaned = pd.read_csv('https://raw.githubusercontent.com/kflemming30/UFO_sightings/main/ufo_data_cleaned.csv')

In [44]:
# Confirm the reloaded data carries upper-case state/province codes.
ufo_data_cleaned['state'].unique()

array(['TX', nan, 'HI', 'TN', 'CT', 'AL', 'FL', 'CA', 'NC', 'NY', 'KY',
       'MI', 'MA', 'KS', 'SC', 'WA', 'AB', 'CO', 'NH', 'WI', 'ME', 'GA',
       'PA', 'IL', 'AR', 'ON', 'MO', 'OH', 'IN', 'AZ', 'MN', 'NV', 'NF',
       'NE', 'OR', 'BC', 'IA', 'VA', 'ID', 'NM', 'NJ', 'MB', 'WV', 'OK',
       'RI', 'NB', 'VT', 'LA', 'PR', 'AK', 'MS', 'UT', 'MD', 'MT', 'SK',
       'WY', 'SD', 'PQ', 'NS', 'QC', 'DE', 'ND', 'DC', 'NT', 'SA', 'YT',
       'YK', 'PE'], dtype=object)

## Pull in other datasets

### Pull in population by state

In [38]:
# Load the population table; its NAME and POPESTIMATE2014 columns are the
# ones used further down.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
population_by_state = pd.read_csv("C:\\Users\\12039\\Documents\\UFO_sightings\\population_by_state.csv")
population_by_state.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,POPESTIMATE2014,POPEST18PLUS2014,PCNT_POPEST18PLUS
0,10,0,0,0,United States,318857056,245273438,76.9
1,40,3,6,1,Alabama,4849377,3741806,77.2
2,40,4,9,2,Alaska,736732,550189,74.7
3,40,4,8,4,Arizona,6731484,5109792,75.9
4,40,3,7,5,Arkansas,2966369,2259350,76.2


In [40]:
# Drop the nationwide aggregate row so it cannot be mistaken for a state.
is_not_national_total = population_by_state['NAME'] != 'United States'
population_by_state = population_by_state[is_not_national_total]
population_by_state.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,POPESTIMATE2014,POPEST18PLUS2014,PCNT_POPEST18PLUS
1,40,3,6,1,Alabama,4849377,3741806,77.2
2,40,4,9,2,Alaska,736732,550189,74.7
3,40,4,8,4,Arizona,6731484,5109792,75.9
4,40,3,7,5,Arkansas,2966369,2259350,76.2
5,40,4,9,6,California,38802500,29649348,76.4


In [41]:
# Keep only the state name and its 2014 population estimate, and rename the
# name column to 'STATE' so it can serve as the merge key later on.
population_by_state = (
    population_by_state
    .loc[:, ['NAME', 'POPESTIMATE2014']]
    .rename(columns={'NAME': 'STATE'})
)
population_by_state.head()

Unnamed: 0,STATE,POPESTIMATE2014
1,Alabama,4849377
2,Alaska,736732
3,Arizona,6731484
4,Arkansas,2966369
5,California,38802500


In [45]:
# List the names present in the population table; these are the exact strings
# any abbreviation-to-name mapping has to produce for a merge on 'STATE' to match.
population_by_state['STATE'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming',
       'Puerto Rico Commonwealth'], dtype=object)

### Where are sightings most likely?

In [50]:
# Restrict to sightings explicitly tagged as US.
# NOTE(review): rows whose country is missing are excluded by this comparison,
# even when their state code is a US state.
is_us = ufo_data_cleaned['country'] == 'us'
us_sightings = ufo_data_cleaned[is_us]

# Count sightings per state code, most sightings first.
state_counts = (
    us_sightings
    .groupby('state')
    .size()
    .reset_index(name='sightings_count')
    .sort_values(by='sightings_count', ascending=False)
)
print(state_counts)

   state  sightings_count
4     CA             8912
48    WA             3966
9     FL             3835
44    TX             3447
34    NY             2980
14    IL             2499
3     AZ             2414
38    PA             2366
35    OH             2275
22    MI             1836
37    OR             1747
27    NC             1740
24    MO             1458
5     CO             1413
15    IN             1288
46    VA             1273
19    MA             1256
31    NJ             1255
10    GA             1255
49    WI             1232
43    TN             1119
23    MN             1012
41    SC             1003
6     CT              892
17    KY              855
20    MD              837
33    NV              803
36    OK              724
32    NM              720
12    IA              678
1     AL              642
45    UT              622
16    KS              613
2     AR              588
21    ME              558
18    LA              558
13    ID              521
30    NH    

### Create mapping dictionary for state name and abbreviation

In [60]:
# Maps upper-case postal abbreviations to the names used in the population
# table's state-name column. Besides the 50 states it covers DC and Puerto
# Rico: both occur in the sightings data and in the population table, and
# without an entry they would map to NaN and end up with no population after
# the merge. The Puerto Rico value is spelled exactly as in the population
# table ('Puerto Rico Commonwealth').
state_abbr_mapping = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    'PR': 'Puerto Rico Commonwealth',
}


### Add state population to state_counts and compute sightings per 10,000 residents

In [62]:
# Attach the full state name to each per-state count. Codes that are absent
# from state_abbr_mapping map to NaN and therefore receive no population below.
state_counts['STATE'] = state_counts['state'].map(state_abbr_mapping)

# Left-join the population estimates onto the per-state counts (every count
# row is kept), give the joined columns shorter names, and derive the
# sightings rate per 10,000 residents.
merged_df = (
    state_counts
    .merge(population_by_state[['STATE', 'POPESTIMATE2014']], on='STATE', how='left')
    .rename(columns={'STATE': 'state_long', 'POPESTIMATE2014': 'pop_2014'})
    .assign(
        sightings_per_10000=lambda frame: (frame['sightings_count'] / frame['pop_2014']) * 10000
    )
)

# Display the states ranked by per-capita sightings (merged_df itself keeps
# its original row order).
merged_df.sort_values(by='sightings_per_10000', ascending=False)

Unnamed: 0,state,sightings_count,state_long,pop_2014,sightings_per_10000
1,WA,3966,Washington,7061530.0,5.616347
38,MT,478,Montana,1023579.0,4.669889
10,OR,1747,Oregon,3970239.0,4.400239
42,AK,319,Alaska,736732.0,4.329933
34,ME,558,Maine,1330089.0,4.195208
44,VT,260,Vermont,626562.0,4.149629
37,NH,486,New Hampshire,1326813.0,3.662913
6,AZ,2414,Arizona,6731484.0,3.586133
28,NM,720,New Mexico,2085572.0,3.45229
36,ID,521,Idaho,1634464.0,3.187589
