In [2]:
import pandas as pd

In [3]:
# Load data
# low_memory=False makes pandas infer each column's dtype from the whole file at once
# instead of chunk by chunk, so a column cannot end up holding a mix of parsed numbers
# and raw strings.
# NOTE(review): the recorded output echoes this read_csv line, which looks like the
# mixed-type DtypeWarning that this option addresses - confirm on a fresh run.
df = pd.read_csv("data/ufo_sighting_data.csv", low_memory=False)
df.shape

  df = pd.read_csv("data/ufo_sighting_data.csv")


(80332, 11)

In [33]:
from prettytable import PrettyTable

def pretty_table_missing_counts(df, cardinality_limit=10, display=True):
  '''
  Build a PrettyTable with one row per DataFrame column, listing its data type,
  non-null count, missing count, number of distinct non-null values, and whether
  that number exceeds the cardinality limit.
  Params:
    ► df (DataFrame) | Pandas DataFrame to summarise
    ► cardinality_limit (int) | A column is flagged as high cardinality when its distinct-value count is strictly greater than this. Default = 10
    ► display (bool) | True/False whether to print the table before returning. Default = True
  Return:
    ► PrettyTable table
  '''

  table = PrettyTable()

  table.field_names = [
      'Column Name', 'Data Type', 'Non-Null Count', 'Missing Count', 'Unique',
      'High Cardinality'
  ]

  total_rows = len(df)

  # Select columns by position rather than by label: with duplicated labels,
  # df[label] returns a DataFrame (which has no .dtype) instead of a Series.
  for position, column in enumerate(df.columns):
    series = df.iloc[:, position]
    # Convert numpy scalars to plain Python values so rows hold native int/bool.
    non_null_count = int(series.count())
    uniques = int(series.nunique())  # nunique() ignores missing values by default
    table.add_row([
        column,
        str(series.dtype),
        non_null_count,
        total_rows - non_null_count,
        uniques,
        uniques > cardinality_limit,
    ])

  if display:
    print(table)

  return table

# Print the per-column summary and keep the returned PrettyTable in `a`.
# NOTE(review): the recorded table reports 80328 non-null Date_time values with 0
# missing, while the load cell reported 80332 rows, so this output appears to come from
# a run made after the row-dropping cells below - confirm with Restart & Run All.
a = pretty_table_missing_counts(df, display=True)

+---------------------------------+-----------+----------------+---------------+--------+------------------+
|           Column Name           | Data Type | Non-Null Count | Missing Count | Unique | High Cardinality |
+---------------------------------+-----------+----------------+---------------+--------+------------------+
|            Date_time            |   object  |     80328      |       0       | 69582  |       True       |
|               city              |   object  |     80328      |       0       | 19898  |       True       |
|          state/province         |   object  |     74532      |      5796     |   67   |       True       |
|             country             |   object  |     70660      |      9668     |   5    |      False       |
|            UFO_shape            |   object  |     78398      |      1930     |   29   |       True       |
|   length_of_encounter_seconds   |  float64  |     80328      |       0       |  533   |       True       |
| described_duratio

In [26]:
# Keep only the rows whose latitude and longitude both coerce to a number; values that
# become NaN under errors='coerce' are discarded, then the index is renumbered from 0.
df = (
    df
    .loc[lambda frame: pd.to_numeric(frame['latitude'], errors='coerce').notna()]
    .loc[lambda frame: pd.to_numeric(frame['longitude'], errors='coerce').notna()]
    .reset_index(drop=True)
)

In [23]:
# Cast both coordinate columns to float in a single column-wise assignment.
df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)

In [32]:
# Discard rows whose encounter length becomes NaN under errors='coerce', renumber the
# index from 0, then store the column as float.
df = (
    df
    .loc[lambda frame: pd.to_numeric(frame['length_of_encounter_seconds'], errors='coerce').notna()]
    .reset_index(drop=True)
    .astype({'length_of_encounter_seconds': float})
)

In [48]:
# Show the rows at positions 387 and 388. In the recorded output, row 388 has the time
# "24:00", which is outside the 00-23 hour range that the '%H' directive accepts.
df.iloc[387:389]

Unnamed: 0,Date_time,city,state/province,country,UFO_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
387,10/11/2006 23:00,pampa,tx,us,triangle,60.0,1 minute,Two triangular objects&#44 lit up by three mai...,10/30/2006,35.536111,-100.959444
388,10/11/2006 24:00,rome,ny,us,oval,120.0,a min or two,I was walking from the garage to the house&#44...,2/1/2007,43.212778,-75.456111


In [49]:
# "24:00" denotes midnight at the END of the stated day (in the rows inspected above it
# follows "23:00" of the same date), but '%H' only accepts 00-23. Rewrite it as "00:00"
# so the value parses, then move those rows forward one day; swapping the text alone
# would place them 24 hours too early.
ends_at_midnight = df['Date_time'].str.endswith(' 24:00', na=False)
df['Date_time'] = pd.to_datetime(
    df['Date_time'].str.replace(' 24:00', ' 00:00', regex=False),
    format='%m/%d/%Y %H:%M',
)
df.loc[ends_at_midnight, 'Date_time'] += pd.Timedelta(days=1)

In [51]:
# Store the calendar year of the parsed Date_time values in a new 'Years' column.
df['Years'] = df['Date_time'].dt.year

In [54]:
# Latest year present in the 'Years' column.
df['Years'].max()

2014