# Projeto 1 de LPAA - Análise e Exploração de Dados - LEANDRO DANTAS LIMA (059.323.894-00)

In [2]:
# importando bibliotecas que serão utilizadas
import numpy as np
import pandas as pd
from pandas import DataFrame

In [9]:
# Turn off pandas' chained-assignment check, which silences SettingWithCopyWarning for the whole session.
# NOTE(review): this hides the warning instead of addressing its cause; the frame produced by
# dropna() further down is assigned to column by column, which is what the check reports.
pd.options.mode.chained_assignment = None

In [10]:
# Load the UFO Sightings dataset for analysis.
df = pd.read_csv(
    "scrubbed.csv",
    sep=",",
    on_bad_lines='skip',   # malformed rows are dropped instead of raising
    low_memory=False,      # read the file in one pass rather than in chunks
)

In [11]:
# Work on a copy so the frame loaded from disk is kept as an untouched backup.
# DataFrame.copy() is deep by default: data and index are duplicated, so
# changes made to the copy do not reach `df`.
df_copy = df.copy()

In [12]:
# Preview the first five rows to get a feel for the data.
df_copy.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [13]:
# Dimensions of the frame as (rows, columns).
df_copy.shape

(80332, 11)

In [14]:
# Column labels. Note that the last label is 'longitude ' with a trailing space,
# so it has to be spelled that way wherever the column is referenced.
df_copy.columns

Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude '],
      dtype='object')

In [15]:
# Data type of each column; pandas reports `object` when it could not infer a more specific type.
df_copy.dtypes

datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object

In [16]:
# Row index of the frame.
df_copy.index

RangeIndex(start=0, stop=80332, step=1)

In [17]:
# Summary statistics; describe() covers only the numeric columns by default.
df_copy.describe()

Unnamed: 0,longitude
count,80332.0
mean,-86.772885
std,39.697205
min,-176.658056
25%,-112.073333
50%,-87.903611
75%,-78.755
max,178.4419


In [18]:
# Count the missing values in each column.
df_copy.isna().sum()

datetime                   0
city                       0
state                   5797
country                 9670
shape                   1932
duration (seconds)         0
duration (hours/min)       0
comments                  15
date posted                0
latitude                   0
longitude                  0
dtype: int64

In [19]:
# Drop every row that has a missing value in any column.
# The explicit .copy() makes the result an independent frame, so assigning
# columns on it later does not raise SettingWithCopyWarning.
df_copy_clean = df_copy.dropna().copy()

In [20]:
# Confirm that no missing values remain after the cleanup.
df_copy_clean.isna().sum()

datetime                0
city                    0
state                   0
country                 0
shape                   0
duration (seconds)      0
duration (hours/min)    0
comments                0
date posted             0
latitude                0
longitude               0
dtype: int64

In [21]:
# Inspect the dtype and non-null count of each column.
df_copy_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66516 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              66516 non-null  object 
 1   city                  66516 non-null  object 
 2   state                 66516 non-null  object 
 3   country               66516 non-null  object 
 4   shape                 66516 non-null  object 
 5   duration (seconds)    66516 non-null  object 
 6   duration (hours/min)  66516 non-null  object 
 7   comments              66516 non-null  object 
 8   date posted           66516 non-null  object 
 9   latitude              66516 non-null  object 
 10  longitude             66516 non-null  float64
dtypes: float64(1), object(10)
memory usage: 6.1+ MB


In [22]:
# Columns holding numeric values that must be numeric for statistics; not all of them were loaded that way.
# 'longitude ' keeps the trailing space that is present in the column label.
col_num = ['duration (seconds)', 'latitude', 'longitude ']

def to_type(DataFrame, columns, type):
    """Cast the given columns of a frame to ``type``, modifying the frame in place.

    Args:
        DataFrame: The pandas DataFrame to modify. The parameter name shadows
            the ``DataFrame`` class imported by the notebook; it is kept so
            existing callers keep working.
        columns: A column label, or an iterable of column labels, to convert.
        type: Target dtype in any form ``Series.astype`` accepts
            (for example ``'float'``). Shadows the builtin inside this function.

    Returns:
        None. The converted columns replace the originals on ``DataFrame``.

    Raises:
        KeyError: If a label is not a column of the frame.
        ValueError: If a value cannot be converted to ``type``. In that case
            no column of the frame has been changed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    # Convert everything first and assign afterwards, so a failure in one
    # column leaves the frame untouched instead of half-converted.
    converted = [(col, DataFrame[col].astype(type)) for col in columns]
    for col, values in converted:
        DataFrame[col] = values
        
# Cast the numeric columns to float; to_type modifies df_copy_clean in place.
to_type(df_copy_clean, col_num, 'float')

In [23]:
# Re-check the dtypes to confirm the conversion to float.
df_copy_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66516 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              66516 non-null  object 
 1   city                  66516 non-null  object 
 2   state                 66516 non-null  object 
 3   country               66516 non-null  object 
 4   shape                 66516 non-null  object 
 5   duration (seconds)    66516 non-null  float64
 6   duration (hours/min)  66516 non-null  object 
 7   comments              66516 non-null  object 
 8   date posted           66516 non-null  object 
 9   latitude              66516 non-null  float64
 10  longitude             66516 non-null  float64
dtypes: float64(3), object(8)
memory usage: 6.1+ MB


In [24]:
# Keep only the columns describing when, where, for how long and in what shape
# each sighting was reported.
coord = [
    'datetime',
    'latitude',
    'longitude ',
    'duration (seconds)',
    'city',
    'state',
    'country',
    'shape',
]
local = df_copy_clean[coord]

In [26]:
# Save the selected columns to an Excel workbook (path is relative to the working directory).
local.to_excel(r'coordenadas.xlsx')

In [53]:
# Number of rows for each value of the 'country' column.
df_copy_clean['country'].value_counts()

country
us    63553
ca     2942
gb       11
au       10
Name: count, dtype: int64

In [54]:
# Number of rows for each value of the 'state' column.
df_copy_clean['state'].value_counts()

state
ca    8683
fl    3754
wa    3709
tx    3398
ny    2915
      ... 
nf      15
pe      10
dc       8
yt       7
yk       2
Name: count, Length: 67, dtype: int64

In [57]:
# Number of rows for each value of the 'city' column.
df_copy_clean['city'].value_counts()

city
seattle                           471
phoenix                           438
las vegas                         356
portland                          354
los angeles                       347
                                 ... 
egg harbor city                     1
garrettsville                       1
king of prussia (5 miles from)      1
moosup                              1
calmar (canada)                     1
Name: count, Length: 11920, dtype: int64

In [62]:
# Number of rows for each reported shape. Bracket access is required here because
# `.shape` as an attribute returns the frame's dimensions, not the column.
df_copy_clean['shape'].value_counts()

shape
light        14130
triangle      6817
circle        6405
fireball      5364
unknown       4774
other         4705
sphere        4552
disk          4319
oval          3160
formation     2088
cigar         1717
changing      1653
flash         1124
rectangle     1117
cylinder      1079
diamond        969
chevron        852
teardrop       614
egg            609
cone           257
cross          197
delta            7
round            2
pyramid          1
flare            1
hexagon          1
crescent         1
changed          1
Name: count, dtype: int64

In [75]:
# Pull the year out of the 'datetime' text: take the third '/'-separated field,
# then keep what precedes the first space (dropping the time of day).
ano = (
    df_copy_clean['datetime']
    .str.split('/').str.get(2)
    .str.split(' ').str.get(0)
)

In [76]:
# Display the extracted years (kept as text, indexed like df_copy_clean).
ano

0        1949
3        1956
4        1960
5        1961
7        1965
         ... 
80327    2013
80328    2013
80329    2013
80330    2013
80331    2013
Name: datetime, Length: 66516, dtype: object

In [77]:
# Number of sightings per year, most frequent first.
ano.value_counts()

datetime
2012    6489
2013    6237
2011    4456
2008    4089
2009    3743
        ... 
1943       1
1941       1
1920       1
1925       1
1934       1
Name: count, Length: 83, dtype: int64