In [2]:
import datetime, time
import os
import sqlite3
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Allow up to 20 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 20

In [4]:
# Load the four raw datasets from the local CSVs/ directory.
astr_df            = pd.read_csv("CSVs/astronauts.csv")
spacewalks_df      = pd.read_csv("CSVs/space_walks.csv")
spacemissions_df   = pd.read_csv("CSVs/space_missions.csv")
global_launches_df = pd.read_csv("CSVs/global_space_launches.csv")

## Dropping columns

In [5]:
# Remove the two "Unnamed" columns from the missions data (presumably leftover
# index columns from an earlier CSV export — TODO confirm) and the split-out
# date/time parts from the launches data; its "Date" column is kept and parsed
# later in the notebook.
spacemissions_df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], inplace=True)
global_launches_df.drop(
    columns=["DateTime", "Year", "Month", "Day", "Time"],
    inplace=True,
)

## Date conversions

In [6]:
# Put the raw date columns of the three datasets side by side so their
# differing string formats can be compared before conversion.
date_df = pd.DataFrame(
    {
        "Global Launches Dates": global_launches_df["Date"],
        "Spacemissions Dates": spacemissions_df["Datum"],
        "Spacewalks Dates": spacewalks_df["Date"],
    }
)
date_df.head()

Unnamed: 0,Global Launches Dates,Spacemissions Dates,Spacewalks Dates
0,07/08/2020,"Fri Aug 07, 2020 05:12 UTC",06/03/1965
1,06/08/2020,"Thu Aug 06, 2020 04:01 UTC","March 16-17, 1966"
2,04/08/2020,"Tue Aug 04, 2020 23:57 UTC",06/05/1966
3,30/07/2020,"Thu Jul 30, 2020 21:25 UTC",07/19/1966
4,30/07/2020,"Thu Jul 30, 2020 11:50 UTC",07/20/1966


In [7]:
# Inspect the column dtypes of the date preview before any conversion.
date_df.info()

Global Launches Dates    object
Spacemissions Dates      object
Spacewalks Dates         object
dtype: object

In [8]:
# Slice each Datum string down to its first 16 characters, e.g.
# "Fri Aug 07, 2020 05:12 UTC" -> "Fri Aug 07, 2020", to drop the time and
# timezone before parsing.
# The vectorised .str slice passes missing values through as NaN, whereas
# apply(lambda x: x[0:16]) would raise TypeError on a non-string value.
spacemissions_df["Datum"] = spacemissions_df["Datum"].str[:16]

In [9]:
# Parse each date column with pandas' to_datetime, using the explicit format
# of its source file. errors="coerce" turns any value that does not match the
# format into NaT instead of raising (e.g. the spacewalks entry
# "March 16-17, 1966" seen in the preview above).

global_launches_df["Date"] = pd.to_datetime(
    global_launches_df["Date"], format="%d/%m/%Y", errors="coerce"
)

spacemissions_df["Datum"] = pd.to_datetime(
    spacemissions_df["Datum"], format="%a %b %d, %Y", errors="coerce"
)

spacewalks_df["Date"] = pd.to_datetime(
    spacewalks_df["Date"], format="%m/%d/%Y", errors="coerce"
)

In [10]:
# Re-running date_df to verify conversions were successful

In [11]:
# Rebuild the side-by-side date preview, now from the converted columns.
date_df = pd.DataFrame(
    {
        "Global Launches Dates": global_launches_df["Date"],
        "Spacemissions Dates": spacemissions_df["Datum"],
        "Spacewalks Dates": spacewalks_df["Date"],
    }
)
date_df.head()

Unnamed: 0,Global Launches Dates,Spacemissions Dates,Spacewalks Dates
0,2020-08-07,2020-08-07,1965-06-03
1,2020-08-06,2020-08-06,NaT
2,2020-08-04,2020-08-04,1966-06-05
3,2020-07-30,2020-07-30,1966-07-19
4,2020-07-30,2020-07-30,1966-07-20


In [12]:
# Check the dtype of each column in the rebuilt date preview.
date_df.dtypes

Global Launches Dates    datetime64[ns]
Spacemissions Dates      datetime64[ns]
Spacewalks Dates         datetime64[ns]
dtype: object

## Renaming columns for ease of access in pgAdmin

In [13]:
# Replace every frame's column labels with lower-case snake_case names.
# Labels are assigned by position, so each list must have one entry per
# existing column, in the frame's current column order.
global_launches_df.columns = [
    "company_name",
    "location",
    "detail",
    "status_rocket",
    "rocket",
    "status_mission",
    "country_of_launch",
    "company_country_origin",
    "private_or_state",
    "date",
]

spacemissions_df.columns = [
    "company_name",
    "location",
    "date",
    "detail",
    "status_rocket",
    "rocket",
    "status_mission",
]

spacewalks_df.columns = [
    "eva#",
    "country",
    "crew",
    "vehicle",
    "date",
    "duration",
    "purpose",
]

astr_df.columns = [
    "name",
    "year",
    "group",
    "status",
    "birth_date",
    "birth_place",
    "gender",
    "alma_mater",
    "undergraduate_major",
    "graduate_major",
    "military_rank",
    "military_branch",
    "space_flights",
    "space_flight_hours",
    "space_walks",
    "space_walks_hours",
    "missions",
    "death_date",
    "death_mission",
]

## Creating a connection to space_db and converting DataFrames to SQL tables
## Postgres:

In [20]:
# Read the Postgres password from the environment (None when the variable
# is not set) so it never has to be written into the notebook.
password = os.getenv('postgres_password')

In [21]:
# Build the SQLAlchemy engine for the local Postgres database "space_db".
# The password is percent-encoded so that characters such as "@", ":" or "/"
# cannot be mistaken for URL delimiters; an unset password (None) becomes an
# empty password rather than the literal text "None".
# NOTE: this URL needs the psycopg2 driver to be installed — without it
# create_engine raises ModuleNotFoundError.
encoded_password = quote(password, safe="") if password else ""
engine = create_engine(
    f"postgresql://postgres:{encoded_password}@localhost:5432/space_db"
)

ModuleNotFoundError: No module named 'psycopg2'

In [22]:
# Write every cleaned DataFrame to its own table in the Postgres database.
# if_exists="replace" drops and recreates a table that is already present, so
# the cell can be re-run; the default ("fail") raises ValueError as soon as
# the tables exist.
postgres_tables = {
    "global_launches": global_launches_df,
    "space_missions": spacemissions_df,
    "spacewalks": spacewalks_df,
    "astronauts": astr_df,
}

for table_name, frame in postgres_tables.items():
    frame.to_sql(table_name, engine, if_exists="replace")

NameError: name 'engine' is not defined

## SQLite:

In [25]:
# SQLAlchemy engine for a SQLite database file named space_db.sqlite
# (relative path — presumably resolved against the notebook's working directory).
sqlite_engine = create_engine('sqlite:///space_db.sqlite')

In [26]:
# Write every cleaned DataFrame to its own table in the SQLite database.
# if_exists="replace" drops and recreates a table that is already present, so
# the cell can be re-run; the default ("fail") raises ValueError as soon as
# the tables exist.
sqlite_tables = {
    "global_launches": global_launches_df,
    "space_missions": spacemissions_df,
    "spacewalks": spacewalks_df,
    "astronauts": astr_df,
}

for table_name, frame in sqlite_tables.items():
    frame.to_sql(table_name, sqlite_engine, if_exists="replace")

## Exploration

In [14]:
## Counting number of unconverted dates

In [15]:
# Number of launch dates that are missing (NaT) after conversion.
global_launches_df["date"].isna().sum()

0

In [16]:
# Number of mission dates that are missing (NaT) after conversion.
spacemissions_df["date"].isna().sum()

0

In [17]:
# Number of spacewalk dates that are missing (NaT) after conversion.
spacewalks_df["date"].isna().sum()

44

In [23]:
# Count the non-null graduate_major entries in each graduate-major group and
# keep the result as a one-column DataFrame indexed by major.
grad_majors = (
    astr_df
    .groupby("graduate_major")["graduate_major"]
    .count()
    .to_frame()
)

In [37]:
# The value column still carries the grouped column's name; call it "counts".
grad_majors = grad_majors.rename(columns={"graduate_major": "counts"})

In [40]:
# Order the majors from most to least frequent.
grad_majors = grad_majors.sort_values(by="counts", ascending=False)

In [42]:
# Show the first 20 rows of grad_majors (sorted by "counts", descending, above).
grad_majors.head(20)

Unnamed: 0_level_0,counts
graduate_major,Unnamed: 1_level_1
Aeronautical Engineering,27
Aerospace Engineering,21
Medicine,16
Physics,15
Mechanical Engineering,13
Electrical Engineering,8
Aeronautics & Astronautics,7
Aviation Systems,6
Astronomy,6
Engineering Management,5


In [43]:
# Count the rows recorded for each company in the missions data and label
# the resulting column "counts".
companies = (
    spacemissions_df
    .groupby("company_name")["company_name"]
    .count()
    .to_frame()
)
companies = companies.rename(columns={"company_name": "counts"})

In [45]:
# Order the companies from most to fewest recorded missions.
companies = companies.sort_values(by="counts", ascending=False)

In [47]:
# Show the first 20 rows of companies (sorted by "counts", descending, above).
companies.head(20)

Unnamed: 0_level_0,counts
company_name,Unnamed: 1_level_1
RVSN USSR,1777
Arianespace,279
General Dynamics,251
CASC,251
NASA,203
VKS RF,201
US Air Force,161
ULA,140
Boeing,136
Martin Marietta,114
