In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import read_csv # Importing the read_csv method 
                            # as it will allow us to write cleaner code
%matplotlib inline
from pylab import *
from natsort import index_natsorted
import seaborn as sns
sns.set_theme(style="ticks")
import sqlite3

# Connect to the project's SQLite database.
# The path lives in a single constant so that the file actually opened and the
# file named in the status message can never disagree (previously the code
# opened 'mydatbase.db' while the message claimed 'mydatabase.db').
# NOTE(review): 'mydatbase.db' (no second 'a') looks like a typo. It is kept
# here because it is the file this notebook has been opening; if it is renamed,
# rename the file on disk at the same time.
DB_PATH = 'mydatbase.db'
connection = sqlite3.connect(DB_PATH)
crsr_for_database = connection.cursor()
print(f"Connected to '{DB_PATH}'")

# Load the three datasets this project is built on.
# Paths are relative to the notebook's working directory.
single_family_data = read_csv('./datasets/single_family_home.csv')
condo_data = read_csv('./datasets/median_condo_price.csv')
black_and_latino_data = read_csv('./datasets/black_and_latino_mortgage_rates.csv')

# Display floats with 2 decimal places (used for the '% Change' column)
pd.options.display.float_format = '{:.2f}'.format

# Signals that the connection and all three datasets loaded without error
print("All systems GO!")

In [None]:
# Some formatting before we begin

# Convert the '% Change' column from text such as "12.5%" to float.
#
# Casting to str first makes this cell safe to re-run: once the column is
# already float, a bare `.str` accessor raises AttributeError. Stripping only
# a trailing '%' (instead of blindly slicing off the last character) also
# avoids chopping a digit from any value that has no '%' suffix.
single_family_data['% Change'] = (
    single_family_data['% Change']
    .astype(str)
    .str.strip()
    .str.rstrip('%')
    .astype(float)
)

# Format floats to show 2 decimal places for '% Change' col
pd.options.display.float_format = '{:.2f}'.format

## Single Family Median Price Data:

In this section we comb through the single-family median sale price dataset, which covers January–June 2021 and the same months of 2022.

We have no particular hypothesis about what we will uncover; the aim is simply to present the data in a way that makes sense to the end user.

In [None]:
# Show the dtype pandas holds for each column of the single-family dataset
single_family_data.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
single_family_data.describe()

In [None]:

# Distinct labels found in the 'Communities Type' column, in order of first appearance
single_family_data['Communities Type'].unique()


In [None]:
# Report missing values per column, then drop the 'Notes' column.
# According to the original author's notes, 'Notes' held the only NaN values
# in this dataset and the other two datasets needed no such cleanup.
#
# The reports below run BEFORE the column is dropped, so on a first run they
# still list 'Notes'; the cleaned column list is visible once the drop at the
# bottom of this cell has executed.

# Count of NaN values in each column
print(f"Total number of 'NaN' inputs: \n\n{single_family_data.isna().sum()}\n")

# Same count via isnull(), which pandas defines as an alias of isna()
print(f"Total number of 'Null' inputs: \n{single_family_data.isnull().sum()}\n")

# Print all of the column names
print(f"\n{single_family_data.columns}")

# Drop 'Notes'. errors='ignore' keeps this cell re-runnable: a plain
# `del single_family_data['Notes']` raises KeyError on the second execution
# because the column is already gone.
single_family_data = single_family_data.drop(columns='Notes', errors='ignore')

In [None]:
# Sanity check: list the column names to confirm that 'Notes' is gone
# after the deletion performed in the previous cell
print(single_family_data.columns)

In [None]:
# Presenting data by community type:
# split the single-family data into one DataFrame per 'Communities Type' label.


def _rows_for_community(label):
    """Return the rows of ``single_family_data`` whose 'Communities Type' equals ``label``.

    The comparison is an exact string match, so ``label`` must be spelled
    exactly as it appears in the dataset.
    """
    return single_family_data.loc[single_family_data['Communities Type'] == label]


# Metro Core Communities
metro_data_single = _rows_for_community("Metro Core Communities")

# Regional Urban Centers
reg_urban_centers_single = _rows_for_community("Regional Urban Centers")

# Streetcar Suburbs
street_car_suburb_single = _rows_for_community("Streetcar Suburbs")

# Developing Suburbs
developing_suburb_single = _rows_for_community("Developing Suburbs")

# Maturing Suburbs
# (the label previously carried a stray leading apostrophe, "'Maturing Suburbs",
# which cannot equal the 'Maturing Suburbs' label and left this frame empty)
maturing_suburb_single = _rows_for_community("Maturing Suburbs")

# Rural Towns
rural_town_single = _rows_for_community("Rural Towns")

In [None]:
# Rank the communities inside each community type by '% Change', largest
# increase first, to present which ones gained the most value.
#
# A notebook cell only renders the value of its LAST expression, so six bare
# `.sort_values(...)` calls would silently discard the first five tables.
# Each table is therefore passed to display() explicitly, under its own heading.
# display() is provided by the IPython/Jupyter kernel without an import.
frames_by_community_type = [
    ("Metro Core Communities", metro_data_single),
    ("Regional Urban Centers", reg_urban_centers_single),
    ("Streetcar Suburbs", street_car_suburb_single),
    ("Developing Suburbs", developing_suburb_single),
    ("Maturing Suburbs", maturing_suburb_single),
    ("Rural Towns", rural_town_single),
]

for community_label, community_frame in frames_by_community_type:
    print(f"\n{community_label} (single family), sorted by % Change:")
    display(community_frame.sort_values(by='% Change', ascending=False))

In [None]:
# List every column name of the three datasets so the reader knows
# which fields are available before the values themselves are shown.
def show_column_names():
    """Print a heading for each dataset followed by its column names, one per line."""
    datasets = (
        ("\nSingle family data:\n", single_family_data),
        ("\nCondo data:\n", condo_data),
        ("\nBlack and Latino data:\n", black_and_latino_data),
    )
    for heading, frame in datasets:
        print(heading)
        for column_name in frame.columns:
            print(column_name)

show_column_names()

#### Showing the Head and Tail

In [None]:
# Preview the first 5 rows of the single-family data.
# No sorting has been applied to this frame, so the rows appear
# in the order they were read from the CSV file.

single_family_data.head(n=5)

In [None]:
# Preview the last 5 rows of the single-family data
single_family_data.tail(n=5)

In [None]:
# Order the communities by their Jan - June 2021 median sale price, highest first
start_price_column = "Median Sale Price, Jan - June 2021"
single_family_start_price = single_family_data.sort_values(
    by=start_price_column,
    ascending=False,
)

# Plain-text dump of the sorted frame, followed by a rich preview of its top 5 rows
print(single_family_start_price)
single_family_start_price.head()