 <!---
 
 TO DO:
* Remove the '%' character from the '% Change' column, as it prevents converting the column's dtype to float
* Convert the '% Change' column to float
* Clean notebook to have individual datasets in their respective columns

--->

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import read_csv # Importing the read_csv method 
                            # as it will allow us to write cleaner code
%matplotlib inline
from pylab import *
from natsort import index_natsorted
import seaborn as sns
sns.set_theme(style="ticks")
import sqlite3

# Open (or create) the project's SQLite database with sqlite3.
# The file name lives in one constant so that the file actually opened and
# the name reported in the status message cannot drift apart: the original
# opened 'mydatbase.db' while announcing 'mydatabase.db'.
DB_PATH = 'mydatabase.db'
connection = sqlite3.connect(DB_PATH)
crsr_for_database = connection.cursor()
print(f"Connected to '{DB_PATH}'")
        
# Load the three CSV datasets this project is built on.
# Each path is named first, then read into its own DataFrame,
# in the same order as before: single family, condo, Black and Latino.

single_family_csv = '/content/datasets/single_family_home.csv'
condo_csv = '/content/datasets/median_condo_price.csv'
black_and_latino_csv = '/content/datasets/black_and_latino_mortgage_rates.csv'

single_family_data = pd.read_csv(single_family_csv)
condo_data = pd.read_csv(condo_csv)
black_and_latino_data = pd.read_csv(black_and_latino_csv)

# Confirmation message once every dataset has been read
print("All systems GO!")

### Single Family Median Price Data:

In this section we will be combing through the single family median price dataset for the months of Jan-Jun of 2021 and the same months of 2022.

We have no particular aim as to what we hope to uncover; the goal is simply to present the data in a way that makes full sense to the end user.

In [None]:
# Take the Jan - June 2021 median sale price column and print it
# sorted with sort_values() defaults.
median_price_2021 = single_family_data['Median Sale Price, Jan - June 2021']
price_info_on_single_family = median_price_2021.sort_values()
print(price_info_on_single_family)

In [None]:
# Show the dtype pandas assigned to each column of the single family data
single_family_data.dtypes

In [None]:
"""
In order to properly order by % change values
I will need to convert the type
of the % change columns from object
to float

This means I will also have to remove the '%' character,
It is pretty redundant as we know that its the % column
"""

In [None]:

# Distinct values found in the 'Communities Type' column
single_family_data['Communities Type'].unique()


In [None]:
# Inspect missing values, then remove the 'Notes' column.
# Per the original author's inspection, 'Notes' held the only NaN values in
# this data set; the other two data sets were complete and did not need to
# be amended.

# Per-column count of NaN entries. This runs BEFORE 'Notes' is removed,
# so on a first run 'Notes' still appears in this output.
print(f"Total number of 'NaN' inputs: \n\n{single_family_data.isna().sum()}\n")

# Per-column count of null entries (isnull() is an alias of isna(),
# so these counts match the ones printed above)
print(f"Total number of 'Null' inputs: \n{single_family_data.isnull().sum()}\n")

# Prints all of the column names
print(f"\n{single_family_data.columns}")

# Remove the 'Notes' column in place.
# errors='ignore' keeps this cell safe to re-run: the original
# `del single_family_data['Notes']` raised KeyError once the column was gone.
single_family_data.drop(columns=['Notes'], inplace=True, errors='ignore')

In [None]:
# Confirm which columns remain after the 'Notes' column was deleted
remaining_columns = single_family_data.columns
print(remaining_columns)

In [None]:
# Need to remove % character from all values in the '% Change column before converting the type to a float

## First, let's find out what we are dealing with

Knowing what you are dealing with is a huge part of the battle. If all one sees is numbers and percentages, the data will not make any sense, and I would not have done my job properly.

Being able to know what the values mean helps the end user and every user in between. It may seem trivial to some, but presenting this data in its correct order will allow the viewer to quickly assess what a number means based on its placement on a chart, graph, or file.

In [None]:
# Showing the column names to get a little 
# more insight into the values we will be seeing
def show_column_names():
  """Print the column names of each of the three datasets.

  For every dataset a blank line, a heading and another blank line are
  printed, followed by one column name per line. The datasets are read
  from the module-level names single_family_data, condo_data and
  black_and_latino_data. Returns None.
  """
  # One (heading, DataFrame) pair per dataset, in display order. A single
  # loop replaces three copies of the same code, and the heading typo
  # "Sngle family data" is corrected.
  labelled_frames = (
    ("Single family data", single_family_data),
    ("Condo data", condo_data),
    ("Black and Latino data", black_and_latino_data),
  )
  for label, frame in labelled_frames:
    print(f"\n{label}:\n")
    for col in frame.columns:
      print(col)

show_column_names()

### Showing the Head and Tail

In [None]:
# Preview the first 5 rows of the single family data.
# No sorting is applied here: the rows appear in the order
# in which they were entered into the CSV file.

single_family_data.head(n=5)

In [None]:
# Preview the last 5 rows of the single family data
single_family_data.tail(n=5)

In [None]:
# Getting a head/tail for all 3 datasets
def get_head_and_tail():
  """Print a heading, then the head and the tail, for each of the three datasets."""

  def _preview(title, frame):
    # Heading first, then the default head() and tail() of the frame
    print(title)
    print(frame.head())
    print(frame.tail())

  # Single family data
  _preview("Single family:\n", single_family_data)
  # Condo data
  _preview("Condo data: \n", condo_data)
  # Black and Latino data
  _preview("Black and Latino owner share: \n", black_and_latino_data)

get_head_and_tail()

## Let's start presenting the data


In [None]:
# A simple first look at the data: a seaborn line plot of the whole
# single family DataFrame. Only data= is passed, with no x/y columns
# (presumably seaborn then treats the frame as wide-form input --
# TODO confirm against the seaborn documentation).
sns.lineplot(data=single_family_data)

In [None]:
# Preview the last 5 rows of the Black and Latino dataset
black_and_latino_data.tail(n=5)


In [None]:
# Order the single family rows by the Jan - June 2021 median sale price,
# descending; print the full result, then preview its first rows.
# NOTE(review): if this column is stored as text, the order is lexical
# rather than numeric -- confirm the column dtype.
price_column_2021 = "Median Sale Price, Jan - June 2021"
single_family_start_price = single_family_data.sort_values(price_column_2021, ascending=False)
print(single_family_start_price)
single_family_start_price.head()

In [None]:
# Sort the rows by '% Change' (ascending) using the numeric value of each entry.
# The column is stored as text with a '%' character (see the notebook TODO).
# A natural sort with natsort's default settings compares digit runs as
# unsigned integers, so negative and decimal percentages end up misordered.
# Stripping the '%' and converting to a number gives a true numeric order;
# entries that cannot be parsed become NaN and are placed last.
sorted_single_family_data = single_family_data.sort_values(
    by=["% Change"],
    ascending=True,
    key=lambda col: pd.to_numeric(
        col.astype(str).str.replace("%", "", regex=False).str.strip(),
        errors="coerce",
    ),
)
print(sorted_single_family_data)

### What can we do with this data:

The sorting method allows us to sort and present data in a way that will let end users know which towns had the greatest changes (positive or negative) in price for single family homes in specific towns over the same period of time.

##### What could people do with this information:

The uses are many: one could use this to make an informed decision on where they could potentially purchase a home for the greatest return... or they could be a family who is strapped for cash and looking to purchase a home without thinking about upside/downside in price.

The uses don't have to satisfy only one group. The data is the data is the data, so the overall use of it, in my opinion, is impartial.

In [None]:
# Sort by community type first, then by percent change within each type,
# both in descending order.
# sort_values applies `key` to every column named in `by`. The original key
# ignored its argument and returned the '% Change' ranking for both columns,
# so 'Communities Type' never influenced the order. Here only '% Change' is
# transformed (to its numeric value, with the '%' character stripped);
# 'Communities Type' is compared as-is.
community_type_single_family = single_family_data.sort_values(
    by=["Communities Type", "% Change"],
    ascending=False,
    key=lambda col: (
        pd.to_numeric(
            col.astype(str).str.replace("%", "", regex=False).str.strip(),
            errors="coerce",
        )
        if col.name == "% Change"
        else col
    ),
)
community_type_single_family.head()


### Making sense of the single family data


In [None]:
# Keep only the single family rows whose 'Municipality' is Boston
single_family_is_boston = single_family_data['Municipality'] == "Boston"
boston_single_family_data = single_family_data.loc[single_family_is_boston]
print(boston_single_family_data)

In [None]:
# Keep only the condo rows whose 'Town' is Boston
condo_is_boston = condo_data['Town'] == "Boston"
boston_condo_data = condo_data.loc[condo_is_boston]

print(boston_condo_data)

In [None]:
# Keep only the Black and Latino rows whose 'Municipality' is Boston
black_and_latino_is_boston = black_and_latino_data['Municipality'] == "Boston"
boston_black_and_latino = black_and_latino_data.loc[black_and_latino_is_boston]
print(boston_black_and_latino)

In [None]:
def all_boston_data():
  """Print the Boston rows of the single family, condo, and Black and Latino datasets."""

  def _boston_rows(frame, place_column):
    # Rows of `frame` whose `place_column` value equals "Boston"
    return frame.loc[frame[place_column] == "Boston"]

  # All three selections are made before anything is printed
  single_family_rows = _boston_rows(single_family_data, 'Municipality')
  condo_rows = _boston_rows(condo_data, 'Town')
  mortgage_rows = _boston_rows(black_and_latino_data, 'Municipality')

  for rows in (single_family_rows, condo_rows, mortgage_rows):
    print(rows)

all_boston_data()


In [None]:
# Relplot of 'Condo.PercChange' against 'Town' for the first five condo rows
head_condo_data = condo_data.head(n=5)
sns.relplot(data=head_condo_data, x="Town", y="Condo.PercChange")


In [None]:
# Sort the condo data by percent change, largest first.
# Per the original note, the 'Condo.PercChange' values carry a '%' symbol,
# so sorting the column directly compares strings rather than numbers.
# The key strips the '%' and converts each entry to a number for the
# comparison only; the stored values are left unchanged. Entries that
# cannot be parsed become NaN and are placed last.

condo_data.sort_values(
    by=['Condo.PercChange'],
    ascending=False,
    key=lambda col: pd.to_numeric(
        col.astype(str).str.replace("%", "", regex=False).str.strip(),
        errors="coerce",
    ),
)