FIRST SECTION: CATEGORIZING CITIES AS HOT, MEDIUM, OR COLD

In [64]:
# Lookup table: full state / territory name -> two-letter code.
# Used with Series.map() to turn spelled-out names into abbreviations.
# NOTE(review): 'District of Columbia' maps to 'DI', not the USPS code 'DC'.
# The manual state rows added later in this notebook use 'di' as well, so the
# two must be changed together if this is ever corrected.
us_state_abbrev = {
    # --- the fifty states ---
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # --- non-state entries that also appear in the housing data ---
    "Puerto Rico": "PR",
    "Virgin Islands": "VI",
    "District of Columbia": "DI",
    "New Brunswick": "NB",
    "Guam": "GU",
}

# FIRST SECTION: CLASSIFYING TEMPERATURES

In [None]:
import pandas as pd

# Read the per-city monthly temperature table
tdf = pd.read_csv("backend/data/citytemperatures.csv")

# Rename CITY to City so the key column matches the housing data
tdf = tdf.rename(columns={"CITY": "City"})

# The twelve month columns sit at positions 2..13. Convert them to numeric,
# turning unparseable entries into NaN. Assign by column label: a positional
# `tdf.iloc[:, 2:14] = ...` writes into the existing columns and can leave an
# object dtype in place on recent pandas versions.
month_cols = tdf.columns[2:14]
tdf[month_cols] = tdf[month_cols].apply(pd.to_numeric, errors="coerce")

# Yearly average for each row = mean of its month columns (NaN months are skipped)
tdf["Temperature"] = tdf[month_cols].mean(axis=1)

# Average the yearly value across rows that share a city name.
# sort=False keeps cities in order of first appearance.
tdf_avg_temp = tdf.groupby('City', as_index=False, sort=False)['Temperature'].mean()

# Keep one row per city and give it the per-city average computed above.
# Previously the first row's own temperature was kept and the average was
# computed but never used. Rows with a missing City get NaN here.
city_mean = tdf_avg_temp.set_index("City")["Temperature"]
tdf = tdf.drop_duplicates(subset=['City'], keep='first')
tdf = tdf.assign(Temperature=tdf["City"].map(city_mean))

# Tercile cut points used to split cities into Cold / Medium / Hot
quantiles = tdf["Temperature"].quantile([0.33, 0.66])

# Function to classify temperature based on quantiles
def classify_temp(temp, thresholds=None):
    """Label a temperature as "Hot", "Medium" or "Cold".

    Parameters
    ----------
    temp : float
        The temperature to classify.
    thresholds : mapping or Series, optional
        Object indexable by ``0.33`` and ``0.66`` giving the lower and upper
        cut points. Defaults to the notebook-level ``quantiles`` computed
        just before this function is defined.

    Returns
    -------
    str or None
        "Hot" if ``temp`` is at or above the 0.66 cut point, "Medium" if it is
        at or above the 0.33 cut point, otherwise "Cold". A missing
        temperature returns None instead of a label.
    """
    if thresholds is None:
        thresholds = quantiles

    # NaN compares False against both cut points, so without this guard a
    # missing temperature would fall through and be labelled "Cold".
    if pd.isna(temp):
        return None

    if temp >= thresholds[0.66]:
        return "Hot"
    elif temp >= thresholds[0.33]:
        return "Medium"
    else:
        return "Cold"

# Label every city with its temperature band
tdf["Category"] = tdf["Temperature"].map(classify_temp)

# Final city table (tdff = "tdf final"): keep only the key and the band,
# and expose the band under the column name "Temperature"
tdff = tdf.loc[:, ["City", "Category"]].rename(columns={"Category": "Temperature"})

tdff

Unnamed: 0,City,Temperature
0,"BIRMINGHAM,AL",Hot
1,"HUNTSVILLE,AL",Medium
2,"MOBILE,AL",Hot
3,"MONTGOMERY,AL",Hot
4,"ANCHORAGE,AK",Cold
...,...,...
259,"POHNPEI- CAROLINE IS.,PC",Hot
260,"CHUUK- E. CAROLINE IS.,PC",Hot
261,"YAP- W CAROLINE IS.,PC",Hot
262,"SAN JUAN,PR",Hot


# SECOND SECTION: COMBINING THIS WITH HOUSING DATA

In [None]:
# Read first housing dataset
hdf = pd.read_csv("backend/data/housingdata1.csv")

# Drop rows where 'City' or 'State' is NaN
hdf = hdf.dropna(subset=['City', 'State'])

# Keep only the columns we use downstream
hdf = hdf[["State", "City", "Bedroom", "Bathroom", "Area", "LotArea", "MarketEstimate", "RentEstimate", "Price"]]

# Read second housing dataset
hdf2 = pd.read_csv("backend/data/housingdata2.csv")

# Simplify second dataset (explicit copy so the column edits below act on an
# independent frame rather than a view of the raw read)
hdf2 = hdf2[["state", "city", "bed", "bath", "house_size", "acre_lot", "price"]].copy()

# Rename columns in second dataset to match the first one
hdf2.columns = ['State', 'City', 'Bedroom', 'Bathroom', 'Area', 'LotArea', 'Price']

# Drop rows where 'City' or 'State' is NaN in hdf2
hdf2 = hdf2.dropna(subset=['City', 'State'])

# Rename states to their abbreviations; names missing from the lookup are left as-is
hdf2['State'] = hdf2['State'].map(us_state_abbrev).fillna(hdf2['State'])

# Keep only rows whose state is now a two-capital-letter code
hdf2 = hdf2[hdf2['State'].str.match(r'^[A-Z]{2}$', na=False)]

# Combine the two housing frames (the second has no MarketEstimate / RentEstimate,
# so those come out as NaN for its rows)
hdf = pd.concat([hdf, hdf2], ignore_index=True)

# Build the join key "city,st" in lower case. Each part is stripped BEFORE
# joining: stripping only the joined string would leave whitespace next to
# the comma (e.g. "saraland ,al") and silently break the match.
hdf["City"] = (hdf["City"].str.strip() + "," + hdf["State"].str.strip()).str.lower()

# Normalise the temperature key the same way, then make sure it is unique:
# two spellings that differ only by case/padding would otherwise duplicate
# every matching housing row in the left merge below.
tdff = (
    tdff.assign(City=tdff["City"].str.strip().str.lower())
    .drop_duplicates(subset="City", keep="first")
)

# Left merge: every housing row is kept; cities with no temperature entry get NaN.
# validate raises if the right-hand key is ever not unique.
merged_df = hdf.merge(tdff[["City", "Temperature"]], on="City", how="left", validate="many_to_one")

merged_df

Unnamed: 0,State,City,Bedroom,Bathroom,Area,LotArea,MarketEstimate,RentEstimate,Price,Temperature
0,AL,"saraland,al",4.0,2.0,1614.0,0.38050,240600.0,1599.0,239900.0,
1,AL,"southside,al",3.0,2.0,1474.0,0.67034,186700.0,1381.0,1.0,
2,AL,"robertsdale,al",3.0,2.0,1800.0,3.20000,,,259900.0,
3,AL,"gulf shores,al",2.0,2.0,1250.0,,,,342500.0,
4,AL,"chelsea,al",3.0,3.0,2224.0,0.26000,336200.0,1932.0,335000.0,
...,...,...,...,...,...,...,...,...,...,...
2249097,WA,"richland,wa",4.0,2.0,3600.0,0.33000,,,359900.0,
2249098,WA,"richland,wa",3.0,2.0,1616.0,0.10000,,,350000.0,
2249099,WA,"richland,wa",6.0,3.0,3200.0,0.50000,,,440000.0,
2249100,WA,"richland,wa",2.0,1.0,933.0,0.09000,,,179900.0,


# SECTION 2.5: IMPORTING IN OTHER STATEWISE TEMPERATURE DATA TO REPLACE NAN VALUES

In [67]:
# Read state temperature data
# NOTE(review): the other data files are read from "backend/data/"; confirm
# this bare relative path still resolves from the notebook's working directory.
stdf = pd.read_csv("averagestatetemperatures.csv")[["state", "average_temp"]]

# Rename state to State
stdf = stdf.rename(columns={"state": "State"})

# Convert full state names to abbreviations; names missing from the lookup become NaN
stdf['State'] = stdf['State'].map(us_state_abbrev)

# Rows whose name could not be mapped can never join to the housing data,
# so drop them instead of carrying a NaN key forward.
stdf = stdf.dropna(subset=['State'])

# Average the temperature over rows that share a state.
# sort=False keeps states in order of first appearance.
stdf_avg_temp = stdf.groupby('State', as_index=False, sort=False)['average_temp'].mean()

# Use the averaged table as the one-row-per-state frame. Previously the
# average was computed but discarded and only each state's first row was kept.
# The result has a clean 0..n-1 index.
stdf = stdf_avg_temp.copy()

# Tercile cut points used to split states into Cold / Medium / Hot
quantiles = stdf["average_temp"].quantile([0.33, 0.66])

# Function to classify temperature based on quantiles
def classify_temp(temp, thresholds=None):
    """Label a temperature as "Hot", "Medium" or "Cold".

    Parameters
    ----------
    temp : float
        The temperature to classify.
    thresholds : mapping or Series, optional
        Object indexable by ``0.33`` and ``0.66`` giving the lower and upper
        cut points. Defaults to the notebook-level ``quantiles`` computed
        just before this function is defined.

    Returns
    -------
    str or None
        "Hot" if ``temp`` is at or above the 0.66 cut point, "Medium" if it is
        at or above the 0.33 cut point, otherwise "Cold". A missing
        temperature returns None instead of a label.
    """
    if thresholds is None:
        thresholds = quantiles

    # NaN compares False against both cut points, so without this guard a
    # missing temperature would fall through and be labelled "Cold".
    if pd.isna(temp):
        return None

    if temp >= thresholds[0.66]:
        return "Hot"
    elif temp >= thresholds[0.33]:
        return "Medium"
    else:
        return "Cold"

# Apply classification
stdf["Temperature"] = stdf["average_temp"].apply(classify_temp)

# Remove avg temp; stdf is now just (State, Temperature)
stdf = stdf.drop(columns=['average_temp'])

# Manually input values for states which were not in the state dataset.
# Appended with concat rather than `stdf.loc[len(stdf)] = ...`: after rows
# have been dropped the index is no longer 0..n-1, so the label len(stdf)
# can already exist and the assignment would overwrite that state's row.
# NOTE(review): 'di' matches the 'DI' code that us_state_abbrev assigns to
# District of Columbia; rows keyed 'dc' would not be covered by it.
manual_states = pd.DataFrame(
    [
        ['ak', 'Cold'],
        ['pr', 'Hot'],
        ['di', 'Medium'],
        ['vi', 'Hot'],
        ['nb', 'Cold'],
        ['hi', 'Hot'],
        ['gu', 'Hot'],
    ],
    columns=["State", "Temperature"],
)
stdf = pd.concat([stdf, manual_states], ignore_index=True)

# Ensure data in State column is all in the same format
stdf["State"] = stdf["State"].str.strip().str.lower()
merged_df["State"] = merged_df["State"].str.strip().str.lower()

# One row per state: drop missing keys, and if a manual entry repeats a state
# that the dataset already had, keep the dataset's row (it comes first).
# Duplicate keys would multiply housing rows in the merge below.
stdf = stdf.dropna(subset=["State"]).drop_duplicates(subset="State", keep="first")

# Merge state temps into merged_df, keeping BOTH temperature columns: the
# city-level value ("_old") is only replaced where it is NaN.
merged_df = merged_df.merge(
    stdf, on='State', how='left', suffixes=('_old', '_new'), validate="many_to_one"
)

# Fill NaN values in the city-level column with the state-level fallback
merged_df['Temperature_old'] = merged_df['Temperature_old'].fillna(merged_df['Temperature_new'])

# Drop the helper column and restore the original column name (and position)
merged_df = merged_df.drop(columns=['Temperature_new'])

merged_df = merged_df.rename(columns={"Temperature_old": "Temperature"})

merged_df

Unnamed: 0,State,City,Bedroom,Bathroom,Area,LotArea,MarketEstimate,RentEstimate,Price,Temperature
0,al,"saraland,al",4.0,2.0,1614.0,0.38050,240600.0,1599.0,239900.0,Hot
1,al,"southside,al",3.0,2.0,1474.0,0.67034,186700.0,1381.0,1.0,Hot
2,al,"robertsdale,al",3.0,2.0,1800.0,3.20000,,,259900.0,Hot
3,al,"gulf shores,al",2.0,2.0,1250.0,,,,342500.0,Hot
4,al,"chelsea,al",3.0,3.0,2224.0,0.26000,336200.0,1932.0,335000.0,Hot
...,...,...,...,...,...,...,...,...,...,...
2249097,wa,"richland,wa",4.0,2.0,3600.0,0.33000,,,359900.0,Cold
2249098,wa,"richland,wa",3.0,2.0,1616.0,0.10000,,,350000.0,Cold
2249099,wa,"richland,wa",6.0,3.0,3200.0,0.50000,,,440000.0,Cold
2249100,wa,"richland,wa",2.0,1.0,933.0,0.09000,,,179900.0,Cold


# THIRD SECTION: DOING THE SAME THING WITH QOL RATING

In [None]:
# Quality-of-life metrics that get attached to every housing row
qol_metric_cols = [
    "2016 Crime Rate", "Unemployment", "AQI%Good", "WaterQualityVPV", "%CvgCityPark",
    "Cost of Living", "2022 Median Income", "AVG C2I", "Diversity Rank (Race)", "Diversity Rank (Gender)",
]

# Load the QoL table and keep the two key columns plus the metrics
qoldf = pd.read_csv("backend/data/qolcitydata.csv", encoding="ISO-8859-1")
qoldf = qoldf[["LCITY", "LSTATE"] + qol_metric_cols]

# Build the same "city,st" lower-case key that merged_df uses
qol_city = qoldf["LCITY"].str.strip().str.lower()
qol_state = qoldf["LSTATE"].str.strip().str.lower()
qoldf = qoldf.assign(LCITY=qol_city + "," + qol_state, LSTATE=qol_state)

# Discard rows with a missing key part (a NaN city or state makes the joined
# key NaN too), keep the first row per city, and name the key "City"
qoldf = (
    qoldf.dropna(subset=['LCITY', 'LSTATE'])
    .drop_duplicates(subset="LCITY", keep="first")
    .rename(columns={"LCITY": "City"})
)

# Left merge so housing rows without QoL data are kept (metrics become NaN)
merged_df = merged_df.merge(qoldf[["City"] + qol_metric_cols], on="City", how="left")

merged_df

  qoldf = pd.read_csv("qolcitydata.csv", encoding="ISO-8859-1")


Unnamed: 0,State,City,Bedroom,Bathroom,Area,LotArea,MarketEstimate,RentEstimate,Price,Temperature,2016 Crime Rate,Unemployment,AQI%Good,WaterQualityVPV,%CvgCityPark,Cost of Living,2022 Median Income,AVG C2I,Diversity Rank (Race),Diversity Rank (Gender)
0,al,"saraland,al",4.0,2.0,1614.0,0.38050,240600.0,1599.0,239900.0,Hot,47/1000,3.35%,80.94%,1.0,-1,"$71,947.38","$62,409.46",115.28%,26459.0,63210.0
1,al,"southside,al",3.0,2.0,1474.0,0.67034,186700.0,1381.0,1.0,Hot,43/1000,3.12%,80.94%,0.0,-1,"$67,812.73","$58,943.92",115.05%,69642.0,79134.0
2,al,"robertsdale,al",3.0,2.0,1800.0,3.20000,,,259900.0,Hot,18/1000,2.41%,80.94%,1.0,-1,"$79,155.41","$77,884.76",101.63%,29479.0,36363.0
3,al,"gulf shores,al",2.0,2.0,1250.0,,,,342500.0,Hot,18/1000,2.41%,80.94%,1.0,-1,"$79,155.41","$77,884.76",101.63%,56013.0,31948.0
4,al,"chelsea,al",3.0,3.0,2224.0,0.26000,336200.0,1932.0,335000.0,Hot,16/1000,1.87%,80.94%,-1.0,-1,"$85,691.03","$98,419.23",87.07%,44179.0,41526.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249097,wa,"richland,wa",4.0,2.0,3600.0,0.33000,,,359900.0,Cold,23/1000,5.31%,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0
2249098,wa,"richland,wa",3.0,2.0,1616.0,0.10000,,,350000.0,Cold,23/1000,5.31%,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0
2249099,wa,"richland,wa",6.0,3.0,3200.0,0.50000,,,440000.0,Cold,23/1000,5.31%,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0
2249100,wa,"richland,wa",2.0,1.0,933.0,0.09000,,,179900.0,Cold,23/1000,5.31%,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0


# FOURTH SECTION: FINALLY, ADD IN MEAN INCOME FOR EACH CITY

In [69]:
# Load the income table, keep the four columns we need and give them the
# names used in merged_df.
# NOTE(review): the other data files are read from "backend/data/"; confirm
# this bare relative path still resolves from the notebook's working directory.
idf = (
    pd.read_csv("meancityincome.csv", encoding="ISO-8859-1")[["State_ab", "City", "Mean", "Median"]]
    .rename(columns={"State_ab": "State", "Mean": "MeanIncome", "Median": "MedianIncome"})
)

# Build the same "city,st" lower-case key that merged_df uses
income_city = idf["City"].str.strip().str.lower()
income_state = idf["State"].str.strip().str.lower()
idf = idf.assign(City=income_city + "," + income_state, State=income_state)

# Collapse repeated cities into one row holding the rounded average of their
# mean/median figures. The grouped result already has one row per city, so no
# separate de-duplication step is needed.
idf = idf.groupby('City')[['MeanIncome', 'MedianIncome']].agg("mean").round().reset_index()

# Left merge so housing rows without income data are kept (values become NaN)
merged_df = merged_df.merge(idf[["City", "MeanIncome", "MedianIncome"]], on="City", how="left")

# Report how many rows still have no cost-of-living value
print(merged_df['Cost of Living'].isna().sum())

merged_df

306795


Unnamed: 0,State,City,Bedroom,Bathroom,Area,LotArea,MarketEstimate,RentEstimate,Price,Temperature,...,AQI%Good,WaterQualityVPV,%CvgCityPark,Cost of Living,2022 Median Income,AVG C2I,Diversity Rank (Race),Diversity Rank (Gender),MeanIncome,MedianIncome
0,al,"saraland,al",4.0,2.0,1614.0,0.38050,240600.0,1599.0,239900.0,Hot,...,80.94%,1.0,-1,"$71,947.38","$62,409.46",115.28%,26459.0,63210.0,43803.0,300000.0
1,al,"southside,al",3.0,2.0,1474.0,0.67034,186700.0,1381.0,1.0,Hot,...,80.94%,0.0,-1,"$67,812.73","$58,943.92",115.05%,69642.0,79134.0,,
2,al,"robertsdale,al",3.0,2.0,1800.0,3.20000,,,259900.0,Hot,...,80.94%,1.0,-1,"$79,155.41","$77,884.76",101.63%,29479.0,36363.0,67084.0,172640.0
3,al,"gulf shores,al",2.0,2.0,1250.0,,,,342500.0,Hot,...,80.94%,1.0,-1,"$79,155.41","$77,884.76",101.63%,56013.0,31948.0,65583.0,300000.0
4,al,"chelsea,al",3.0,3.0,2224.0,0.26000,336200.0,1932.0,335000.0,Hot,...,80.94%,-1.0,-1,"$85,691.03","$98,419.23",87.07%,44179.0,41526.0,78399.0,71839.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249097,wa,"richland,wa",4.0,2.0,3600.0,0.33000,,,359900.0,Cold,...,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0,115261.0,235998.0
2249098,wa,"richland,wa",3.0,2.0,1616.0,0.10000,,,350000.0,Cold,...,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0,115261.0,235998.0
2249099,wa,"richland,wa",6.0,3.0,3200.0,0.50000,,,440000.0,Cold,...,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0,115261.0,235998.0
2249100,wa,"richland,wa",2.0,1.0,933.0,0.09000,,,179900.0,Cold,...,92.89%,0.0,-1,"$72,571.13","$83,393.68",87.02%,35964.0,87680.0,115261.0,235998.0


# SECTION 5: UPLOADING TO SQL

In [None]:
from sqlalchemy import create_engine
from dotenv import load_dotenv
from supabase import create_client, Client
import os

# Load environment variables from the local .env file
load_dotenv()

# Retrieve database credentials from the environment
DB_URL = os.getenv("DATABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
SUPABASE_URL = os.getenv("SUPABASE_URL")
DB_PASS = os.getenv("DATABASE_PASS")  # read for completeness; not used in this cell

# Fail early with a readable message if a required setting is absent,
# rather than letting the client constructors fail on a None value.
required_settings = {
    "DATABASE_URL": DB_URL,
    "SUPABASE_URL": SUPABASE_URL,
    "SUPABASE_KEY": SUPABASE_KEY,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_settings)
    )

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Fetch data from a table (replace 'your_table_name' with your actual table)
# response = supabase.table("Housing Data").select("*").execute()

# Convert to pandas DataFrame
# df = pd.DataFrame(response.data)

# print(df.head())

# Create SQLAlchemy engine
engine = create_engine(DB_URL, pool_size=5, max_overflow=10)

# Connectivity check: open and immediately close one connection
try:
    with engine.connect() as conn:
        print("Connected to the database successfully!")
except Exception as e:
    print(f"Error: {e}")

# Upload DataFrame to SQL table, replacing any existing table of that name.
# chunksize bounds how many rows are sent per batch so the full frame is not
# materialised as a single insert.
try:
    merged_df.to_sql('Housing Data', engine, if_exists='replace', index=False, chunksize=10_000)
    print("Data uploaded successfully!")
except Exception as e:
    print(f"Error uploading data: {e}")
else:
    # Only reported when to_sql finished without raising; previously this
    # line ran unconditionally and claimed success even after a failure.
    print("DataFrame successfully uploaded to the SQL database.")

Connected to the database successfully!
