In [146]:
import math
import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [141]:
import pymongo
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import certifi

def get_mongo_dataframe(uri=None):
    """
    Load the building records from the ``ll84`` collection of the
    ``beepm_data`` MongoDB database into a pandas DataFrame.

    Parameters
    ----------
    uri : str, optional
        MongoDB connection string. When omitted, the ``MONGODB_URI``
        environment variable is used; if that is unset too, the legacy
        connection string embedded below is used so existing callers keep
        working.

    Returns
    -------
    pandas.DataFrame
        One row per document, restricted to the projected fields, with the
        MongoDB ``_id`` column removed.

    Raises
    ------
    Exception
        Any connection / query error raised by pymongo is propagated to the
        caller instead of being printed and ignored.
    """
    import os

    if uri is None:
        # NOTE(security): credentials must not live in source control. Set
        # MONGODB_URI (and rotate this password); the literal is kept only as
        # a backward-compatible fallback.
        uri = os.environ.get(
            "MONGODB_URI",
            "mongodb+srv://beepmrw:Beepm@beepm1.21uirez.mongodb.net/test?retryWrites=true&w=majority&ssl=true",
        )

    client = pymongo.MongoClient(uri, tlsCAFile=certifi.where())
    try:
        # Fail fast: without a working connection nothing below can succeed.
        client.admin.command('ping')
        print("Pinged your deployment. You successfully connected to MongoDB!")

        m_data = client.beepm_data["ll84"]
        # Projection as a list (the documented form) of the fields to fetch.
        columns = ['Primary Property Type - Self Selected', 'Occupancy', 'Number of Buildings',
                   'Self-Reported Gross Floor Area (ft²)', 'Total GHG Emissions (Metric Tons CO2e)',
                   'Electricity Use']
        odf = pd.DataFrame(list(m_data.find({}, columns)))
    finally:
        # Always release the connection pool, even when the query fails.
        client.close()

    # errors='ignore' keeps an empty result set from raising on a missing '_id'.
    return odf.drop(columns='_id', errors='ignore')

In [115]:
def get_building_types(dataframe, minimum):
    """
    Count how often each building type occurs and keep the frequent ones.

    The building type is read from the column at position 1 of
    ``dataframe`` (the second column).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose second column holds the building type of each row.
    minimum : int
        Threshold; only types occurring strictly MORE than ``minimum`` times
        are returned.

    Returns
    -------
    dict
        Maps building type -> number of rows, in order of first appearance.
    """
    counts = {}
    # Select the column positionally once; indexing each row Series with an
    # integer key (``row[1]``) is deprecated positional access in pandas.
    for building_type in dataframe.iloc[:, 1]:
        counts[building_type] = counts.get(building_type, 0) + 1
    return {building_type: count for building_type, count in counts.items() if count > minimum}

In [139]:
def clean_dataframe(dataframe, column):
    """
    Clean the data and fit a log-linear (exponential) regression for ``column``.

    Rows containing any missing value are removed, then rows whose target
    value is not strictly positive are removed. A ``LinearRegression`` is
    fitted on the remaining feature columns against the natural log of the
    target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features plus the target column. The caller's frame is not modified.
    column : str
        Name of the column to be predicted.

    Returns
    -------
    tuple
        ``(features, regression, y)``: the cleaned frame without the target
        column, the fitted regression (which predicts log(target)), and the
        cleaned target values as an array of shape ``(n, 1)``.
    """
    # getting rid of all rows containing np.nan - useless for training
    cleaned = dataframe.dropna()

    # The target is log-transformed below and used as a divisor by callers,
    # so it must be strictly positive.
    cleaned = cleaned[cleaned[column] > 0]

    # separate X (features) and Y (target); address the target by the name the
    # caller passed rather than by whatever column happens to come last
    y = np.array(cleaned[column]).reshape(-1, 1)
    features = cleaned.drop(columns=column)
    x = np.array(features)

    # linear regression on log(target); the target is passed 1-D so that
    # predict() returns one scalar per sample
    regression = LinearRegression().fit(x, np.log(y.astype(float)).ravel())
    return features, regression, y

In [164]:
def predict_data(dataframe, regression, percent_outlier, y):
    """
    Predict the target for every row of ``dataframe`` and pair it with the
    observed value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature rows, in the column layout the regression was fitted on.
    regression : object
        Fitted model exposing ``predict``; it is expected to return the
        natural log of the target, which is exponentiated here.
    percent_outlier : float
        Retained for backward compatibility. It previously only fed
        error/outlier tallies that were never returned or displayed, so it
        has no effect on the result.
    y : array-like
        Observed target values, indexable as ``y[i][0]`` (e.g. shape ``(n, 1)``).

    Returns
    -------
    tuple
        ``(dataframe, expected, actual)`` where ``expected`` holds the
        predicted values and ``actual`` the observed ones, both as lists in
        row order.
    """
    row_count = len(dataframe)
    if row_count == 0:
        # Nothing to predict; avoid handing an empty matrix to the model.
        return dataframe, [], []

    # One batched call instead of one predict() call per row.
    log_predictions = regression.predict(np.array(dataframe))
    expected = [math.e ** log_prediction for log_prediction in log_predictions]
    actual = [y[index][0] for index in range(row_count)]

    return dataframe, expected, actual

In [41]:
def display_graphs(dataframe, expected, actual):
    """
    Show three scatter plots comparing expected and actual values.

    One figure is drawn per feature column (occupancy, number of buildings,
    gross floor area). In each, ``expected`` is plotted with square markers
    and ``actual`` with round markers against that feature, using fixed
    axis limits.
    """
    # (feature column, upper x limit, upper y limit) for each figure, in order
    panels = (
        ('Occupancy', 100, 500000),
        ('Number of Buildings', 100, 150000),
        ('Self-Reported Gross Floor Area (ft²)', 4000000, 100000),
    )
    series = (
        (expected, 'expected', 's'),
        (actual, 'actual', 'o'),
    )

    for feature, x_upper, y_upper in panels:
        for values, legend_label, marker_shape in series:
            plt.scatter(dataframe[feature], values, label=legend_label, marker=marker_shape)
        plt.xlim(0, x_upper)
        plt.ylim(0, y_upper)
        plt.legend(loc='upper left')
        plt.show()

In [207]:
def find_corresponding_letter(val):
    """
    Translate a numeric score into a letter grade.

    Each band is half-open: a score earns the first letter whose upper bound
    it is strictly below. Scores below 1 are 'F'; scores of 4.33 or more are
    'A+'. Note that the scale goes from 'B+' straight to 'A' (there is no
    'A-' band).
    """
    # (exclusive upper bound, letter), ordered from lowest to highest band
    bands = (
        (1, 'F'),
        (1.33, 'D-'),
        (1.66, 'D'),
        (2, 'D+'),
        (2.33, 'C-'),
        (2.66, 'C'),
        (3, 'C+'),
        (3.33, 'B-'),
        (3.66, 'B'),
        (4, 'B+'),
        (4.33, 'A'),
    )
    for upper_bound, letter in bands:
        if val < upper_bound:
            return letter
    # Anything not below the last bound falls into the top band.
    return 'A+'
    

def find_letter_grade(pred_score, avg_score, deviation):
    """
    Grade a predicted score by how many deviations it lies from the average.

    The numeric grade starts at 2. ``pred_score`` is moved toward
    ``avg_score`` one ``deviation`` at a time until it reaches or crosses it.
    Each step adds a halving increment (1, 0.5, 0.25, ...) to a running
    count, and that running count is subtracted from the grade when the
    prediction is above the average, or added to it when below (a lower
    score earns a better grade). The result is converted with
    ``find_corresponding_letter``.

    Parameters
    ----------
    pred_score : float
        Score being graded.
    avg_score : float
        Reference (average) score.
    deviation : float
        Step size; must be positive whenever the two scores differ.

    Returns
    -------
    str
        Letter grade.

    Raises
    ------
    ValueError
        If the scores differ and ``deviation`` is zero or negative, since the
        prediction could then never reach the average.
    """
    differs = pred_score > avg_score or pred_score < avg_score
    if differs and deviation <= 0:
        # Previously this case looped forever.
        raise ValueError('deviation must be positive when pred_score differs from avg_score')

    ret = 2
    count = 0
    increment = 1
    # NOTE(review): `count` is itself cumulative and is applied to `ret` on
    # every step, so the second step moves the grade by 1.5 rather than 0.5.
    # This is preserved as-is; confirm whether `ret -= increment` was intended.
    if pred_score > avg_score:
        while pred_score > avg_score:
            count += increment
            pred_score -= deviation
            ret -= count
            increment /= 2

    elif pred_score < avg_score:
        while pred_score < avg_score:
            count += increment
            pred_score += deviation
            ret += count
            increment /= 2

    return find_corresponding_letter(ret)


In [208]:
def make_prediction(building_type, occupation, num_buildings, area):
    """
    Print a letter grade for a hypothetical building, once per target variable.

    For each of total GHG emissions and electricity use, the records whose
    property type contains ``building_type`` are loaded, a log-linear
    regression is fitted on them, and the prediction for the given inputs is
    graded against the mean and standard deviation of the log of the
    observed values.

    Parameters
    ----------
    building_type : str
        Property type. It must be one of the types returned by
        ``get_building_types(..., 1000)``. Rows are then selected by literal
        substring match on the property-type column.
    occupation : number
        Value for the 'Occupancy' feature.
    num_buildings : number
        Value for the 'Number of Buildings' feature.
    area : number
        Value for the 'Self-Reported Gross Floor Area (ft²)' feature.

    Returns
    -------
    None
        The grades are printed, one line per target variable.

    Raises
    ------
    ValueError
        If ``building_type`` is not among the sufficiently frequent types.
    """
    type_column = 'Primary Property Type - Self Selected'
    variables = ['Total GHG Emissions (Metric Tons CO2e)', 'Electricity Use']
    # Inputs keyed by feature name so they are lined up with the training
    # columns by name rather than by an assumed column order.
    feature_values = {
        'Self-Reported Gross Floor Area (ft²)': area,
        'Number of Buildings': num_buildings,
        'Occupancy': occupation,
    }

    odf = get_mongo_dataframe()

    for v in variables:
        # Keep only the target being modelled in this pass.
        cols = [var for var in variables if var != v]
        copydf = odf.drop(cols, axis=1)

        types = get_building_types(copydf, 1000)
        if building_type not in types:
            # ValueError (not bare BaseException) so `except Exception` handlers see it.
            raise ValueError('Building type DNE')

        # regex=False: the type name is literal text, not a pattern.
        # na=False: rows without a usable type string do not match.
        matches = copydf[type_column].str.contains(building_type, regex=False, na=False)
        df = copydf[matches].drop(type_column, axis=1)
        df, regression, y = clean_dataframe(df, v)

        prediction = np.array([feature_values[name] for name in df.columns]).reshape(1, -1)
        # The regression already predicts log(target); grade on the log scale.
        log_prediction = regression.predict(prediction)[0]

        log_actual = [math.log(value) for value in np.asarray(y).ravel()]
        print(find_letter_grade(log_prediction, sum(log_actual) / len(log_actual),
                                statistics.stdev(log_actual)))


In [223]:
# Grade an 'Office' building with occupation=25, num_buildings=1, area=500000;
# one grade is printed per target variable (GHG emissions, then electricity use).
make_prediction('Office', 25, 1, 500000)

Pinged your deployment. You successfully connected to MongoDB!
B-
B-


In [224]:
# Same inputs for a 'Multifamily Housing' building (occupation=25,
# num_buildings=1, area=500000); one grade is printed per target variable.
make_prediction('Multifamily Housing', 25, 1, 500000)

Pinged your deployment. You successfully connected to MongoDB!
D-
F
