In [None]:
##imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

Functions

In [None]:
def lower_snake(df):
    """Rename the columns of ``df`` in place.

    Every column label has its spaces replaced by underscores and is then
    lower-cased. Nothing is returned; the frame that was passed in is modified.
    """
    df.columns = [label.replace(' ', '_').lower() for label in df.columns]

def num_nulls(df, col):
    """Return every entry of ``df[col]`` passed through ``str()``, as a list in row order.

    Missing values are not dropped; they are stringified like any other entry.
    """
    return [str(entry) for entry in df[col]]

def val_cou(df, expected_rows: int = 1460):
    """Print the non-null count and name of every column with missing values.

    A column is reported when it holds fewer than ``expected_rows`` non-null
    entries. One line is printed per reported column: ``<non-null count> <column>``.

    Args:
        df: frame to inspect.
        expected_rows: number of non-null entries a complete column should
            have. Defaults to 1460, the value previously hard-coded here;
            pass ``len(df)`` when the frame has a different number of rows.

    Returns:
        None. The report is printed, so call this without wrapping it in
        ``print()`` (that would additionally print ``None``).
    """
    for col in df.columns:
        # count() is the number of non-null entries; compute it once per column.
        non_null = df[col].count()
        if non_null < expected_rows:
            print(non_null, col)
        
def response_feature_splitter(df, position: int):
    """Split ``df`` into a features frame and a one-column response frame.

    Args:
        df: source frame; it is left unmodified.
        position: integer position of the response column within ``df.columns``.

    Returns:
        ``(features, response)`` where ``features`` is a new frame without the
        response column and ``response`` is a new one-column frame holding it.
    """
    target = df.columns[position]
    response = df[[target]].copy()
    features = df.drop(columns=[target])
    return features, response

In [None]:
##load the data
## (the path is relative, so train.csv must sit in the notebook's working directory)
filename = 'train.csv'
houses = pd.read_csv(filename)

##convert columns to snake case
## lower_snake renames the columns of each frame in place: spaces become
## underscores and the labels are lower-cased. The list makes it easy to add more frames.
ff = [houses]
for df in ff:
    lower_snake(df)

Inspection

In [None]:
##inspect: how many columns the loaded frame has
houses.shape[1]

In [None]:
## look for fully duplicated rows, ignoring the 'id' column
houses2 = houses.drop(columns=['id'])
houses2.duplicated().value_counts()
#no complete duplicates in the dataset

In [None]:
#collect the column names into a plain list for easy use
columns = list(houses.columns)
print(columns)

Begin dealing with nulls

In [None]:
## check for rows with too many null values
## dropna(thresh=70) keeps only the rows that have at least 70 non-null values
houses2 = houses.copy()
houses3 = houses2.dropna(thresh=70)
nullFrame = houses2.loc[~houses2['id'].isin(houses3['id'])]  #ids in houses2 that are not in houses3: the rows that have too many null values

#not sure what this is for, but I feel like it might have taken me some work... so I am going to keep it
#nullFrame.drop(columns=['mssubclass','mszoning','lotfrontage','lotarea','street','lotshape','landcontour','utilities','poolarea','miscval','mosold','yrsold','saletype','salecondition','saleprice','lotconfig','landslope','neighborhood','condition1','condition2','bldgtype','housestyle','overallqual','paveddrive','wooddecksf','openporchsf','enclosedporch','3ssnporch', 'screenporch','overallcond', 'yearbuilt', 'yearremodadd', 'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype','garagecars', 'garagearea', 'fireplaces','masvnrarea', 'exterqual', 'extercond', 'foundation','functional','bsmtfinsf1','bsmtfinsf2','bsmtunfsf','totalbsmtsf', 'heating', 'heatingqc', 'centralair', 'electrical', '1stflrsf','2ndflrsf', 'lowqualfinsf', 'grlivarea','bsmtfullbath', 'bsmthalfbath', 'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr'])

## dropping the rows with fewer than 70 non-null values
temp = houses.copy()
houses4 = temp.loc[~temp['id'].isin(nullFrame['id'])]
## drop=True renumbers the rows 0..n-1 without keeping the old index as an
## extra 'index' column (which would otherwise show up as a numeric feature)
houses4 = houses4.reset_index(drop=True)

In [None]:
#subset into numerical and categorical data
house_cats = houses4.select_dtypes(include=['object'])
house_nums = houses4.select_dtypes(include=np.number)

## columns next to their total non-null values
## val_cou prints its own report and returns None, so it is not wrapped in print()
val_cou(house_cats)

In [None]:
## val_cou prints its own report and returns None, so it is not wrapped in print()
val_cou(house_nums)

In [None]:
def neighborly_suggestions(features_all, y_train, y_test_loc, X_train, X_test, response_col):
    """Predict the missing entries of one column with a k-nearest-neighbours classifier.

    The features are one-hot encoded, the known responses are label-encoded,
    a default ``KNeighborsClassifier`` is fitted on the known rows and its
    predictions for the unknown rows are decoded back to the original labels.

    Args:
        features_all: frame the one-hot encoder is fitted on (presumably the
            full feature frame that ``X_train`` and ``X_test`` are row subsets
            of, so that every category is known to the encoder).
        y_train: known values of the response column (one column / 1-D).
        y_test_loc: frame whose index labels the rows to predict; only its
            index is used.
        X_train: feature rows that go with ``y_train``.
        X_test: feature rows that go with ``y_test_loc``.
        response_col: name given to the column of predictions.

    Returns:
        One-column DataFrame named ``response_col`` holding the predicted
        labels, indexed like ``y_test_loc`` and sorted by index.
    """
    lab_enc = LabelEncoder()
    # LabelEncoder and the classifier expect a 1-D target; handing them a
    # one-column frame only works via an implicit conversion with a warning.
    response_LE = lab_enc.fit_transform(np.asarray(y_train).ravel())

    # `sparse` was renamed to `sparse_output` in newer scikit-learn releases;
    # fall back to the old keyword when the new one is not accepted.
    try:
        uno_caliente = OneHotEncoder(sparse_output=False)
    except TypeError:
        uno_caliente = OneHotEncoder(sparse=False)
    uno_caliente.fit(features_all)

    # `get_feature_names` was replaced by `get_feature_names_out`.
    if hasattr(uno_caliente, 'get_feature_names_out'):
        encoded_cols = uno_caliente.get_feature_names_out()
    else:
        encoded_cols = uno_caliente.get_feature_names()

    features_NoNAs_trans_df = pd.DataFrame(uno_caliente.transform(X_train), columns=encoded_cols)
    features_test_trans_df = pd.DataFrame(uno_caliente.transform(X_test), columns=encoded_cols)

    KNN_clf = KNeighborsClassifier()
    KNN_clf.fit(features_NoNAs_trans_df, response_LE)

    response_trans_pred = KNN_clf.predict(features_test_trans_df)
    response_pred = lab_enc.inverse_transform(response_trans_pred)

    missing_values = pd.DataFrame(response_pred, columns=[response_col])
    missing_values.index = y_test_loc.index
    # sort_index() returns a new frame; the result has to be used to take effect.
    return missing_values.sort_index()

def NaN_fixer(main_df, y_train, missing_values, response_col):
    """Write a completed response column back into ``main_df``.

    Args:
        main_df: frame to update; it is modified in place and also returned.
        y_train: frame with the known values of ``response_col``.
        missing_values: frame with the filled-in values of ``response_col``
            for the remaining rows.
        response_col: name of the column to (over)write in ``main_df``.

    Returns:
        ``main_df`` itself, with ``response_col`` taken from the stacked
        ``y_train`` and ``missing_values`` rows.
    """
    response_new = pd.concat([y_train, missing_values], axis=0)
    # Column assignment aligns on index labels, so the row order of
    # response_new does not matter and no sort is needed.
    main_df[response_col] = response_new[response_col]
    return main_df

def null_breaker_1o2(df):
    """Partition the columns of ``df`` by whether they contain any null value.

    Columns with at least one null become the responses whose values are to be
    predicted; the null-free columns become the features.

    Returns:
        ``(features_all, response_all)``: new frames holding the null-free
        columns and the null-containing columns, each in original column order.
    """
    has_null = [df[name].isna().any() for name in df.columns]
    resp_col = [name for name, flagged in zip(df.columns, has_null) if flagged]
    feats_cols = [name for name, flagged in zip(df.columns, has_null) if not flagged]

    features_all = df[feats_cols].copy()
    response_all = df[resp_col].copy()
    return features_all, response_all

def null_breaker_2o2(features_all, response_all, df):
    """Fill the nulls of every column in ``response_all`` and attach the results to ``df``.

    For each response column the rows with a known value are used to train
    ``neighborly_suggestions`` and the rows with a null are predicted.

    Args:
        features_all: null-free feature columns, sharing its index with
            ``response_all``.
        response_all: columns that contain nulls.
        df: frame the completed columns are concatenated onto (column-wise,
            aligned on index).

    Returns:
        ``df`` with one completed column appended per column of
        ``response_all``; ``df`` itself when there is nothing to fill.
    """
    pieces = [df]
    for col in response_all.columns:
        is_null = response_all[col].isna()
        y_test_loc = response_all.loc[is_null, :]      # rows whose response is null
        y_train = response_all[[col]].dropna()         # responses with nulls removed
        # Select by index label: .iloc would treat the labels as positions,
        # which is only right when the index happens to be 0..n-1.
        X_train = features_all.loc[y_train.index, :]
        X_test = features_all.loc[y_test_loc.index, :]

        missing_values = neighborly_suggestions(features_all, y_train, y_test_loc, X_train, X_test, col)
        # sort_index() returns a new frame, so its result must be kept.
        response_new = pd.concat([y_train, missing_values], axis=0).sort_index()
        pieces.append(response_new)

    if len(pieces) == 1:
        return df
    # One column-wise concat instead of growing the frame inside the loop.
    return pd.concat(pieces, axis=1)





Filling Category Nulls

In [None]:
# Split the categorical columns into null-free ones (features_all) and ones
# that contain nulls (response_all).
features_all, response_all = null_breaker_1o2(house_cats)
# Predict the nulls of each response column; the completed columns are
# concatenated column-wise onto the 'id' column of houses4.
responses_full = null_breaker_2o2(features_all, response_all, houses4[['id']])

In [None]:
# Put the completed columns (which also carry 'id') back next to the columns
# that never had nulls.
house_cats_noNaN = pd.concat([responses_full, features_all], axis=1)

In [None]:
# Remaining null count per column (expected to be all zeros if every
# null-containing column was filled - verify against the output).
house_cats_noNaN.isna().sum()

In [None]:
## num_nulls is already defined in the Functions section; redefining it here
## would only shadow that definition with an identical copy.
## Shows every 'lotfrontage' entry as a string (missing values included).
num_nulls(house_nums, 'lotfrontage')

In [None]:
## response_feature_splitter needs the position of the response column; the
## call without it raised a TypeError.
## NOTE(review): -1 assumes the response (presumably 'saleprice') is the last
## numeric column - TODO confirm.
house_num_features, house_num_response = response_feature_splitter(house_nums, position=-1)