In [178]:
# UPDATE TO MATCH PERSONAL DIRECTORY STRUCTURE
DIR = 'C:\\Users\\jesse\\Documents\\GitHub\\vanity_sizing_project\\data\\'

In [179]:
import pandas as pd
import pickle

In [180]:
with open(DIR+'rtr_reviews_uncleaned.pkl', 'rb') as f:
    data = pickle.load(f)

customers = data[['user_id', 'weight', 'body type', 'height', 'size', 'age','band_size', 'cup_size']]

customers = customers.drop_duplicates(subset='user_id')

non_numerical_cols = customers.select_dtypes(exclude=['int64', 'float64']).columns

for col in non_numerical_cols:
    customers[col] = pd.factorize(customers[col])[0]

customers_complete = customers.dropna()

In [181]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

def predict_fill_regress(complete_df, col):
    X = complete_df.drop(columns=[col, 'user_id'])
    y = complete_df[col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model1 = RandomForestRegressor()
    model1.fit(X_train, y_train)
    y_pred1 = model1.predict(X_test)
    rmse1 = sqrt(mean_squared_error(y_test, y_pred1))
    
    model2 = LinearRegression()
    model2.fit(X_train, y_train)
    y_pred2 = model2.predict(X_test)
    rmse2 = sqrt(mean_squared_error(y_test, y_pred2))

    model3 = GradientBoostingRegressor()
    model3.fit(X_train, y_train)
    y_pred3 = model3.predict(X_test)
    rmse3 = sqrt(mean_squared_error(y_test, y_pred3))

    #return the model with the lowest RMSE
    if rmse1 < rmse2 and rmse1 < rmse3:
        string = "Random Forest Regressor:" + str(rmse1)
        return model1, string
    elif rmse2 < rmse1 and rmse2 < rmse3:
        string = ("Linear Regression:" + str(rmse2))
        return model2, string
    else:
        string =("Gradient Boosting Regressor:" + str(rmse3))
        return model3,string

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def predict_fill_classify(complete_df, col):
    X = complete_df.drop(columns=[col, 'user_id'])
    y = complete_df[col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model1 = RandomForestClassifier()
    model1.fit(X_train, y_train)
    y_pred1 = model1.predict(X_test)
    acc1 = accuracy_score(y_test, y_pred1)
    
    model2 = LogisticRegression(max_iter=1000)
    model2.fit(X_train, y_train)
    y_pred2 = model2.predict(X_test)
    acc2 = accuracy_score(y_test, y_pred2)

    model3 = GradientBoostingClassifier()
    model3.fit(X_train, y_train)
    y_pred3 = model3.predict(X_test)
    acc3 = accuracy_score(y_test, y_pred3)

    # return the model with the highest accuracy
    if acc1 > acc2 and acc1 > acc3:
        print("Random Forest Classifier Accuracy: " + str(acc1))
        return model1
    elif acc2 > acc1 and acc2 > acc3:
        print("Logistic Regression Accuracy: " + str(acc2))
        return model2
    else:
        print("Gradient Boosting Classifier Accuracy: " + str(acc3))
        return model3

In [182]:
def fill_nulls(df, col, model):
    nulls = df[df[col].isnull()]
    X = nulls.drop(columns=[col, 'user_id'])
    y = model.predict(X)
    nulls[col] = y
    df = pd.concat([df.dropna(), nulls])
    return df

In [183]:
customers_filled = customers_complete.copy()

regress = ['height','weight','age']
classify = ['body type','band_size','cup_size']

null_fill_table = pd.DataFrame(columns=['column','nulls','model','accuracy'])

for col in regress:
    model, string = predict_fill_regress(customers_complete, col)
    nulls = customers[col].isnull().sum()
    customers = fill_nulls(customers, col, model)
    null_fill_table = null_fill_table.append({'column':col,'nulls':nulls,'model':string}, ignore_index=True)

for col in classify:
    nulls = customers[col].isnull().sum()
    customers = customers.dropna(subset=[col])
    null_fill_table = null_fill_table.append({'column':col,'nulls':nulls,'model':'dropped'}, ignore_index=True)

null_fill_table

ValueError: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [158]:
predict_fill_regress(customers_complete,'height')

Gradient Boosting Regressor:2.093130140943341


In [160]:
predict_fill_regress(customers_complete,'weight')

Random Forest Regressor:11.155223879575582


In [162]:
predict_fill_regress(customers_complete,'age')

Gradient Boosting Regressor:7.763285031253687


In [163]:
predict_fill_classify(customers_complete,'body type')

0.36092696143425373


In [164]:
predict_fill_classify(customers_complete,'band_size')

0.588170865279299


In [165]:
predict_fill_classify(customers_complete,'cup_size')

0.39413155012394074
