In [220]:
# basic imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning imports
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# preprocessing imports
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [221]:
df = pd.read_csv("recycled.csv")

In [222]:
df.head(3)

Unnamed: 0,Row,B3_DATUM_B_LOC,B3_REF_OD,C1_LOC_INSIDE_PLN,C4_LOC_TOP_PLN,B3_THICK1_WALL,B3_THICK2_WALL,B3_THICK3_WALL,B3_THICK4_WALL,Layout,BuildDate,Powder,MeasureSeq,Nonconformity,RowID,ColID,PlateID
0,La1,0.4171,0.4476,0.0539,0.2659,0.0158,0.016,0.0149,0.0164,11X11TA,8/22/2019,Recycled,PreEDM,False,1,1,L
1,La2,0.4164,0.4478,0.0534,0.2662,0.0168,0.0167,0.0152,0.0141,11X11TA,8/22/2019,Recycled,PreEDM,False,1,2,L
2,La3,0.4173,0.4479,0.0541,0.2668,0.0151,0.0163,0.0155,0.0151,11X11TA,8/22/2019,Recycled,PreEDM,False,1,3,L


In [223]:
#Splitting original data into train/test
df_train, df_test = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
)

In [224]:
#Creating vaidation train/test sets
df_vtrain, df_vtest = train_test_split(
    df_train,
    test_size=0.20,
    random_state=42,
)

X_train = df_vtrain.drop("Nonconformity", axis=1)
y_train = df_vtrain["Nonconformity"]

X_vtest = df_vtest.drop("Nonconformity", axis=1)
y_vtest = df_vtest["Nonconformity"]


# seperating the final test set
X_test = df_test.drop("Nonconformity", axis=1)
y_test = df_test["Nonconformity"]

In [225]:
# preprocessing the data (scaling, imputing, encoding)
numeric_features = ["B3_DATUM_B_LOC", "B3_REF_OD", "C1_LOC_INSIDE_PLN", 'C4_LOC_TOP_PLN', 'B3_THICK1_WALL', 'B3_THICK2_WALL', 'B3_THICK3_WALL', 'B3_THICK4_WALL']
categorical_features = ["Layout","BuildDate","Powder","RowID", "PlateID", "ColID"]
features = numeric_features + categorical_features
target = "Nonconformity"


numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="drop",
)

In [226]:
mae_scores = []
param_grid = np.logspace(-4, 3, 20)

for k in param_grid:
    logistic_mod = make_pipeline(
        preprocessor,
        LogisticRegression(penalty='l1', C = k, solver='liblinear', random_state=42),
    )
    logistic_mod.fit(X_train, y_train)
    y_pred = logistic_mod.predict(X_vtest)
    mae_scores.append(accuracy_score(y_vtest, y_pred))
    
logistic = pd.DataFrame(
    {
        "alpha": param_grid,
        "accuracy": mae_scores,
    }
)

logistic.head(5)
#The accuracy wasn't really changing even when changing the penalty term (alpha)... means we need to create our own variables probably.

Unnamed: 0,alpha,accuracy
0,0.0001,0.984127
1,0.000234,0.984127
2,0.000546,0.984127
3,0.001274,0.984127
4,0.002976,0.984127


In [227]:
len(df[df['Nonconformity'] == False]) / (len(df[df['Nonconformity'] == False]) + len(df[df['Nonconformity'] == True]))

0.9872448979591837

In [228]:
#Testing a model
model = make_pipeline(
        preprocessor,
        LogisticRegression(penalty='l1', C = 1, solver='liblinear', random_state=42)
)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
linear_mae = accuracy_score(y_test, y_pred)
linear_mae



1.0