In [0]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers 


In [3]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

df.drop("id", inplace=True, axis=1)

# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

nonna = df[df["income"].notna()]
isna = df[df["income"].isnull()]
nx = nonna.drop(["income", "product"], axis=1).values
ny = nonna["income"].values
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size=0.2)
reg = LinearRegression().fit(nx_train, ny_train)
isna["income"] = reg.predict(isna.drop(["income", "product"], axis=1).values)
df = pd.concat([isna, nonna], axis=0)
reg.score(nx_test, ny_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.7881944431091034

In [0]:
x_columns = df.columns.drop('product')
x = df[x_columns].values
dummies = pd.get_dummies(df['product']) # Classification
products = dummies.columns
y = dummies.values

In [8]:
kf = KFold(5, shuffle=True, random_state=69)

oos_y = []
oos_pred = []
fold = 0


for train, test in kf.split(x):
  fold += 1
  print(f"Fold #{fold}")

  x_train = x[train]
  x_test = x[test]
  y_train = y[train]
  y_test = y[test]

  model = Sequential()
  model.add(Dense(50, input_dim=x.shape[1], activation="relu",
                  activity_regularizer=regularizers.l1(1e-4)))
  model.add(Dense(25, activation="relu",
                  activity_regularizer=regularizers.l1(1e-4)))
  model.add(Dense(y.shape[1], activation="softmax"))
  model.compile(loss="categorical_crossentropy", optimizer="adam")
  model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=0, epochs=500)

  pred = model.predict(x_test)
  pred = np.argmax(pred, axis=1)
  y_compare = np.argmax(y_test, axis=1)

  score = metrics.accuracy_score(y_compare, pred)
  print(f"Fold accuracy: {score}")

  oos_y.append(y_compare)
  oos_pred.append(pred)

oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = metrics.accuracy_score(oos_y, oos_pred)

print(f"Final accuracy: {score}")


Fold #1
Fold accuracy: 0.5925
Fold #2
Fold accuracy: 0.6975
Fold #3
Fold accuracy: 0.685
Fold #4
Fold accuracy: 0.655
Fold #5
Fold accuracy: 0.54
Final accuracy: 0.634
