In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
# Load the raw data set (semicolon-separated CSV) and take a first look at it.
df = pd.read_csv("Task_furniture v2.csv", sep=";")
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" to the output.
df.info()
print(df.DwellingType.unique())
# `unique` is a method: without the call only the bound-method repr is printed.
print(df.Lifestage.unique())

# checking where the missing values are
df.isna().any()

In [None]:
# outliers - detecting by z-score
from scipy import stats

# Absolute z-scores per column.  nan_policy="omit" ignores missing values when
# computing mean/std; with the default policy a single NaN turns every z-score
# of the column into NaN and no row is ever flagged as an outlier.
age_abs_z = np.abs(stats.zscore(df["Age"], nan_policy="omit"))
salary_abs_z = np.abs(stats.zscore(df["Salary"], nan_policy="omit"))

print(df[age_abs_z > 3])
# A bare expression in the middle of a cell is not displayed, so the Salary
# outliers are printed explicitly.
print(df[salary_abs_z > 3])

# dropping outliers (only Age outliers are removed; Salary ones are just listed)
# NOTE(review): this rebinds `df`, so re-running the cell filters the already
# filtered frame again - re-run the loading cell first.
df = df[age_abs_z < 3]

In [None]:
# Default seaborn pair plot of every variable against every other one.
# The ID column is dropped before plotting.
sns.pairplot(data=df.drop(columns="ID"))

In [None]:
# Pair plot coloured by City, with kernel-density estimates on the diagonal
# and semi-transparent, black-edged markers in the scatter panels.
sns.pairplot(
    df.drop(columns=['ID']),
    hue='City',
    diag_kind='kde',
    plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
    # `size` was renamed to `height` in seaborn 0.9 and later removed;
    # passing `size` raises a TypeError on current releases.
    height=4,
)

In [None]:
# Number of missing entries per column.
df.isna().sum()

In [None]:
# encoding the categorical variables

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

def pre_modeling(df, classification = True):
    """Build the modelling frame for the classification or the regression task.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data.  Must contain ``ID``, ``City`` and ``Salary``; the regression
        branch additionally needs ``Gender``, ``Lifestage``, ``DwellingType``
        and ``Target``.  The frame is copied and never modified.
    classification : bool, default True
        * truthy - label-encode the columns at positions 3, 4 and 5 and drop
          ``Salary`` and ``ID`` (``Target`` is kept for the caller to pop).
        * falsy - one-hot encode ``Gender``, ``City``, ``Lifestage`` and
          ``DwellingType`` (first level dropped), drop ``Target`` and ``ID``
          and remove the rows without a ``Salary``.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame.
    """
    df_modelling = df.copy()
    df_modelling['City'] = df_modelling['City'].astype(str)

    if not classification:
        # Readable level names so the dummy column is called "Gender_Male".
        df_modelling['Gender'] = df_modelling['Gender'].astype(str)
        df_modelling['Gender'] = df_modelling['Gender'].replace(['0','1'],["Female", "Male"])
        # here is for Linear Regression
        # dtype=int: pandas >= 2.0 emits bool dummies by default, which makes
        # the frame a bool/numeric mix that statsmodels refuses to cast.
        df_modelling = pd.get_dummies(
            df_modelling,
            columns = ['Gender', 'City', 'Lifestage', 'DwellingType'],
            drop_first=True,
            dtype=int,
        )
        df_modelling = (
            df_modelling
            .drop(columns=['Target', 'ID'])
            .dropna(axis=0, subset=['Salary'])
        )

    else:
        # Same three positions as before (3, 4, 5), but resolved to labels and
        # assigned per column: `iloc[:, i] = values` writes in place on
        # pandas >= 2.0 and would leave the encoded column with object dtype.
        # NOTE(review): positions 3-5 are presumably City, Lifestage and
        # DwellingType - confirm against the CSV column order.
        for column in df_modelling.columns[3:6]:
            df_modelling[column] = LabelEncoder().fit_transform(df_modelling[column])
        df_modelling = df_modelling.drop(columns=['Salary', 'ID'])

    return df_modelling

# print(df_modelling.info())
# df_modelling.head(4)

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# Regression view of the data: one-hot encoded predictors, rows without Salary removed.
df_modelling = pre_modeling(df, classification=False)
print(df_modelling.shape)
print(df_modelling.head(2))

y = df_modelling.pop('Salary')
# statsmodels converts the design matrix with np.asarray(); a frame mixing bool
# dummy columns (pandas >= 2.0 get_dummies default) with numeric ones becomes an
# object array and is rejected, so every predictor is cast to float first.
x = df_modelling.astype(float)

# OLS has no implicit intercept: add the constant column explicitly.
X2 = sm.add_constant(x)
est = sm.OLS(y, X2)
est2 = est.fit()

est2.summary()


In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train_salary, y_test_salary = train_test_split(x, y, test_size=0.2, random_state=1234)

# Real column names instead of synthetic "feature i" labels, so downstream
# plots can be read without a position-to-name lookup.
feature_names = x.columns.tolist()
# Salary is the target of the linear regression above, i.e. a continuous
# quantity: a classifier would treat every distinct salary as its own class,
# so a regressor is used here.
forest = RandomForestRegressor(random_state=1234)
forest.fit(X_train, y_train_salary)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted forest, measured on the held-out split
# with the estimator's default scorer (scoring is left at its default).
result = permutation_importance(forest, X_test, y_test_salary, n_repeats=10, random_state=1234, n_jobs=1)

# Index by the actual column names so every bar is self-explanatory.
forest_importances = pd.Series(result.importances_mean, index=X_test.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The decrease is in units of the estimator's default score, which is not
# necessarily accuracy, so the label stays scorer-agnostic.
ax.set_ylabel("Mean score decrease")
fig.tight_layout()
plt.show()

In [None]:
# Names of the predictor columns at positions 0 and 12.
x.columns.take([0, 12])

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# eli5 permutation importance of the forest on the held-out split,
# rendered with the real column names.
perm = PermutationImportance(forest, random_state=1234)
perm.fit(X_test, y_test_salary)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())

# Classification - Target

In [None]:
# Mixed Naive Bayes for Classification

# Classification view of the data: label-encoded predictors plus Target.
df_modelling = pre_modeling(df, classification=True)

# x and df_modelling are the same object; pop() removes Target from it in
# place and hands the column back as y.
x = df_modelling
y = x.pop('Target')
print(x.head(1))
print(y[:4])
x.head()

In [None]:
# Print the pandas summary report for the classification predictors.
x.info()

In [None]:
# Shuffle the data and split it into train and test sets (80/20, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
# summarize
print('Train', x_train.shape, y_train.shape)
print('Test', x_test.shape, y_test.shape)
# Class balance per split.  Series.value_counts() replaces the top-level
# pd.value_counts(), which is deprecated in recent pandas releases.
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
from mixed_naive_bayes import MixedNB

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn import metrics

def _score_naive_bayes(model, X_test, y_test):
    """Print and return (ROC AUC, F1, PR AUC) of a fitted binary model on the test split."""
    test_pred = model.predict(X_test)
    # Ranking metrics need a continuous score: with hard 0/1 labels the ROC and
    # PR curves collapse to a single operating point.  Column 1 of
    # predict_proba is taken as the positive-class probability.
    test_score = model.predict_proba(X_test)[:, 1]

    model_roc_auc_score = roc_auc_score(y_test, test_score)
    print('roc_auc_score=%.3f' % (model_roc_auc_score))
    nb_precision, nb_recall, _ = precision_recall_curve(y_test, test_score)
    # F1 is a threshold metric and is still computed from the predicted labels.
    nb_f1, nb_auc = f1_score(y_test, test_pred), metrics.auc(nb_recall, nb_precision)
    print('f1=%.3f precision/recall=%.3f' % (nb_f1, nb_auc))
    return model_roc_auc_score, nb_f1, nb_auc


def naive_bayes_model(x, y, imblance_method, categorical_features=(1, 2, 3, 4), random_state=1234):
    """Fit a Mixed Naive Bayes classifier and evaluate it on a 20% hold-out split.

    Parameters
    ----------
    x : pandas.DataFrame
        Predictors; the columns at ``categorical_features`` must be label-encoded.
    y : pandas.Series
        Binary target.
    imblance_method : {"No", "Undersampling", "Oversampling"}
        How the class imbalance of the *training* split is handled:
        not at all, random undersampling of the majority class, or SMOTE-NC
        oversampling of the minority class.
    categorical_features : sequence of int, default (1, 2, 3, 4)
        Positions of the categorical columns, passed to both the classifier
        and the oversampler.
    random_state : int, default 1234
        Seed for the split and for both resamplers.

    Returns
    -------
    tuple of float
        ``(roc_auc, f1, pr_auc)`` measured on the untouched test split.

    Raises
    ------
    ValueError
        If ``imblance_method`` is not one of the supported values.
    """
    valid_methods = ("No", "Undersampling", "Oversampling")
    # Validate first: an unknown method used to fall through every branch and
    # end in an UnboundLocalError at the return statement.
    if imblance_method not in valid_methods:
        raise ValueError(
            "imblance_method must be one of %s, got %r" % (valid_methods, imblance_method)
        )
    categorical_features = list(categorical_features)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=random_state)

    # Only the training split is resampled; the test split keeps the real
    # class distribution.
    if imblance_method == "Undersampling":
        # summarize class distribution
        print("Before undersampling: ", Counter(y_train))
        undersample = RandomUnderSampler(sampling_strategy='majority', random_state=random_state)
        X_train, y_train = undersample.fit_resample(X_train, y_train)
        print("After undersampling: ", Counter(y_train))

    elif imblance_method == "Oversampling":
        # Plain SMOTE interpolates between neighbours, which yields fractional
        # values for label-encoded categorical columns; SMOTE-NC keeps those
        # columns on existing category codes.  It is seeded so that runs are
        # reproducible like the other branches.
        from imblearn.over_sampling import SMOTENC

        print("Before oversampling: ", Counter(y_train))
        oversample = SMOTENC(categorical_features=categorical_features, random_state=random_state)
        X_train, y_train = oversample.fit_resample(X_train, y_train)
        print("After oversampling: ", Counter(y_train))

    nb_mod = MixedNB(categorical_features=categorical_features)
    nb_mod.fit(X_train, y_train)
    return _score_naive_bayes(nb_mod, X_test, y_test)
            

In [None]:
# Evaluate the Mixed Naive Bayes model with an oversampled training split.
naive_bayes_model(x, y, imblance_method="Oversampling")

In [None]:
# Evaluate the Mixed Naive Bayes model with an undersampled training split.
naive_bayes_model(x, y, imblance_method="Undersampling")

In [None]:
# Baseline: evaluate the Mixed Naive Bayes model without any resampling.
naive_bayes_model(x, y, imblance_method="No")

In [None]:
# Permutation importance of a Mixed Naive Bayes model fitted on the plain
# (not resampled) training split and scored on the held-out test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1234
)
nb_mod = MixedNB(categorical_features=[1, 2, 3, 4]).fit(x_train, y_train)
perm = PermutationImportance(nb_mod, random_state=1234)
perm.fit(x_test, y_test)
eli5.show_weights(perm, feature_names=list(x_test.columns))