In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [56]:
df = pd.read_csv("Task_furniture v2.csv", sep=";")
print(df.shape)
print(df.info())
print(df.DwellingType.unique())
print(df.Lifestage.unique)

# checking where the missing values are
df.isna().any()

(4000, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            4000 non-null   int64  
 1   Age           4000 non-null   int64  
 2   Gender        4000 non-null   int64  
 3   City          4000 non-null   int64  
 4   Lifestage     4000 non-null   object 
 5   DwellingType  4000 non-null   object 
 6   Salary        3996 non-null   float64
 7   Target        4000 non-null   int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 250.1+ KB
None
['House' 'Apartment own' 'Apartment rent']
<bound method Series.unique of 0                                     Single
1       Family with young children (0-6 yr.)
2       Family with young children (0-6 yr.)
3                                     Single
4                                     Single
                        ...                 
3995                                  Sin

ID              False
Age             False
Gender          False
City            False
Lifestage       False
DwellingType    False
Salary           True
Target          False
dtype: bool

In [57]:
# outliers - detecting by z-score
from scipy import stats

print(df[(np.abs(stats.zscore(df["Age"])) > 3)])
df[(np.abs(stats.zscore(df["Salary"])) > 3)]

# dropping outliers
df = df[(np.abs(stats.zscore(df["Age"])) < 3)]

        ID  Age  Gender  City                             Lifestage  \
331    331  199       0     2  Family with young children (0-6 yr.)   
558    558  178       0     2                                Single   
1003  1003  110       0     2                                Single   
1007  1007   99       0     1                                Single   

        DwellingType   Salary  Target  
331   Apartment rent  31333.0       0  
558            House  31118.0       0  
1003           House  28504.0       0  
1007           House  33808.0       0  


In [None]:
# pairwise variables visualization
# Create the default pairplot
sns.pairplot(df.drop("ID", axis=1))

In [None]:
# Create a pair plot colored by continent with a density plot of the # diagonal and format the scatter plots.
sns.pairplot(df.drop(columns = ['ID']), hue = 'City', diag_kind = 'kde',
             plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
             size = 4)

In [None]:
# missing values
df.isnull().sum()

In [58]:
# encoding the categorical variables

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

def pre_modeling(df, classification = True):
    df_modelling = df.copy()
    df_modelling['City'] = df_modelling['City'].astype(str)
    
    if classification == False:
        df_modelling['Gender'] = df_modelling['Gender'].astype(str)
        df_modelling['Gender'] = df_modelling['Gender'].replace(['0','1'],["Female", "Male"])
        # here is for Linear Regression
        df_modelling = pd.get_dummies(df_modelling, columns = ['Gender', 'City', 'Lifestage', 'DwellingType'], drop_first=True)
        df_modelling.drop(columns=['Target', 'ID'], inplace=True)
        df_modelling.dropna(axis=0, subset=['Salary'], inplace=True)
                
    else:
        label_encoder = LabelEncoder()
        df_modelling.iloc[:, 3] = label_encoder.fit_transform(df_modelling.iloc[:, 3])
        df_modelling.iloc[:, 4] = label_encoder.fit_transform(df_modelling.iloc[:, 4])
        df_modelling.iloc[:, 5] = label_encoder.fit_transform(df_modelling.iloc[:, 5])
        #df_modelling = pd.get_dummies(df_modelling, columns = ['Gender', 'City', 'Lifestage', 'DwellingType'], drop_first=False)
        df_modelling.drop(columns=['Salary', 'ID'], inplace=True)
    
    return df_modelling

# print(df_modelling.info())
# df_modelling.head(4)

In [59]:
# Linear Regression
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

df_modelling = pre_modeling(df, classification=False)    
print(df_modelling.shape)
print(df_modelling.head(2))

y = df_modelling.pop('Salary')
x = df_modelling

X2 = sm.add_constant(x)
est = sm.OLS(y, X2)
est2 = est.fit()

# coeff_parameter = pd.DataFrame(model.coef_, x.columns, columns=['Coefficient'])
# print(coeff_parameter)
est2.summary()


(3992, 14)
   Age   Salary  Gender_Male  City_2  City_3  City_4  City_5  \
0   62  40562.0            0       0       0       0       0   
1   19  27359.0            1       0       0       1       0   

   Lifestage_Established family (7-17 yr.)  \
0                                        0   
1                                        0   

   Lifestage_Family with young children (0-6 yr.)  \
0                                               0   
1                                               1   

   Lifestage_Family with youth at home  Lifestage_Middle aged couples  \
0                                    0                              0   
1                                    0                              0   

   Lifestage_Single  DwellingType_Apartment rent  DwellingType_House  
0                 1                            0                   1  
1                 0                            0                   1  


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Salary,R-squared:,0.896
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,2649.0
Date:,"Sun, 12 Sep 2021",Prob (F-statistic):,0.0
Time:,13:36:19,Log-Likelihood:,-34300.0
No. Observations:,3992,AIC:,68630.0
Df Residuals:,3978,BIC:,68720.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.938e+04,111.309,263.917,0.000,2.92e+04,2.96e+04
Age,159.2504,1.863,85.481,0.000,155.598,162.903
Gender_Male,1406.0784,42.854,32.811,0.000,1322.061,1490.096
City_2,-5354.1926,53.024,-100.978,0.000,-5458.148,-5250.237
City_3,-2504.8435,83.723,-29.918,0.000,-2668.988,-2340.699
City_4,-7464.3489,58.687,-127.190,0.000,-7579.408,-7349.290
City_5,-7324.1275,70.928,-103.262,0.000,-7463.186,-7185.069
Lifestage_Established family (7-17 yr.),-190.7368,98.883,-1.929,0.054,-384.602,3.129
Lifestage_Family with young children (0-6 yr.),-138.2492,90.445,-1.529,0.126,-315.572,39.074

0,1,2,3
Omnibus:,8.653,Durbin-Watson:,2.052
Prob(Omnibus):,0.013,Jarque-Bera (JB):,8.772
Skew:,-0.095,Prob(JB):,0.0124
Kurtosis:,3.128,Cond. No.,424.0


In [60]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train_salary, y_test_salary = train_test_split(x, y, test_size=0.2, random_state=1234)

feature_names = [f'feature {i}' for i in range(x.shape[1])]
forest = RandomForestClassifier(random_state=1234)
forest.fit(X_train, y_train_salary)

RandomForestClassifier(random_state=0)

In [68]:
from sklearn.inspection import permutation_importance

result = permutation_importance(forest, X_test, y_test_salary, n_repeats=10, random_state=1234, n_jobs=1)

forest_importances = pd.Series(result.importances_mean, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

KeyboardInterrupt: 

In [64]:
x.columns[[0,12]]

Index(['Age', 'DwellingType_House'], dtype='object')

In [70]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(forest, random_state=1234).fit(X_test, y_test_salary)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Weight,Feature
0  ± 0.0000,DwellingType_House
0  ± 0.0000,Lifestage_Single
0  ± 0.0000,Lifestage_Middle aged couples
0  ± 0.0000,Lifestage_Family with youth at home
0  ± 0.0000,Lifestage_Family with young children (0-6 yr.)
0  ± 0.0000,Lifestage_Established family (7-17 yr.)
0  ± 0.0000,City_5
0  ± 0.0000,City_4
0  ± 0.0000,City_3
0  ± 0.0000,City_2


# Classification - Target

In [71]:
# Mixed Naive Bayes for Classification

df_modelling = pre_modeling(df, classification=True)  

y = df_modelling.pop('Target')
x = df_modelling
print(df_modelling.head(1))
print(y[0:4])
x.head(5)

   Age  Gender  City  Lifestage  DwellingType
0   62       0     0          5             2
0    1
1    0
2    0
3    0
Name: Target, dtype: int64


Unnamed: 0,Age,Gender,City,Lifestage,DwellingType
0,62,0,0,5,2
1,19,1,3,2,2
2,30,1,2,2,0
3,35,1,3,5,2
4,53,1,3,5,0


In [None]:
x.info()

In [72]:
# Use a utility from sklearn to split and shuffle your dataset.
# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
# summarize
print('Train', x_train.shape, y_train.shape)
print('Test', x_test.shape, y_test.shape)
print(pd.value_counts(y_train))
print(pd.value_counts(y_test))

Train (3196, 5) (3196,)
Test (800, 5) (800,)
0    2779
1     417
Name: Target, dtype: int64
0    705
1     95
Name: Target, dtype: int64


In [None]:
from mixed_naive_bayes import MixedNB

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn import metrics

def naive_bayes_model(x, y, imblance_method):

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
    nb_mod = MixedNB(categorical_features=[1,2,3,4])

    if imblance_method == "No":
        test_pred = nb_mod.fit(X_train, y_train).predict(X_test)
        model_roc_auc_score = roc_auc_score(y_test, test_pred)
        print('roc_auc_score=%.3f' % (model_roc_auc_score))
        nb_precision, nb_recall, _ = precision_recall_curve(y_test, test_pred)
        nb_f1, nb_auc = f1_score(y_test, test_pred), metrics.auc(nb_recall, nb_precision)
        print('f1=%.3f precision/recall=%.3f' % (nb_f1, nb_auc))
        
    elif imblance_method == "Undersampling":
        # summarize class distribution
        print("Before undersampling: ", Counter(y_train))
        # define undersampling strategy
        undersample = RandomUnderSampler(sampling_strategy='majority', random_state = 1234)
        # fit and apply the transform
        X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
        # summarize class distribution
        print("After undersampling: ", Counter(y_train_under))
        test_pred = nb_mod.fit(X_train_under, y_train_under).predict(X_test)
        model_roc_auc_score = roc_auc_score(y_test, test_pred)
        print('roc_auc_score=%.3f' % (model_roc_auc_score))
        nb_precision, nb_recall, _ = precision_recall_curve(y_test, test_pred)
        nb_f1, nb_auc = f1_score(y_test, test_pred), metrics.auc(nb_recall, nb_precision)
        print('f1=%.3f precision/recall=%.3f' % (nb_f1, nb_auc))
     
        
    elif imblance_method == "Oversampling":
        print("Before undersampling: ", Counter(y_train))
        # define oversampling strategy
        SMOTE_mod = SMOTE()
        # fit and apply the transform
        X_train_SMOTE, y_train_SMOTE = SMOTE_mod.fit_resample(X_train, y_train)
        # summarize class distribution
        print("After oversampling: ", Counter(y_train_SMOTE))
        nb_mod = MixedNB(categorical_features=[1,2,3,4])
        test_pred = nb_mod.fit(X_train_SMOTE, y_train_SMOTE).predict(X_test)
        model_roc_auc_score = roc_auc_score(y_test, test_pred)
        print('roc_auc_score=%.3f' % (model_roc_auc_score))
        nb_precision, nb_recall, _ = precision_recall_curve(y_test, test_pred)
        nb_f1, nb_auc = f1_score(y_test, test_pred), metrics.auc(nb_recall, nb_precision)
        print('f1=%.3f precision/recall=%.3f' % (nb_f1, nb_auc))

    return model_roc_auc_score, nb_f1, nb_auc          
            

In [None]:
naive_bayes_model(x, y, imblance_method="Oversampling")

In [None]:
naive_bayes_model(x, y, imblance_method="Undersampling")

In [None]:
naive_bayes_model(x, y, imblance_method="No")

In [74]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
nb_mod = MixedNB(categorical_features=[1,2,3,4])
nb_mod.fit(x_train, y_train)
perm = PermutationImportance(nb_mod, random_state=1234).fit(x_test, y_test)
eli5.show_weights(perm, feature_names = x_test.columns.tolist())

[2 5 6 3]


Weight,Feature
0.1305  ± 0.0138,City
0.0845  ± 0.0154,Age
0.0238  ± 0.0118,Lifestage
0.0070  ± 0.0058,DwellingType
0.0018  ± 0.0020,Gender
