# Model 2.1 and 2.2: LDA & QDA

## Import the libraries and dataset

In [1]:
### Importing the libraries
import numpy as np
import pandas as pd
#from sklearnex import patch_sklearn

#patch_sklearn()

#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn. preprocessing import StandardScaler


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns",None)

In [2]:
# Read the train and test splits from the local Data/ folder, then preview the train set.
train_path, test_path = "Data/train.csv", "Data/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df.head()

Unnamed: 0,id,occ_code_level2,age,stock_dividends,mig_chg_msa,tax_filer_stat,det_hh_summ,mig_prev_sunbelt,hisp_origin,education,wage_per_hour,capital_losses,vet_question,own_or_self,country_self,mig_move_reg,high_income,hs_college,class_worker,mig_same,unemp_reason,state_prev_res,ind_code_level2,race,country_mother,capital_gains,sex,ind_code_level1,citizenship,union_member,fam_under_18,marital_stat,region_prev_res,mig_chg_reg,country_father,occ_code_level1,full_or_part_emp,weeks_worked,det_hh_fam_stat,num_emp,vet_benefits
0,1,0,42.0,0.0,,Nonfiler,Householder,,All other,11th grade,0.0,,Not in universe,0,United-States,,0,Not in universe,Not in universe,Not in universe under 1 year old,Not in universe,Not in universe,0,Black,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Not in universe,,United-States,Not in universe,Not in labor force,0.0,Householder,0,2
1,2,18,56.0,,,,Householder,,All other,High school graduate,0.0,,Not in universe,2,United-States,,1,Not in universe,Self-employed-incorporated,Not in universe under 1 year old,,Not in universe,32,,United-States,,Male,Wholesale trade,Native- Born in the United States,Not in universe,Not in universe,Married-civilian spouse present,Not in universe,,United-States,Sales,Full-time schedules,,Householder,1,2
2,3,26,26.0,,,Joint both under 65,Householder,,All other,High school graduate,0.0,,Not in universe,0,Haiti,,0,Not in universe,Private,Not in universe under 1 year old,,Not in universe,41,,Haiti,,,Hospital services,Foreign born- Not a citizen of U S,Not in universe,Not in universe,,Not in universe,,Haiti,Adm support including clerical,Full-time schedules,,Householder,3,2
3,4,0,67.0,,MSA to MSA,Joint one under 65 & one 65+,Householder,No,All other,,0.0,,No,0,United-States,Same county,0,Not in universe,Not in universe,No,,North Carolina,0,Black,United-States,0.0,,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Northeast,Same county,United-States,Not in universe,Children or Armed Forces,0.0,Householder,0,1
4,5,0,,,Nonmover,Nonfiler,Child under 18 never married,Not in universe,All other,Children,0.0,0.0,Not in universe,0,United-States,Nonmover,0,Not in universe,Not in universe,Yes,Not in universe,Not in universe,0,White,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Mother only present,,Not in universe,Nonmover,United-States,Not in universe,Children or Armed Forces,,Child <18 never marr not in subfamily,0,0


In [3]:
# Remove the "id" column from train_df (test_df is left untouched here).
# Rebinding the result instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: running it a second time no longer raises a KeyError
# because "id" has already been dropped.
train_df = train_df.drop(columns="id", errors="ignore")

In [4]:
# Cast every object-dtype column to pandas' "category" dtype, in both the train
# and the test frame. The column list is computed once per frame, before any
# column is reassigned.
for frame in (train_df, test_df):
    for column in frame.select_dtypes(include="object").columns:
        frame[column] = frame[column].astype("category")

In [5]:
# Separate the target ("high_income") from the predictors.
y = train_df["high_income"]
X = train_df.drop(columns=["high_income"])

In [6]:
# Split the predictor names into two lists by dtype:
# "category" columns on one side, every other column on the other.
categorical_features = X.select_dtypes(include="category").columns.tolist()
numerical_features = X.select_dtypes(exclude="category").columns.tolist()

## LDA: GridSearch

In [20]:

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and 1.4 removed the old spelling. Use whichever name the installed version
# exposes so this cell runs on both old and current releases.
dense_output_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer_num", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: missing values become their own "Missing" level, then the
# column is one-hot encoded into a dense array. Levels not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", **{dense_output_kwarg: False})),
    ]
)

# Route each group of columns to its own preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append the classifier to the preprocessing pipeline.
# This gives a full prediction pipeline: preprocessing followed by LDA.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearDiscriminantAnalysis())]
)

# Render estimators as an HTML diagram and display the pipeline.
set_config(display="diagram")
clf

In [23]:
# Shrinkage candidates 0.0, 0.1, ..., 0.9 for the LDA step of the pipeline.
# They are built from integers and rounded so the grid holds exactly 0.3, 0.6, 0.7
# rather than the 0.30000000000000004-style values np.arange(0, 1, 0.1) produces,
# which otherwise leak into cv_results_ and best_params_.
# Only the "lsqr" solver is searched (shrinkage is not supported by the default "svd" solver).
hyper_param = {
    "classifier__shrinkage": [round(0.1 * step, 1) for step in range(10)],
    "classifier__solver": ["lsqr"],
}

# Accuracy-scored grid search over the pipeline, using a shuffled 5-fold split
# with a fixed seed so the folds are reproducible.
lda_pipe_cv = GridSearchCV(
    estimator=clf,
    param_grid=hyper_param,
    scoring="accuracy",
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
    n_jobs=-1,
    verbose=1,
)

In [24]:
# Run the grid search: each shrinkage candidate is cross-validated on (X, y).
lda_pipe_cv.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [25]:
# Tabulate the grid-search results: one row per candidate, with fit/score times,
# per-split test scores, their mean/std and the resulting rank.
pd.DataFrame(lda_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__shrinkage,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.626899,0.023521,0.264019,0.004786,0.0,lsqr,"{'classifier__shrinkage': 0.0, 'classifier__so...",0.852683,0.854239,0.844245,0.832891,0.849007,0.846613,0.007676,4
1,5.725652,0.141292,0.290516,0.009808,0.1,lsqr,"{'classifier__shrinkage': 0.1, 'classifier__so...",0.84957,0.852316,0.843787,0.850197,0.851845,0.849543,0.003051,1
2,6.224072,0.325152,0.345671,0.047079,0.2,lsqr,"{'classifier__shrinkage': 0.2, 'classifier__so...",0.850577,0.851035,0.842963,0.848823,0.848274,0.848334,0.002878,2
3,6.581901,0.080728,0.31146,0.010498,0.3,lsqr,"{'classifier__shrinkage': 0.30000000000000004,...",0.84902,0.849203,0.840857,0.847633,0.846534,0.84665,0.003055,3
4,6.879059,0.18449,0.311128,0.004811,0.4,lsqr,"{'classifier__shrinkage': 0.4, 'classifier__so...",0.846914,0.846365,0.839117,0.845802,0.844703,0.84458,0.002828,5
5,6.596735,0.040164,0.315682,0.009216,0.5,lsqr,"{'classifier__shrinkage': 0.5, 'classifier__so...",0.843069,0.844534,0.837378,0.844611,0.843787,0.842676,0.002708,6
6,6.794045,0.170737,0.322603,0.013351,0.6,lsqr,"{'classifier__shrinkage': 0.6000000000000001, ...",0.838949,0.84197,0.836553,0.842597,0.840765,0.840167,0.002193,7
7,6.883154,0.102625,0.400939,0.043781,0.7,lsqr,"{'classifier__shrinkage': 0.7000000000000001, ...",0.836294,0.836294,0.832524,0.8393,0.835821,0.836047,0.002152,8
8,8.191922,0.407987,0.332428,0.018057,0.8,lsqr,"{'classifier__shrinkage': 0.8, 'classifier__so...",0.828053,0.827962,0.828221,0.830968,0.830235,0.829088,0.00126,9
9,6.133875,1.518156,0.322799,0.050953,0.9,lsqr,"{'classifier__shrinkage': 0.9, 'classifier__so...",0.815602,0.811207,0.814211,0.816409,0.814028,0.814291,0.001777,10


In [None]:
### The cross-validation results are poor compared to the other models!

## QDA: First insights

In [10]:
### Which imputation method works best? Try several and evaluate each with cross-validation.
# qda_res maps (numeric strategy, categorical strategy) -> mean 5-fold accuracy.
qda_res = {}

# One splitter shared by every combination: with a fixed integer seed it yields
# the same folds on each call, so the scores are directly comparable.
qda_cv = KFold(n_splits=5, shuffle=True, random_state=1)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and 1.4 removed the old spelling. Use whichever name the installed version exposes.
dense_output_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

for num_strategy in ("mean", "median"):
    # Numeric branch depends only on the outer loop variable.
    numeric_transformer = Pipeline(
        steps=[
            ("imputer_num", SimpleImputer(strategy=num_strategy)),
            ("scaler", StandardScaler()),
        ]
    )

    for cat_strategy in ("most_frequent", "constant"):
        # Both categorical strategies go through the exact same pipeline; for
        # "constant" no fill_value is given, so SimpleImputer uses its default.
        categorical_transformer = Pipeline(
            steps=[
                ("imputer_cat", SimpleImputer(strategy=cat_strategy)),
                ("encoder", OneHotEncoder(handle_unknown="ignore", **{dense_output_kwarg: False})),
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numerical_features),
                ("cat", categorical_transformer, categorical_features),
            ]
        )

        # Append the classifier to the preprocessing pipeline.
        # This gives a full prediction pipeline: preprocessing followed by QDA.
        clf = Pipeline(
            steps=[("preprocessor", preprocessor), ("classifier", QuadraticDiscriminantAnalysis())]
        )

        fold_scores = cross_val_score(
            clf, X, y, cv=qda_cv, scoring="accuracy", n_jobs=-1, verbose=2
        )
        qda_res[(num_strategy, cat_strategy)] = fold_scores.mean()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.3s finished


In [11]:
# Mean cross-validated accuracy, keyed by
# (numeric imputation strategy, categorical imputation strategy).
qda_res

{('mean', 'most_frequent'): 0.562582854161903,
 ('mean', 'constant'): 0.5651105127650041,
 ('median', 'most_frequent'): 0.5599279428203897,
 ('median', 'constant'): 0.572783346683807}

In [1]:
# Because these results are so poor (mean accuracy of roughly 0.56-0.57), we do not consider QDA any further and do not run a grid search for it. The decision boundary appears to be closer to linear than to quadratic.