## Importing the needed libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  f1_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier



%matplotlib inline


## Importing data

In [None]:
path_train_X='./training_set_features.csv'
path_train_Y='./training_set_labels.csv'
path_test_X='./test_set_features.csv'

In [None]:
train_set = pd.read_csv(path_train_X)
print(train_set.head())

In [None]:
train_set_target = pd.read_csv(path_train_Y)
print(train_set_target.head())
print(train_set_target.shape)

In [None]:
test_set_X = pd.read_csv(path_test_X)
test_set_X.head()

In [None]:
train_set.info()

In [None]:
test_set_X.info()

## Dealing with null values:

In [None]:
train_set.isnull().sum()

### Deleting columns with too mant missing values:

In [None]:
sums = train_set.isnull().sum()
columns = list(sums[sums > 0.4 * train_set.shape[0]].keys())
columns

In [None]:
df = train_set.drop(columns=columns)
df = df.drop(columns=['respondent_id',])
df.isnull().sum()[:5]

In [None]:
df.head()

## Trying to predict with rows with that doesn't have any mising values:  
Here I tried to  train a model on 2 different datasets: one achieved through imputaionand the other through deleting missing values if at least one exist in a given row.

In [None]:
train_set2 = pd.concat((df,train_set_target),axis=1).dropna(axis=0)
train_set2.shape, '<----',df.shape, 'null values :' , train_set2.isnull().sum().sum()

In [None]:
train_set2.head()

In [None]:
df2 = train_set2.drop(train_set_target.columns,axis=1)
df2_target = train_set2[train_set_target.columns]

### Imputing the others with the most frequent value:

In [None]:
impt = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
df = pd.DataFrame(impt.fit_transform(df),columns=df.columns)
df.isnull().sum().sum()

In [None]:
df.head()

## using LabelEncoder to label the columns:

In [None]:
le = LabelEncoder()
classes= {}
for col in df.columns:
    df[col] = le.fit_transform(df[col])
    df2[col] = le.fit_transform(df2[col])
    classes[col] = le.classes_
classes    

## Extracting non-binary features:

In [None]:
columns_categorical=[]
for col in df.columns[1:]:
    if len(df[col].unique()) > 2:
        columns_categorical.append(col)
columns_categorical

## OHE for the non-binary columns

In [None]:
X = pd.get_dummies(df,columns=columns_categorical)
X2 = pd.get_dummies(df2,columns=columns_categorical)
X.shape,X2.shape

## Feature Extraction using PCA:

In [None]:
pca = PCA()
pca.fit(X)
num = (pca.explained_variance_ratio_*100).astype(int).sum()
pca = PCA(n_components=num)
X_reduced = pca.fit_transform(X)
X_reduced[:5,:5]

In [None]:
pca = PCA()
pca.fit(X2)
num = (pca.explained_variance_ratio_*100).astype(int).sum()
pca = PCA(n_components=num)
X_reduced2 = pca.fit_transform(X2)
X_reduced2[:5,:5]

## First dealing with seasonal vaccine

In [None]:
train_set_target.head()

In [None]:
Y_seasonal = train_set_target['seasonal_vaccine'].to_numpy().ravel()
Y_seasonal2 = df2_target['seasonal_vaccine'].to_numpy().ravel()

## Applying Train/Valid/Test split

In [None]:
X_train,X_valid,Y_train,Y_valid = train_test_split(X,Y_seasonal,test_size=.1,random_state=102,stratify=Y_seasonal)
X_train2,X_valid2,Y_train2,Y_valid2 = train_test_split(X2,Y_seasonal2,test_size=.1,random_state=102,stratify=Y_seasonal2)

## Training a basic model on the both datasets: 
This a model with minimum parameters.The classification metric used in the f1 score which is a more accurate metric, other than the regular "accuracy" metric.

In [136]:
logreg = LogisticRegression(max_iter=500,random_state=101)
logreg.fit(X_train2,Y_train2)
print('train: {:.2f}% , '.format(f1_score(logreg.predict(X_train2),Y_train2)*100) , \
'valid: {:.2f}%'.format(f1_score(logreg.predict(X_valid2),Y_valid2)*100))

train: 77.62% ,  valid: 76.55%


In [137]:
logreg = LogisticRegression(max_iter=500,random_state=101)
logreg.fit(X_train,Y_train)
print('train: {:.2f}% , '.format(f1_score(logreg.predict(X_train),Y_train)*100) , \
'valid: {:.2f}%'.format(f1_score(logreg.predict(X_valid),Y_valid)*100))

train: 75.80% ,  valid: 76.77%


### Note:
We can clearly see that there isn't much of a difference concerning the validation accuracy. That's why it's better to stick with real data than the ones manipulated through imputation

## Applying a logistic regression model:

In [None]:
logreg = LogisticRegression(max_iter=1001,random_state=101,solver='liblinear',penalty='l1')
logreg.fit(X_train,Y_train)
logreg.score(X_train,Y_train)*100, '%'

In [None]:
print(f1_score(logreg.predict(X_valid),Y_valid)*100,'%')


In [None]:
pd.DataFrame(classification_report(logreg.predict(X_valid),Y_valid,output_dict = True)).T*100

## Only keeping the important columns:

In [None]:
mask = abs(logreg.coef_ )  > 1e-5
mask = mask.ravel()
mask.sum()

In [None]:
X_reduced = X_reduced[:,mask]
X_train,X_valid,Y_train,Y_valid = train_test_split(X_reduced,Y_seasonal,test_size=.1,random_state=102,stratify=Y_seasonal)
X_train.shape[1]

In [None]:
logreg = LogisticRegression(max_iter=1001,random_state=101,solver='liblinear',penalty='l1')
logreg.fit(X_train,Y_train)
'{:.2f}'.format(logreg.score(X_train,Y_train)*100) , '{:.2f}'.format(f1_score(logreg.predict(X_valid),Y_valid)*100),'%'