# Handling Missing Data


In [None]:
# Imports needed by this cell and by the NaN handling in the cells below.
import numpy as np
import pandas as pd

# pandas reads CSV files with `read_csv`; there is no `pd.load_csv`.
df = pd.read_csv("diabetes.csv")

# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

## Mark zero values as NaN

In [None]:
# Zeros in these columns are converted to NaN so that pandas counts them as missing.
# NOTE(review): "tricepts" may be a misspelling of the real column name
# (e.g. "triceps") — confirm against df.columns.
cols_with_zero_as_missing = ["insulin", "tricepts", "bmi"]

# Assign the result back by column label. Calling `replace(..., inplace=True)`
# on `df.<column>` operates on an intermediate Series and does not update `df`
# when pandas copy-on-write is active.
df[cols_with_zero_as_missing] = df[cols_with_zero_as_missing].replace(0, np.nan)

# The non-null counts now show how many values are missing per column.
df.info()

## Dropping Missing Data

In [None]:
# Drop every row that contains at least one NaN.
df = df.dropna()

# `shape` is a tuple attribute, not a method; `df.shape()` raises TypeError.
df.shape

> We lost half of the data, which is not acceptable,
> so we need to look for another approach.

## Imputing missing data

* Making an educated guess about the missing values
* Example: Using the mean of the non-missing entries

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# SimpleImputer needs the actual missing-value marker (np.nan), not the string
# 'NaN'; the string form is rejected for numeric input.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")

# NOTE(review): X (the feature matrix) is not defined in the cells shown here —
# confirm it is created before this cell runs.
# Learn the column means and fill the missing entries in one step.
X = imp.fit_transform(X)

## Imputing within a pipeline

> In a Pipeline, every step except the last must be a transformer.
> The last step in a Pipeline must be an estimator.

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# The missing-value marker must be np.nan; the string "NaN" is not accepted by
# SimpleImputer for numeric data.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
logreg = LogisticRegression()

# Transformer first, estimator last.
steps = [("imputation", imp),
         ("logistic_regression", logreg)]
pipeline = Pipeline(steps)

# NOTE(review): X and y are not defined in the cells shown here — confirm they
# are created before this cell runs.
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Imputation statistics are learned from the training rows only and then
# applied to the test rows inside predict/score.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)
# 0.75324675324675328  (value noted by the author, presumably the score from an earlier run)

## Another Code Example to preprocess data

In [None]:
# Treat every "?" placeholder as a missing value
df[df.eq("?")] = np.nan

# Per-column count of missing entries
print(df.isna().sum())

# Size of the frame before any rows are removed
print(f"Shape of Original DataFrame: {df.shape}")

# Keep only the rows that have no missing value in any column
df = df.dropna(how="any")

# Size of the frame after the incomplete rows are gone
print(f"Shape of DataFrame After Dropping All Rows with Missing Values: {df.shape}")

Imputation plus an SVM classifier (this example dates from the older scikit-learn `Imputer` API)

In [None]:
# Import the imputer and the classifier.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22, so importing
# it fails on current versions; `sklearn.impute.SimpleImputer` is its replacement.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

# Setup the Imputation transformer: imp
# Legacy equivalent: Imputer(missing_values="NaN", strategy="most_frequent", axis=0).
# SimpleImputer always imputes column-wise (the old axis=0) and takes np.nan
# rather than the string "NaN" as the missing-value marker.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# Instantiate the SVC classifier: clf
clf = SVC()

# Setup the pipeline with the required steps: steps
steps = [('imputation', imp),
         ('SVM', clf)]

Complete pipeline: imputation + SVM, fitted and evaluated on a train/test split

In [None]:
# Import necessary modules.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` replaces it. `train_test_split` and
# `classification_report` are imported here because this cell calls them.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Setup the pipeline steps: steps
# Legacy equivalent of the first step:
#   Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
# SimpleImputer always works column-wise and takes np.nan as the marker.
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
         ('SVM', SVC())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps=steps)

# Create training and test sets
# NOTE(review): X and y are not defined in the cells shown here — confirm they
# are created before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test, y_pred))