# **Machine Learning with Scikit-Learn**

# Data Processing with Scikit-Learn

In [None]:
import pandas as pd
import seaborn as sns
# Load the Titanic training data; the relative path expects titanic_train.csv
# in the current working directory.
df_titan = pd.read_csv('titanic_train.csv')
# Preview the first rows.
df_titan.head()

## Imputation

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer that replaces missing values with the mean of the column.
imp = SimpleImputer(strategy='mean')

In [None]:
# IterativeImputer is experimental in scikit-learn: the enable_iterative_imputer
# import must run first, otherwise importing IterativeImputer fails.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Uncomment to use a model-based imputer instead of the mean imputer:
#imp = IterativeImputer(n_nearest_features=2)

In [None]:
from sklearn.impute import KNNImputer
# Uncomment to fill each gap from the 2 nearest rows (equally weighted) instead:
#imp = KNNImputer(n_neighbors=2, weights="uniform")

In [None]:
# Learn the fill value from Age. The double brackets select a one-column
# DataFrame (2-D), which is the input shape scikit-learn estimators expect.
imp.fit(df_titan[['Age']])

In [None]:
# Overwrite Age with the imputed column (gaps replaced by the fitted fill value).
df_titan['Age'] = imp.transform(df_titan[['Age']])

In [None]:
# One-step alternative to the separate fit and transform cells:
#df_titan['Age'] = imp.fit_transform(df_titan[['Age']])

In [None]:
# Display the frame to inspect the Age column after imputation.
df_titan

In [None]:
# Count the ages that are still missing.
df_titan['Age'].isna().sum()

In [None]:
# Imputer that fills gaps with each column's most frequent value; unlike
# 'mean', this strategy also works on non-numeric columns.
imp = SimpleImputer(strategy='most_frequent')

In [None]:
# fit_transform returns a bare ndarray, so hand the original column labels and
# index back to the new frame; without them the columns become 0, 1, 2, ...
# and can no longer be selected by name.
# Column dtypes are not preserved: the values come back as one array.
# NOTE: assumes no column is entirely missing -- the imputer would drop such a
# column and the labels would no longer line up.
df_titan = pd.DataFrame(imp.fit_transform(df_titan),
                        columns=df_titan.columns,
                        index=df_titan.index)

In [None]:
# Preview the frame after imputing every column.
df_titan.head()

## Scaling

### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler that standardizes each feature to zero mean and unit variance.
scaler = StandardScaler()

In [None]:
# Reload the raw data so the scaling examples start from the original values
# rather than the imputed frame built above.
df_titan = pd.read_csv('titanic_train.csv')

In [None]:
# Standardize Age in place. Selecting by the name 'Age' requires df_titan to be
# a DataFrame that still carries its column labels.
# Any missing ages are ignored when fitting and stay NaN in the result.
df_titan['Age'] = scaler.fit_transform(df_titan[['Age']])
# Positional alternative for a frame whose columns are plain integers:
#df_titan[5] = scaler.fit_transform(df_titan[[5]])

In [None]:
# Preview the frame with the standardized Age column.
df_titan.head()

In [None]:
# Plot the distribution of the standardized ages.
sns.displot(df_titan['Age'])
# Positional alternative for a frame whose columns are plain integers:
#sns.displot(df_titan[5])

### MinMaxScaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler that linearly maps each feature onto a fixed range ([0, 1] by default).
# This rebinds `scaler`, replacing the StandardScaler used earlier.
scaler = MinMaxScaler()

In [None]:
# Rescale Age in place. Age was already standardized above; that does not change
# the outcome, because min-max scaling is invariant to a prior shift and
# positive rescale of the data.
df_titan['Age'] = scaler.fit_transform(df_titan[['Age']])
df_titan.head()

In [None]:
# Plot the distribution of the min-max scaled ages.
sns.displot(df_titan['Age'])

## Ordinal and OneHot Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Encoder that maps each category of a feature to a single numeric code.
enc = OrdinalEncoder()

In [None]:
# Replace the Sex labels with their ordinal codes; codes follow the sorted
# order of the categories found in the column.
df_titan['Sex'] = enc.fit_transform(df_titan[['Sex']])
df_titan.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Encoder that expands a feature into one 0/1 indicator column per category.
# This rebinds `enc`, replacing the OrdinalEncoder used earlier.
enc = OneHotEncoder()

In [None]:
# Show the raw encoder output: a sparse matrix by default, with one column per
# category. Sex was ordinal-encoded above, so the categories here are those
# numeric codes rather than the original labels.
enc.fit_transform(df_titan[['Sex']])

In [None]:
# One-hot encoding yields one column per category (as a sparse matrix by
# default), so it cannot be assigned back into the single 'Sex' column.
# Densify it, name each indicator column after its category, and swap those
# columns in for 'Sex'.
sex_matrix = enc.fit_transform(df_titan[['Sex']])
sex_onehot = pd.DataFrame(
    sex_matrix.toarray(),
    # categories_[0] holds the categories of the first (only) input feature,
    # in the same order as the output columns.
    columns=['Sex_{}'.format(category) for category in enc.categories_[0]],
    index=df_titan.index,  # keep rows aligned with df_titan for the concat
)
df_titan = pd.concat([df_titan.drop(columns='Sex'), sex_onehot], axis=1)
df_titan.head()

# Supervised Learning with Basic Decision Tree

## Step 1: Creating Training/Testing Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the iris example dataset through seaborn. load_dataset fetches from
# seaborn's online example-data repository, so this needs network access
# unless the file is already cached locally.
iris = sns.load_dataset('iris')

In [None]:
# Separate the four measurement columns (features) from the species label (target).
feature_cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
X = iris.loc[:, feature_cols]  # feature matrix
y = iris['species']  # target labels

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing and train on the remaining 70%;
# random_state pins the shuffle so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the held-out feature rows.
X_test

## Step 2: Instantiating the classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier

In [None]:
# Create a Decision Tree classifier object with all default arguments.
# No random_state is set, so the fitted tree may vary slightly between runs.
clf = DecisionTreeClassifier()

## Step 3: Training the classifier

In [None]:
# Train the Decision Tree classifier on the training split.
# fit() returns the estimator itself, so rebinding clf keeps the same object.
clf = clf.fit(X_train, y_train)

## Step 4: Making the prediction

In [None]:
# Predict the species for every row of the held-out test set.
y_pred = clf.predict(X_test)

## Step 5: Evaluating the performance

In [None]:
from sklearn import metrics  # scikit-learn's evaluation metrics
# Accuracy: the share of test rows whose predicted species matches the true one.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

## Step 6: Visualizing the tree

In [None]:
from sklearn import tree
# Draw the fitted tree with matplotlib.
tree.plot_tree(clf)  # plot_tree is available from scikit-learn 0.21

In [None]:
# Save the tree diagram to a file.
import matplotlib.pyplot as plt
plt.figure()
# Label the split nodes with feature_cols, the same list X was built from, so
# the names in the plot cannot drift out of sync with the training columns.
tree.plot_tree(clf, feature_names=feature_cols)  # plot_tree: scikit-learn >= 0.21
# bbox_inches='tight' trims surrounding whitespace; other formats (png, eps, ...)
# work as well.
plt.savefig('tree.svg', format='svg', bbox_inches='tight')