In [None]:

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Load Data
train = pd.read_csv('../input/birdsong-recognition/train.csv')
test = pd.read_csv('../input/birdsong-recognition/test.csv')

In [None]:
# Print the schema of each frame; separate statements avoid displaying a stray (None, None) tuple
train.info()
test.info()

In [None]:
# Count and share of missing values per column, most incomplete first
total = train.isnull().sum().sort_values(ascending=False)
percent = train.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=missing_data.index, y=missing_data['Percent'])
# Rotation must be numeric (the string '90' is rejected by current matplotlib) and is
# applied after plotting so it targets the tick labels the bar plot created.
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of Missing Values', fontsize=15)
plt.title('Percentage of Missing Data by Feature', fontsize=15)
missing_data.head()

In [None]:
# let's go ahead and have a look at how many observations we would drop
print('Total bird records with values in all variables: ', train.dropna().shape[0])
print('Total bird records: ', train.shape[0])

In [None]:
# Load Data
train = pd.read_csv('../input/titanic/train.csv')
train.head()

In [None]:
# Count and share of missing values per column, most incomplete first
total = train.isnull().sum().sort_values(ascending=False)
percent = train.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=missing_data.index, y=missing_data['Percent'])
# Rotation must be numeric (the string '90' is rejected by current matplotlib) and is
# applied after plotting so it targets the tick labels the bar plot created.
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of Missing Values', fontsize=15)
plt.title('Percentage of Missing Data by Feature', fontsize=15)
missing_data.head()

In [None]:
#Let us take Age feature and find the total no of records it has null values
df = train
df['Age'].isnull().sum()

In [None]:
df['Age'].replace(np.NaN,df['Age'].mean()).head(15)

In [None]:
# Median imputation on an independent copy, so `train` keeps its missing values
# and the cells above stay reproducible when re-run.
df_median = train.copy()
# Assign the result back instead of a chained inplace fillna, which may operate on a temporary
df_median['Age'] = df_median['Age'].fillna(df_median['Age'].median())
df_median['Age']

In [None]:
# Load Data
train = pd.read_csv('../input/birdsong-recognition/train.csv')

In [None]:
# Among the categorical features with missing values, "background" is chosen for mode imputation

# Work on an independent copy so the raw `train` frame keeps its missing values
data_cat = train.copy()
# Assign the result back instead of a chained inplace fillna, which may operate on a temporary
data_cat['background'] = data_cat['background'].fillna(data_cat['background'].mode()[0])
data_cat['background'].head()

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])

In [None]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(train, train.Survived, test_size=0.3,random_state=0)
X_train.shape, X_test.shape

In [None]:
# let's make a function to create 3 variables from Age 1-filling NA with median, 2- random sampling or 3- zeroes

def impute_na(df, variable, median):
    """Add three imputed versions of ``df[variable]`` to ``df`` in place.

    New columns:
      * ``<variable>_median`` - missing values replaced by ``median``
      * ``<variable>_zero``   - missing values replaced by 0
      * ``<variable>_random`` - missing values replaced by values sampled
        (``random_state=0``) from the non-null values of the global ``X_train[variable]``

    Returns None; ``df`` is modified.
    """
    missing = df[variable].isnull()

    df[variable+'_median'] = df[variable].fillna(median)
    df[variable+'_zero'] = df[variable].fillna(0)

    random_col = variable+'_random'
    df[random_col] = df[variable]
    # draw one donor value per missing row from the observed training values
    donors = X_train[variable].dropna().sample(missing.sum(), random_state=0)
    # re-label the donors with the index of the missing rows so the assignment aligns
    donors.index = df.index[missing]
    df.loc[missing, random_col] = donors

In [None]:
# Use the training-set median (the original computed the mean under the name `median`)
median = X_train.Age.median()
impute_na(X_train, 'Age', median)
X_train.head(20)

In [None]:
# Let us see the distribution of the Age variable after filling NA with random value
fig = plt.figure()
ax = fig.add_subplot(111)
X_train['Age'].plot(kind='kde', ax=ax)
X_train.Age_random.plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

In [None]:
# Let us see distribution of the Age variable after filling NA with median value
fig = plt.figure()
ax = fig.add_subplot(111)
X_train['Age'].plot(kind='kde', ax=ax)
X_train.Age_median.plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

In [None]:
# Let us see distribution of the Age variable after filling NA with zero value
fig = plt.figure()
ax = fig.add_subplot(111)
X_train['Age'].plot(kind='kde', ax=ax)
X_train.Age_zero.plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])

In [None]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(train, train.Survived, test_size=0.3,random_state=0)
X_train.shape, X_test.shape

In [None]:
#Let us define a function with replacement of NA value with an arbitrary value as 0 and 100
def impute_na(df, variable):
    """Add ``<variable>_zero`` and ``<variable>_hundred`` columns to ``df`` in place.

    Each new column is a copy of ``df[variable]`` with missing values replaced by
    the arbitrary constant 0 or 100 respectively. Returns None.
    """
    for suffix, fill_value in (('_zero', 0), ('_hundred', 100)):
        df[variable+suffix] = df[variable].fillna(fill_value)

In [None]:
# let's replace the NA with the arbitrary values 0 and 100 in the training set
impute_na(X_train, 'Age')
X_train.head()

In [None]:
train = pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])
train.head()

In [None]:
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(train, train.Survived, test_size=0.3,random_state=0)
X_train.shape, X_test.shape

In [None]:
X_train.Age.hist(bins=50)

In [None]:
# far end of the distribution
X_train.Age.mean()+3*X_train.Age.std()

In [None]:
# Let us see if there are a few outliers for Age; according to its distribution, these outliers will be masked when we replace NA by values at the far end
# The column must be passed by keyword: recent seaborn accepts only `data` positionally
sns.boxplot(x='Age', data=train)

In [None]:
def impute_na(df, variable, median, extreme):
    """Impute ``df[variable]`` in place with two strategies.

    * ``<variable>_far_end`` is added: the original values with missing entries
      replaced by ``extreme``.
    * ``df[variable]`` itself is overwritten with missing entries replaced by ``median``.

    Returns None; ``df`` is modified.
    """
    # Build the far-end column first, while the original NaNs are still present
    df[variable+'_far_end'] = df[variable].fillna(extreme)
    # Assign back rather than using a chained inplace fillna, which can act on a
    # temporary object and leave ``df`` unchanged under pandas copy-on-write
    df[variable] = df[variable].fillna(median)

In [None]:
# let's replace the NA in the training set: Age gets the median, Age_far_end gets the far-end value (mean + 3 standard deviations)
impute_na(X_train, 'Age', X_train.Age.median(), X_train.Age.mean()+3*X_train.Age.std())
X_train.head(20)

In [None]:
# As you can see an accumulation of values around the median for the median imputation
X_train.Age.hist(bins=50)

In [None]:
# Now finally let us see an accumulation of values at the far end imputation
X_train.Age_far_end.hist(bins=50)

In [None]:
# Far end imputation now indicates that there are no outliers in the variable as shown below
# The column must be passed by keyword: recent seaborn accepts only `data` positionally
sns.boxplot(x='Age_far_end', data=X_train)


![](https://miro.medium.com/max/2560/1*wYbTRM0dgnRzutwZq63xCg.png)


In [None]:
import pandas as pd

train_df = pd.read_csv('../input/titanic/train.csv', usecols=['Sex'])
train_df.head()

In [None]:
pd.get_dummies(train_df).head()

In [None]:
# For a clear understanding let us visualise like below
pd.concat([train_df, pd.get_dummies(train_df)], axis=1).head()

In [None]:
pd.get_dummies(train_df, drop_first=True).head()

In [None]:
train_df=pd.read_csv('../input/titanic/train.csv', usecols=['Embarked'])
train_df.head()

In [None]:
# Let us check the unique Embarked labels
train_df.Embarked.unique()

In [None]:
# Now let us get the complete set of dummy variables for embarked feature

pd.get_dummies(train_df).head()

In [None]:
# Now let us get k-1 dummy variables

pd.get_dummies(train_df, drop_first=True).head()

In [None]:
# Also to mention we may have some missing values in this feature so it is better to include an additional dummy variable to indicate whether there was missing data

pd.get_dummies(train_df, drop_first=True, dummy_na=True).head()

In [None]:
# Now let us find out how many observations we have for each variable (i.e., each category)

pd.get_dummies(train_df, drop_first=True, dummy_na=True).sum(axis=0)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
train_df=pd.read_csv('../input/titanic/train.csv')
train_df.head()

In [None]:
# Now let us make a copy of the above dataset, in which we encode the categorical variables using one-hot encoding

train_df_OneHotEncoder = pd.concat([train_df[['Pclass', 'Age', 'SibSp','Parch', 'Survived']], # Choosen the numerical variables
                      pd.get_dummies(train_df.Sex, drop_first=True),   # Sex as explained above which is binary categorical variable
                      pd.get_dummies(train_df.Embarked, drop_first=True)],  # Embarked as explained above has k categories in categorical
                    axis=1)

train_df_OneHotEncoder.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_df_OneHotEncoder[['Pclass', 'Age', 'SibSp',
                                                              'Parch', 'male', 'Q', 'S']].fillna(0),
                                                    train_df_OneHotEncoder.Survived,
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

In [None]:
# let's build a random forest model with the above data

rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=3)
rf.fit(X_train, y_train)
print('Train set')
pred = rf.predict_proba(X_train)
print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
print('Test set')
pred = rf.predict_proba(X_test)
print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [None]:
# Now let us build a AdaBoost classifier

ada = AdaBoostClassifier(n_estimators=200, random_state=44)
ada.fit(X_train, y_train)
print('Train set')
pred = ada.predict_proba(X_train)
print('AdaBoost roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
print('Test set')
pred = ada.predict_proba(X_test)
print('AdaBoost roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [None]:
# Finally with logistic regression
logit = LogisticRegression(random_state=44)
logit.fit(X_train, y_train)
print('Train set')
pred = logit.predict_proba(X_train)
print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
print('Test set')
pred = logit.predict_proba(X_test)
print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [None]:
# let's load the variable Cabin of the titanic dataset

train_df=pd.read_csv('../input/titanic/train.csv', usecols = ['Cabin'])
train_df.head()

In [None]:
# Now let's inspect the number of unique labels in Cabin feature
print('Number of unique labels in Cabin Feature: {}'.format(len(train_df.Cabin.unique())))

In [None]:
# Now let us see how many features we can create if we did One Hot Encoder for Cabin feature
Cabin_OneHotEncoder = pd.get_dummies(train_df.Cabin)
Cabin_OneHotEncoder.shape

In [None]:
Cabin_OneHotEncoder.head()

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

data = pd.read_csv('../input/titanic/train.csv', usecols=['Embarked', 'Survived'])
data.head()

In [None]:
# let's have a look at how many labels

for col in data.columns[1:]:
    print(col, ': ', len(data[col].unique()), ' labels')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data[['Embarked']], data.Survived,
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

In [None]:
# Let's obtain the counts for each one of the labels in variable Embarked and capture this in a dictionary that we can use to re-map the labels

X_train.Embarked.value_counts().to_dict()

In [None]:
# lets look at X_train so we can compare then the variable re-coding

X_train.head()

In [None]:
# And now let's replace each label in Embarked by its count. First we make a dictionary that maps each label to its count
X_frequency_map = X_train.Embarked.value_counts().to_dict()

# and now we replace the Embarked labels in both the train and test set with the same map
X_train.Embarked = X_train.Embarked.map(X_frequency_map)
X_test.Embarked = X_test.Embarked.map(X_frequency_map)

X_train.head()

In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
pd.options.display.float_format = '{:.2f}'.format

df = pd.read_csv('../input/titanic/train.csv', usecols=['Embarked', 'Survived'])
X = df.drop('Survived', axis = 1)
y = df.drop('Embarked', axis = 1)

In [None]:
binary_encoder = ce.BinaryEncoder(cols = ['Embarked'])
binary_encoder.fit_transform(X, y)

In [None]:
ordinal_encoder = ce.OrdinalEncoder(cols = ['Embarked'])
ordinal_encoder.fit_transform(X, y['Survived'])

In [None]:
BaseN_encoder = ce.BaseNEncoder(cols = ['Embarked'])
BaseN_encoder.fit_transform(X, y)

In [None]:
Hashing_encoder = ce.HashingEncoder(cols = ['Embarked'])
Hashing_encoder.fit_transform(X, y)

In [None]:
Sum_encoder = ce.SumEncoder(cols = ['Embarked'])
Sum_encoder.fit_transform(X, y)

In [None]:
ce_helmert = ce.HelmertEncoder(cols = ['Embarked'])
ce_helmert.fit_transform(X, y)

In [None]:
ce_backward = ce.BackwardDifferenceEncoder(cols = ['Embarked'])
ce_backward.fit_transform(X, y)

In [None]:
ce_poly = ce.PolynomialEncoder(cols = ['Embarked'])
ce_poly.fit_transform(X, y)

In [None]:
ce_target = ce.TargetEncoder(cols = ['Embarked'])
ce_target.fit(X, y)
ce_target.transform(X, y)

In [None]:
# Target with higher smoothing
ce_target_leaf = ce.TargetEncoder(cols = ['Embarked'], smoothing = 10)
ce_target_leaf.fit(X, y)
ce_target_leaf.transform(X, y)

In [None]:
# Target with lower smoothing
ce_target_leaf = ce.TargetEncoder(cols = ['Embarked'], smoothing = .10)
ce_target_leaf.fit(X, y)
ce_target_leaf.transform(X, y)

In [None]:
!pip install chart-studio

In [None]:
!pip install woe

In [None]:
# Import packages
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
import os
import woe
from woe.eval import plot_ks
print(os.listdir("../input"))
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
import warnings
warnings.filterwarnings('ignore')
import chart_studio.plotly.plotly as py
import chart_studio.plotly
max_bin = 20
force_bin = 3

In [None]:
df = pd.read_csv('../input/uci-credit-carefrom-python-woe-pkg/UCI_Credit_Card.csv',sep=',')

In [None]:
df.head()

In [None]:
df.info()

Define a binning function for continuous independent variables

In [None]:
def mono_bin(Y, X, n = max_bin):
    """Bin a continuous predictor and compute Weight of Evidence / Information Value.

    Parameters
    ----------
    Y : array-like
        Target values; summed per bin to count events (assumes 0/1 coding - TODO confirm).
    X : array-like
        Continuous predictor, may contain missing values.
    n : int
        Starting number of quantile bins (defaults to the module-level ``max_bin``).

    Returns
    -------
    pandas.DataFrame
        One row per bin (plus one row for missing ``X`` when present) with the columns
        VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT, EVENT_RATE, NONEVENT,
        NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV. ``IV`` holds the total
        information value repeated on every row; infinite values are replaced by 0.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    d2 = None
    # Reduce the number of quantile bins until the per-bin means of X and Y are
    # perfectly rank-correlated (|Spearman r| == 1). The `n >= 1` guard stops the
    # search instead of looping forever when qcut keeps failing.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        except Exception:
            # binning with this n failed (e.g. non-unique quantile edges); retry with fewer bins
            pass
        n = n - 1

    # Fall back to `force_bin` quantile edges when no usable monotonic binning was
    # found: nothing was binned, the search was exhausted, or only one bucket is left.
    if d2 is None or np.abs(r) < 1 or len(d2) == 1:
        n = force_bin
        # public NumPy equivalent of the private pandas.core.algorithms.quantile
        bins = np.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    counts = d2['Y'].count()
    events = d2['Y'].sum()
    d3 = pd.DataFrame({
        "MIN_VALUE": d2['X'].min(),
        "MAX_VALUE": d2['X'].max(),
        "COUNT": counts,
        "EVENT": events,
        "NONEVENT": counts - events,
    })
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        # extra row describing the observations where X is missing
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_event,
            "NONEVENT": miss_count - miss_event,
        }, index=[0])
        # DataFrame.append was removed in pandas 2.0; concat gives the same result
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # bins with zero events or zero non-events yield +/-inf; neutralise them
    d3 = d3.replace([np.inf, -np.inf], 0)
    # every row carries the variable's total information value
    d3["IV"] = d3.IV.sum()

    return(d3)


In [None]:
def char_bin(Y, X):
    """Compute Weight of Evidence / Information Value for a categorical predictor.

    Parameters
    ----------
    Y : array-like
        Target values; summed per category to count events (assumes 0/1 coding - TODO confirm).
    X : array-like
        Categorical (or low-cardinality) predictor, may contain missing values.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``X`` (plus one row for missing ``X`` when
        present) with the columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT,
        EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV.
        MIN_VALUE and MAX_VALUE both hold the category label. ``IV`` holds the total
        information value repeated on every row; infinite values are replaced by 0.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    df2 = notmiss.groupby('X',as_index=True)

    counts = df2['Y'].count()
    events = df2['Y'].sum()
    d3 = pd.DataFrame({"COUNT": counts})
    # the group labels live in the index; expose them as MIN/MAX for parity with mono_bin
    d3["MIN_VALUE"] = d3.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = events
    d3["NONEVENT"] = counts - events

    if len(justmiss.index) > 0:
        # extra row describing the observations where X is missing
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_event,
            "NONEVENT": miss_count - miss_event,
        }, index=[0])
        # DataFrame.append was removed in pandas 2.0; concat gives the same result
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    # sum only the needed column: summing the whole frame would also try to add up
    # the label columns, which fails for string labels mixed with NaN
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # categories with zero events or zero non-events yield +/-inf; neutralise them
    d3 = d3.replace([np.inf, -np.inf], 0)
    # every row carries the variable's total information value
    d3["IV"] = d3.IV.sum()
    d3 = d3.reset_index(drop=True)

    return(d3)

In [None]:
def data_vars(df1, target):
    """Compute WOE / IV tables for every predictor column of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the predictors (and possibly the target column itself).
    target : array-like or pandas.Series
        Target values passed on to ``mono_bin`` / ``char_bin``.

    Returns
    -------
    (iv_df, iv) : tuple of pandas.DataFrame
        ``iv_df`` stacks the per-bin tables of all variables; ``iv`` has one row per
        variable with its information value.

    Raises
    ------
    ValueError
        If the target column name cannot be determined or no predictor column is left.
    """
    # Identify the target column so it is not treated as a predictor. Prefer the
    # Series name; fall back to parsing the caller's source line (original behaviour)
    # for unnamed inputs such as NumPy arrays.
    final = getattr(target, 'name', None)
    if not isinstance(final, str):
        stack = traceback.extract_stack()
        filename, lineno, function_name, code = stack[-2]
        if not code:
            raise ValueError("cannot infer the target column name; pass a named pandas Series as target")
        vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
        # strip quotes so that df['y'] in the call text yields y
        final = (re.findall(r"[\w']+", vars_name))[-1].strip("'")
    target_name = final.upper()

    tables = []
    for i in df1.columns:
        # exact, case-insensitive match (the original substring test also skipped
        # any column whose name happened to be contained in the target name)
        if str(i).upper() == target_name:
            continue
        # numeric columns with more than two distinct values are binned monotonically;
        # everything else is treated as categorical
        if np.issubdtype(df1[i].dtype, np.number) and len(df1[i].unique()) > 2:
            conv = mono_bin(target, df1[i])
        else:
            conv = char_bin(target, df1[i])
        conv["VAR_NAME"] = i
        tables.append(conv)

    if not tables:
        raise ValueError("no predictor columns to analyse")
    # single concat instead of repeated DataFrame.append (removed in pandas 2.0)
    iv_df = pd.concat(tables, ignore_index=True)

    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [None]:
final_iv, IV = data_vars(df,df.target)

In [None]:
final_iv

In [None]:
IV.sort_values('IV',ascending=False)

In [None]:
# Using the category encoder library
ce_WOE = ce.WOEEncoder(cols = ['Embarked'])
ce_WOE.fit(X, y)
ce_WOE.transform(X, y)

In [None]:
# let's load again the titanic dataset

data = pd.read_csv('../input/titanic/train.csv', usecols=['Cabin', 'Survived'])
data.head()

In [None]:
# let's first fill NA values with an additional label

data.Cabin.fillna('Missing', inplace=True)
data.head()

In [None]:
# Cabin has indeed a lot of labels, here for simplicity, I will capture the first letter of the cabin, but the procedure could be done as well without any prior variable manipulation

len(data.Cabin.unique())

In [None]:
# Now we extract the first letter of the cabin
data['Cabin'] = data['Cabin'].astype(str).str[0]
data.head()

In [None]:
# check the labels
data.Cabin.unique()

In [None]:
# The calculation of the WoE to replace the labels should be done considering ONLY the training set, and then applied to the test set.
# Let's divide into train and test set

X_train, X_test, y_train, y_test = train_test_split(data[['Cabin', 'Survived']], data.Survived, test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

In [None]:
# Now we calculate the probability of target=1
X_train.groupby(['Cabin'])['Survived'].mean()

In [None]:
# Let's make a dataframe with the above calculation

prob_df = X_train.groupby(['Cabin'])['Survived'].mean()
prob_df = pd.DataFrame(prob_df)
prob_df

In [None]:
# and now the probability of target = 0 and we add it to the dataframe

prob_df = X_train.groupby(['Cabin'])['Survived'].mean()
prob_df = pd.DataFrame(prob_df)
prob_df['Died'] = 1-prob_df.Survived
prob_df

In [None]:
# Since the log of zero and division by zero are not defined, set either probability to something small and non-zero

prob_df.loc[prob_df.Survived == 0, 'Survived'] = 0.00001
# categories where everybody survived would otherwise divide by zero in the WoE ratio
prob_df.loc[prob_df.Died == 0, 'Died'] = 0.00001
prob_df

In [None]:
# Finally it is time to  calculate the Weight of Evidence (WoE)

prob_df['WoE'] = np.log(prob_df.Survived/prob_df.Died)
prob_df

In [None]:
# Let us create a dictionary to re-map the variable

prob_df['WoE'].to_dict()

In [None]:
# Now we make a dictionary that maps the original variable to the WoE, and capture it in a variable

ordered_labels = prob_df['WoE'].to_dict()
# Replace the labels with the above label for WoE

X_train['Cabin_ordered'] = X_train.Cabin.map(ordered_labels)
X_test['Cabin_ordered'] = X_test.Cabin.map(ordered_labels)
X_train.head()

In [None]:
# Plot the original variable
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.figure()
fig = X_train.groupby(['Cabin'])['Survived'].mean().plot()
fig.set_title('Normal relationship between variable and target')
fig.set_ylabel('Survived')

In [None]:
# Plot the transformed result: the monotonic variable

fig = plt.figure()
fig = X_train.groupby(['Cabin_ordered'])['Survived'].mean().plot()
fig.set_title('Monotonic relationship between variable and target')
fig.set_ylabel('Survived')

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
data = pd.read_csv('../input/titanic/train.csv', usecols=['Embarked', 'Survived'])
data.head()

In [None]:
# let's check at the different number of labels within each variable
cols_to_use = ['Embarked']

for col in cols_to_use:
    print('Variable: ', col, ' Number of Labels: ', len(data[col].unique()))

print('Total passengers: ', len(data))

In [None]:
# let's plot how frequently appears each label within a variable in the dataset

total_passengers = len(data)

for col in cols_to_use:
    # count the number of observations per label and divide by total
    # number of passengers
    temp_df = pd.Series(data[col].value_counts() / total_passengers)

    # make plot with the above percentages
    fig = temp_df.sort_values(ascending=False).plot.bar()
    fig.set_xlabel(col)
    fig.set_ylabel('Percentage of Passengers')
    plt.show()

In [None]:
# I will work first with the variable Embarked

# Let's calculate again the frequency of the different categories/labels in Embarked

temp_df = pd.Series(data['Embarked'].value_counts() / total_passengers).reset_index()
temp_df.columns = ['Embarked', 'Percentage of Passengers']
temp_df

In [None]:
# Now let's calculate the mean survival rate for each label in Embarked

data.groupby(['Embarked'])['Survived'].mean().reset_index()

In [None]:
ce_leave = ce.LeaveOneOutEncoder(cols = ['Embarked'])
ce_leave.fit(X, y)
ce_leave.transform(X, y)

In [None]:
ce_James = ce.JamesSteinEncoder(cols = ['Embarked'])
ce_James.fit(X, y)
ce_James.transform(X, y)

In [None]:
ce_M_Estimator = ce.MEstimateEncoder(cols = ['Embarked'])
ce_M_Estimator.fit(X, y)
ce_M_Estimator.transform(X, y)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
# load the numerical variables of the Titanic Dataset

train_data = pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])
train_data.head()

In [None]:
# first fill the missing data of the variable age, with a random sample of the variable

def impute_na(data, variable):
    """Return ``data[variable]`` with missing values filled by random sampling.

    The replacement values are drawn (``random_state=0``) from the non-null values
    of the same column. ``data`` is not modified; the returned Series is named
    ``<variable>_random`` and shares the index of ``data``.
    """
    random_col = variable+'_random'
    df = data.copy()
    df[random_col] = df[variable]

    missing = df[variable].isnull()

    # one donor value per missing row, taken from the observed values
    donors = df[variable].dropna().sample(missing.sum(), random_state=0)

    # give the donors the index of the missing rows so the assignment aligns
    donors.index = df.index[missing]
    df.loc[missing, random_col] = donors

    return df[random_col]

In [None]:
# fill nul values for Age
train_data['Age'] = impute_na(train_data, 'Age')

In [None]:
# Plot the histograms to have a quick look at the distributions and  plot Q-Q plots to visualise if the variable is normally distributed

def diagnostic_plots(df, variable):
    """Show a histogram and a normal Q-Q plot of ``df[variable]`` side by side."""
    series = df[variable]

    plt.figure(figsize=(15,6))

    # left panel: histogram of the raw values
    plt.subplot(121)
    series.hist()

    # right panel: Q-Q plot against the normal distribution
    plt.subplot(122)
    stats.probplot(series, dist="norm", plot=pylab)

    plt.show()

diagnostic_plots(train_data, 'Age')

In [None]:
train_data['Age_log'] = np.log(train_data.Age)
diagnostic_plots(train_data, 'Age_log')

In [None]:
train_data['Age_reciprocal'] = 1 / train_data.Age
diagnostic_plots(train_data, 'Age_reciprocal')

In [None]:
train_data['Age_sqr'] =train_data.Age**(1/2)
diagnostic_plots(train_data, 'Age_sqr')

In [None]:
train_data['Age_exp'] = train_data.Age**(1/1.2)
diagnostic_plots(train_data, 'Age_exp')

In [None]:
train_data['Age_yeojohnson'], param = stats.yeojohnson(train_data.Age)
print('Optimal λ: ', param)
diagnostic_plots(train_data, 'Age_yeojohnson')

In [None]:
train_data['Age_boxcox'], param = stats.boxcox(train_data.Age)
print('Optimal λ: ', param)
diagnostic_plots(train_data, 'Age_boxcox')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
equal_width_data = pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])
equal_width_data.head()

In [None]:
# Let's separate into train and test set

X_train, X_test, y_train, y_test = train_test_split(equal_width_data[['Age', 'Fare', 'Survived']], equal_width_data.Survived, test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

In [None]:
# replace NA in both  train and test sets

X_train['Age'] = impute_na(equal_width_data, 'Age')
X_test['Age'] = impute_na(equal_width_data, 'Age')

In [None]:
# Let us remind ourselves of the distribution of Age
equal_width_data.Age.hist()

In [None]:
fig = plt.figure()
fig = equal_width_data.groupby(['Age'])['Survived'].mean().plot()
fig.set_title('Normal relationship between Age and Survived')
fig.set_ylabel('Survived')

In [None]:
fig = plt.figure()
fig = equal_width_data.groupby(['Age'])['Survived'].count().plot()
fig.set_title('Number of people per year age bin')
fig.set_ylabel('Survived')

In [None]:
# Let us capture the range of the variable age to begin with

Age_range = X_train.Age.max() - X_train.Age.min()
Age_range

In [None]:
# Now let us capture the lower and upper boundaries

min_value = int(np.floor(X_train.Age.min()))
max_value = int(np.ceil(X_train.Age.max()))

# let's round the bin width
inter_value = int(np.round(Age_range/10))

min_value, max_value, inter_value

In [None]:
# Let us capture the interval limits, so we can pass them to the pandas cut function to generate the bins

intervals = [i for i in range(min_value, max_value+inter_value, inter_value)]
intervals

In [None]:
# let's make labels to label the different bins
labels = ['Bin_'+str(i) for i in range(1,len(intervals))]
labels

In [None]:
# create Binned age groups

# create one column with labels
X_train['Age_disc_labels'] = pd.cut(x = X_train.Age, bins=intervals, labels=labels, include_lowest=True)

# and one with bin boundaries
X_train['Age_disc'] = pd.cut(x = X_train.Age, bins=intervals, include_lowest=True)

X_train.head()

In [None]:
X_train.groupby('Age_disc')['Age'].count()

In [None]:
X_train.groupby('Age_disc')['Age'].count().plot.bar()

In [None]:
X_test['Age_disc_labels'] = pd.cut(x = X_test.Age, bins=intervals, labels=labels, include_lowest=True)
X_test['Age_disc'] = pd.cut(x = X_test.Age, bins=intervals,  include_lowest=True)

X_test.head()

In [None]:
# If the distributions in train and test set are similar, we should expect similar distribution of observations in the different intervals in the train and test set.

# share of observations per Age interval; the built-in float replaces np.float, which was removed in NumPy 1.24
t1 = X_train.groupby(['Age_disc'])['Survived'].count() / float(len(X_train))
t2 = X_test.groupby(['Age_disc'])['Survived'].count() / float(len(X_test))
temp = pd.concat([t1,t2], axis=1)
temp.columns = ['train', 'test']
temp.plot.bar()

In [None]:
# Now let's observe the relationship between age and survival again, using the discrete Age transformed variable

fig = plt.figure()
fig = X_train.groupby(['Age_disc'])['Survived'].mean().plot(figsize=(12,6))
fig.set_title('Normal relationship between variable and target')
fig.set_ylabel('Survived')

In [None]:
fig = plt.figure()
fig = X_train.groupby(['Age_disc'])['Survived'].count().plot(figsize=(12,6))
fig.set_title('Number of Passengers within each Age bin')
fig.set_ylabel('No of Passengers')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
# Reload only the columns needed for the equal-frequency discretisation demo
equal_freq_data= pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])
equal_freq_data.head()

In [None]:
# Hold out 30% of the rows as a test set (random_state=0 keeps the split reproducible).
# Survived stays in the feature frames too, because later cells group by it.

X_train, X_test, y_train, y_test = train_test_split(
    equal_freq_data[['Age', 'Fare', 'Survived']],
    equal_freq_data['Survived'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

In [None]:
# replace NA in both train and test sets
# NOTE(review): impute_na is defined outside this section. It is called on the full
# dataset and its result is assigned to each split; presumably it returns an Age Series
# with missing values filled, which pandas then aligns on the row index - confirm
# against its definition.

X_train['Age'] = impute_na(equal_freq_data, 'Age')
X_test['Age'] = impute_na(equal_freq_data, 'Age')

In [None]:
# let's remind ourselves of the original distribution
# (histogram of Age over the full, unsplit dataset)

equal_freq_data.Age.hist()

In [None]:
# We will divide Age into 4 equal-frequency bins (quartiles): qcut (quantile cut) is called with q=4,
# which gives 4 bins delimited by 5 edges. retbins=True also returns those edges, so the same limits
# can later be used to cut another dataset. The second line shows each row's bin next to its Age.

Age_disccretised, intervals = pd.qcut(equal_freq_data.Age, 4, labels=None, retbins=True, precision=3, duplicates='raise')
pd.concat([Age_disccretised, equal_freq_data.Age], axis=1).head(10)

In [None]:
# Bin edges returned by qcut above (retbins=True)
intervals

In [None]:
# Calculate number of passengers per bin
# (pair each row's bin with its Age, then count non-null ages within each bin)
temp = pd.concat([Age_disccretised, equal_freq_data.Age], axis=1)
temp.columns = ['Age_discretised', 'Age']
temp.groupby('Age_discretised')['Age'].count()

In [None]:
# We can also add labels instead of having the interval boundaries, to the bins, as follows:
# (four labels are supplied because q=4 produces four bins)

Age_disccretised, intervals = pd.qcut(equal_freq_data.Age, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'], retbins=True, precision=3, duplicates='raise')
Age_disccretised.head()

In [None]:
# For the rest of the exercise Age is split into 10 quantile bins (deciles).
# Both the labelled version and the interval-boundary version are kept for comparison.

# one label per decile: Q1 .. Q10
labels = ['Q{}'.format(decile) for decile in range(1, 11)]
print(labels)

# deciles identified by label
X_train['Age_disc_label'], bins = pd.qcut(
    x=X_train['Age'],
    q=10,
    labels=labels,
    retbins=True,
    precision=3,
    duplicates='raise',
)

# deciles identified by their interval boundaries; `bins` keeps the edges for the test set
X_train['Age_disc'], bins = pd.qcut(
    x=X_train['Age'],
    q=10,
    retbins=True,
    precision=3,
    duplicates='raise',
)


X_train.head(10)

In [None]:
# Cut the test set with the decile edges learned on the training set.
# include_lowest=True makes the first interval closed on the left, exactly as qcut
# does on the training set. Without it, a test Age equal to the lowest edge becomes
# NaN and the first interval differs between train and test, so per-bin comparisons
# do not line up.
# Test ages outside the training range still fall outside every bin and become NaN.
X_test['Age_disc_label'] = pd.cut(x = X_test.Age, bins=bins, labels=labels, include_lowest=True)
X_test['Age_disc'] = pd.cut(x = X_test.Age, bins=bins, include_lowest=True)

X_test.head()

In [None]:
# Number of test observations per bin. The bin edges were derived from the training set,
# so these counts are only expected to be roughly (not exactly) equal.
X_test.groupby('Age_disc')['Age'].count()

In [None]:
# Share of training rows in each Age decile.
# Built-in float() replaces np.float, which was removed in NumPy 1.24.
t1 = X_train.groupby(['Age_disc'])['Survived'].count() / float(len(X_train))
t1

In [None]:
# Share of test rows in each Age decile.
# Built-in float() replaces np.float, which was removed in NumPy 1.24.
t2 = X_test.groupby(['Age_disc'])['Survived'].count() / float(len(X_test))
t2

In [None]:
# Put the train and test shares side by side (rows are matched on the interval index)
temp = pd.concat([t1,t2], axis=1)
temp.columns = ['train', 'test']
temp

In [None]:
# Grouped bars: train vs test share per Age decile
temp.plot.bar()

In [None]:
# Let's observe the relationship between age and survival again, using the discrete Age transformed variable
# (mean of Survived within each Age decile of the training set)

fig = plt.figure()
# NOTE: `fig` is rebound here to the object returned by Series.plot()
fig = X_train.groupby(['Age_disc'])['Survived'].mean().plot(figsize=(12,6))
fig.set_title('Normal relationship between variable and target')
fig.set_ylabel('Survived')

In [None]:
# Rank the decile labels by their mean survival rate in the training set (lowest first)
ordered_labels = (
    X_train.groupby(['Age_disc_label'])['Survived']
    .mean()
    .sort_values()
    .index
)

# Map each label to its rank: 0 for the lowest survival rate, 1 for the next, ...
ordinal_label = dict(zip(ordered_labels, range(len(ordered_labels))))

# Encode both splits with the ranking learned on the training set
X_train['Age_disc_ordered'] = X_train['Age_disc_label'].map(ordinal_label)
X_test['Age_disc_ordered'] = X_test['Age_disc_label'].map(ordinal_label)

X_train.head()

In [None]:
# Plot the monotonic relationship
# (mean survival per ordinal code; the codes were assigned in order of increasing survival rate)
fig = plt.figure()
# NOTE: `fig` is rebound here to the object returned by Series.plot()
fig = X_train.groupby(['Age_disc_ordered'])['Survived'].mean().plot()
fig.set_title('Monotonic relationship between discretised Age and target')
fig.set_ylabel('Survived')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import cross_val_score
# load the numerical variables of the Titanic Dataset
# (Age and Fare, plus the Survived target) for the decision-tree discretisation demo
data_decision_tree = pd.read_csv('../input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])
data_decision_tree.head()

In [None]:
# 70/30 train/test split with a fixed seed; Survived also stays in the feature
# frames because the tree below is fitted on X_train.Survived.

X_train, X_test, y_train, y_test = train_test_split(
    data_decision_tree[['Age', 'Fare', 'Survived']],
    data_decision_tree['Survived'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

In [None]:
# Fill missing Age values in both splits.
# NOTE(review): impute_na is defined outside this section and is called on the full
# dataset; presumably it returns an index-aligned Age Series with NA filled - confirm.
X_train['Age'] = impute_na(data_decision_tree, 'Age')
X_test['Age'] = impute_na(data_decision_tree, 'Age')

In [None]:
# Fit a shallow classification tree (depth 2) that predicts Survived from Age alone,
# then replace each Age by the tree's predicted probability of survival (class 1).
# Every leaf yields one probability, so Age_tree takes only a handful of distinct values.

tree_model = DecisionTreeClassifier(max_depth=2).fit(X_train[['Age']], X_train['Survived'])
X_train['Age_tree'] = tree_model.predict_proba(X_train[['Age']])[:, 1]
X_train.head(10)

In [None]:
# monotonic relationship with target
# (observed mean survival for each distinct probability predicted by the tree)

fig = plt.figure()
# NOTE: `fig` is rebound here to the object returned by Series.plot()
fig = X_train.groupby(['Age_tree'])['Survived'].mean().plot()
fig.set_title('Monotonic relationship between discretised Age and target')
fig.set_ylabel('Survived')

In [None]:
# Number of passengers per probabilistic bucket / bin
# (each distinct Age_tree value is one bucket)

X_train.groupby(['Age_tree'])['Survived'].count().plot.bar()

In [None]:
# Median age within each bucket produced by the tree

X_train.groupby(['Age_tree'])['Age'].median().plot.bar()

In [None]:
# Age limits of the buckets generated by the tree: the minimum and maximum Age observed
# within each probability bucket give an idea of the bucket cut-offs.
# (Both resulting columns are named 'Age': the first is the minimum, the second the maximum.)

pd.concat( [X_train.groupby(['Age_tree'])['Age'].min(),
            X_train.groupby(['Age_tree'])['Age'].max()], axis=1)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# Let us load the titanic dataset (all columns) for the outlier-detection examples

data = pd.read_csv('../input/titanic/train.csv')
data.head()

In [None]:
# Summary statistics of Age (count, mean, std, min, quartiles, max)
data.Age.describe()

In [None]:
# Gaussian approach: outlier boundaries at mean +/- 3 standard deviations of Age
Upper_boundary_limit = data.Age.mean() + 3* data.Age.std()
Lower_boundary_limit = data.Age.mean() - 3* data.Age.std()

# displayed as (upper, lower)
Upper_boundary_limit, Lower_boundary_limit

In [None]:
# IQR approach: inter-quartile range of Age (75th percentile minus 25th percentile)
IQR = data.Age.quantile(0.75) - data.Age.quantile(0.25)

# Fences at 1.5 * IQR below the 25th percentile and above the 75th percentile.
# Despite the "_lower" suffix, Upper_quantile_lower is the UPPER fence.
Lower_quantile_lower = data.Age.quantile(0.25) - (IQR * 1.5)
Upper_quantile_lower = data.Age.quantile(0.75) + (IQR * 1.5)

# displayed as (upper, lower, IQR)
Upper_quantile_lower, Lower_quantile_lower, IQR

In [None]:
# Extreme-outlier variant of the IQR approach: fences at 3 * IQR instead of 1.5 * IQR
IQR = data.Age.quantile(0.75) - data.Age.quantile(0.25)

Lower_quantile = data.Age.quantile(0.25) - (IQR * 3)
Upper_quantile = data.Age.quantile(0.75) + (IQR * 3)

# displayed as (upper, lower, IQR)
Upper_quantile, Lower_quantile, IQR

In [None]:
# Keep only rows with a known Age so the fractions below are relative to passengers
# whose age is recorded.
data = data.dropna(subset=['Age'])

# Built-in float() replaces np.float, which was removed in NumPy 1.24.
total_passengers = float(data.shape[0])

# Fraction of passengers above each hard-coded threshold (73, 65 and 91 years).
# NOTE(review): these thresholds look like rounded versions of the boundaries computed
# in the preceding cells - confirm. All three comparisons use a strict ">" to match
# the "older than" wording of the messages.
print('Passengers older than 73 years old (Gaussian approach): {}'.format(data[data.Age > 73].shape[0] / total_passengers))
print('Passengers older than 65 years (IQR): {}'.format(data[data.Age > 65].shape[0] / total_passengers))
print('Passengers older than 91 years (IQR, extreme): {}'.format(data[data.Age > 91].shape[0] / total_passengers))

In [None]:
# Rows flagged as outliers by the 1.5 * IQR rule: Age strictly below the lower fence or strictly above the upper fence
data[(data.Age<Lower_quantile_lower)|(data.Age>Upper_quantile_lower)]

In [None]:
# Keep every row that is NOT an outlier under the 1.5 * IQR rule.
# Outliers are the rows strictly outside the fences, so the fences themselves must be
# kept: between() is inclusive on both ends. The earlier strict comparison silently
# dropped rows lying exactly on a fence even though they are not reported as outliers.
data_with_no_outlier = data[data.Age.between(Lower_quantile_lower, Upper_quantile_lower)]
data_with_no_outlier

In [None]:
# Load the house-prices training data for the percentile-based outlier example
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df.head()

In [None]:
# Upper cut-off: the 95th percentile of SalePrice
upper_threshold = df['SalePrice'].quantile(0.95)
upper_threshold

In [None]:
# Lower cut-off: the 5th percentile of SalePrice
lower_threshold = df['SalePrice'].quantile(0.05)
lower_threshold

In [None]:
# Rows treated as outliers: SalePrice strictly below the 5th or strictly above the 95th percentile
df[(df.SalePrice<lower_threshold)|(df.SalePrice>upper_threshold)]

In [None]:
# Keep every row that is NOT an outlier under the percentile rule.
# Outliers are the rows strictly outside [lower_threshold, upper_threshold], so rows
# whose SalePrice equals a threshold must be kept: between() is inclusive on both ends.
# The earlier strict comparison dropped such rows although they are not listed as outliers.
data_with_no_outlier_percentile_approach = df[df.SalePrice.between(lower_threshold, upper_threshold)]
data_with_no_outlier_percentile_approach

In [None]:
# Fresh copy of the Titanic training data for the z-score outlier example
data_zscore = pd.read_csv('../input/titanic/train.csv')

In [None]:
# Mean and standard deviation of Age, the two ingredients of the z-score
data_zscore.Age.mean(),data_zscore.Age.std()

In [None]:
# z-score of Age: how many standard deviations each passenger's age lies from the mean
data_zscore['zscore'] = ( data_zscore.Age - data_zscore.Age.mean() ) / data_zscore.Age.std()
data_zscore.head(5)

In [None]:
# Manual check of the z-score formula for an Age of 22.
# The mean and standard deviation are taken from the data instead of being typed in
# as rounded constants, so the check stays consistent with the zscore column.
(22 - data_zscore.Age.mean()) / data_zscore.Age.std()

In [None]:
# Rows whose Age lies more than 3 standard deviations from the mean (|z| > 3)
data_zscore[(data_zscore.zscore>3) | (data_zscore.zscore<-3)]

In [None]:
# Keep rows with -3 < z < 3. Rows whose z-score is NaN (e.g. Age missing) fail both
# comparisons and are therefore dropped as well.
df_no_outliers_zscore = data_zscore[(data_zscore.zscore>-3) & (data_zscore.zscore<3)]
df_no_outliers_zscore.shape

In [None]:
# Fresh copy of the Titanic training data for the visual (histogram / boxplot) outlier check
data_box_plot = pd.read_csv('../input/titanic/train.csv')

#### Let us first look at the distribution of the Age feature using a histogram, as shown below

In [None]:
# Histogram of Age with 50 bins; `fig` holds the object returned by Series.hist(), used to set title and labels
fig = data_box_plot.Age.hist(bins=50)
fig.set_title('Age Distribution')
fig.set_xlabel('Age')
fig.set_ylabel('Number of Passengers')

Now let us look at the boxplot of the Age feature

In [None]:
# Boxplot of Age over all passengers (not grouped by any other column).
# The plot now has a title, and the x label no longer says 'Survived', which was
# misleading because the data is not split by survival. The tick label already names
# the plotted column.
fig = data_box_plot.boxplot(column='Age')
fig.set_title('Age Boxplot')
fig.set_xlabel('')
fig.set_ylabel('Age')

In [None]:
import pandas as pd
import seaborn as sns
# Reload the house-prices training data (rebinds `train`) for the scatter-plot outlier check
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
# 2x2 grid of scatter plots with a fitted regression line: SalePrice against four
# area features, to spot points that sit far from the overall trend.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(14,10))

# One feature per panel, in reading order. Plotting straight from `train` replaces four
# copy-pasted two-column frames whose variable names did not match the columns they held.
area_features = ['GarageArea', 'TotalBsmtSF', 'GrLivArea', 'BsmtFinSF1']
for feature, ax in zip(area_features, (ax1, ax2, ax3, ax4)):
    sns.regplot(x=feature, y='SalePrice', data=train, scatter=True, fit_reg=True, ax=ax)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,RobustScaler,Normalizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
# load the numerical variables of the Titanic Dataset
# (Pclass, Age and Fare as features, plus the Survived target) for the scaling examples
data = pd.read_csv('../input/titanic/train.csv', usecols = ['Pclass', 'Age', 'Fare', 'Survived'])
data.head()

In [None]:
# Summary statistics per column, to compare the raw value ranges before scaling
data.describe()

In [None]:
# let's look at missing data (number of null values per column)
data.isnull().sum()

In [None]:
# 70/30 train/test split with a fixed seed. Here the feature frames hold only
# Pclass, Age and Fare; Survived is passed separately as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data[['Pclass', 'Age', 'Fare']],
    data['Survived'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

In [None]:
# let's fill first the missing data

# The median is learned on the training set only and reused for the test set.
age_median = X_train['Age'].median()

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# `X_train.Age` works on an intermediate Series: it is flagged as chained assignment by
# recent pandas and does not modify the frame once copy-on-write is active.
X_train = X_train.assign(Age=X_train['Age'].fillna(age_median))
X_test = X_test.assign(Age=X_test['Age'].fillna(age_median))

In [None]:
# StandardScaler standardises each column: it removes the mean and scales to unit variance
scaler = StandardScaler() # create an object
X_train_scaled = scaler.fit_transform(X_train) # fit the scaler to the train set, and then transform it

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (after standardisation we expect means of ~0 and standard deviations of ~1 in every column)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (standardisation does not fix the minimum and maximum, so they differ per column)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at the distribution of the transformed variable Age
# (column 1 of the scaled array: the columns follow X_train's order Pclass, Age, Fare)
plt.hist(X_train_scaled[:,1], bins=20)

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
# MinMaxScaler rescales each column linearly onto a fixed range ([0, 1] with the default settings used here)
minmaxscaler = MinMaxScaler() # create an object
X_train_scaled = minmaxscaler.fit_transform(X_train) # fit the scaler to the train set, and then transform it

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (min-max scaling does not centre the data, so means and stds are not 0 and 1)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (expected: 0 and 1 for every column after min-max scaling with default settings)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
# MaxAbsScaler divides each column by its maximum absolute value (no centring)
maxscaler = MaxAbsScaler() # create an object
X_train_scaled = maxscaler.fit_transform(X_train) # fit the scaler to the train set, and then transform it

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (max-abs scaling does not centre the data, so the means are not expected to be 0)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (expected: a maximum absolute value of 1 in every column)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
# RobustScaler centres each column on its median and scales by a quantile range
# (the inter-quartile range with the default settings used here), which makes it
# less sensitive to outliers than mean/std scaling
robustscaler = RobustScaler() # create an object
X_train_scaled = robustscaler.fit_transform(X_train) # fit the scaler to the train set, and then transform it

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (robust scaling normalises median and quantile range, so means/stds are not expected to be 0/1)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (robust scaling leaves the range unbounded, so extreme values remain visible here)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
# Unlike the column-wise scalers above, Normalizer works per ROW (sample): each row is
# rescaled to unit norm (L2 with the default settings used here), so nothing is learned
# from the training set during fit.
normalizer = Normalizer() # create an object
X_train_scaled = normalizer.fit_transform(X_train) # fit_transform is used for API symmetry; each row is normalised independently

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (row-wise normalisation does not standardise the columns, so these are informational only)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (every row has unit norm, so no single entry can exceed 1 in absolute value)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
from sklearn.preprocessing import QuantileTransformer
# QuantileTransformer maps each column through its empirical quantiles
# (onto a uniform distribution with the default settings used here)
quantileTransformer = QuantileTransformer()
X_train_scaled = quantileTransformer.fit_transform(X_train) # fit the scaler to the train set, and then transform it

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (per-column statistics of the quantile-transformed training set)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (expected: 0 and 1 for a uniform output distribution)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
from sklearn.preprocessing import PowerTransformer
# PowerTransformer applies a power transform to make each column more Gaussian-like
# (Yeo-Johnson followed by standardisation with the default settings used here)
powerTransformer = PowerTransformer()
X_train_scaled = powerTransformer.fit_transform(X_train) # fit the scaler to the train set, and then transform it

In [None]:
#let's have a look at the scaled training dataset: mean and standard deviation
# (with the default standardisation step we expect means of ~0 and stds of ~1)
print('means (Pclass, Age and Fare): ', X_train_scaled.mean(axis=0))
print('std (Pclass, Age and Fare): ', X_train_scaled.std(axis=0))

In [None]:
# let's look at the transformed min and max values
# (the power transform does not bound the output range)
print('Min values (Pclass, Age and Fare): ', X_train_scaled.min(axis=0))
print('Max values (Pclass, Age and Fare): ', X_train_scaled.max(axis=0))

In [None]:
# let's look at how transformed age looks like compared to the original variable
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# (the first positional slot is `data`), so the positional call fails there.
sns.jointplot(x=X_train.Age, y=X_train_scaled[:,1], kind='kde')

In [None]:
import pandas as pd
# Load the birdsong metadata and inspect the dtype pandas inferred for each column
# before any date/time parsing is done
data = pd.read_csv('../input/birdsong-recognition/train.csv')
data.dtypes

In [None]:
import pandas as pd
data = pd.read_csv('../input/birdsong-recognition/train.csv')
# Parse the recording date; values that do not match YYYY-MM-DD become NaT (errors='coerce')
data['date'] = pd.to_datetime(data['date'],format='%Y-%m-%d', errors='coerce')

# Calendar features derived from the parsed date (NaN where the date could not be parsed)
data['year']=data['date'].dt.year
data['month']=data['date'].dt.month
data['day']=data['date'].dt.day
# Day of week as a number: Monday=0 ... Sunday=6
data['dayofweek_num']=data['date'].dt.dayofweek
# Day of week as a name ('Monday', ...). dt.weekday is only an alias of dt.dayofweek and
# would repeat the numeric column above, so day_name() is used instead.
data['dayofweek_name']=data['date'].dt.day_name()
data.head()

In [None]:
import pandas as pd
data = pd.read_csv('../input/birdsong-recognition/train.csv')

# Parse the time-of-day column itself. The earlier version first overwrote `time` with
# the parsed `date` column, so every extracted hour and minute was 0.
# NOTE(review): assumes `time` holds strings such as '08:30', as the '%H:%M' format
# implies; entries that do not match become NaT (errors='coerce') instead of raising.
data['time'] = pd.to_datetime(data['time'], format='%H:%M', errors='coerce')

# Hour and minute of the recording (NaN where the time could not be parsed)
data['Hour'] = data['time'].dt.hour
data['minute'] = data['time'].dt.minute

data.head()

In [None]:
import pandas as pd
data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
# YrSold holds a year only, so it is parsed with the '%Y' format
data['YrSold'] = pd.to_datetime(data['YrSold'],format='%Y')

# lag_1 = SalePrice of the previous row (the first row has no predecessor, hence NaN).
# NOTE(review): rows are not sorted by YrSold here, so the lag is positional (file
# order) rather than chronological - confirm this is intended for the demo.
data['lag_1'] = data['SalePrice'].shift(1)
data = data[['YrSold', 'lag_1', 'SalePrice']]
data.head()

In [None]:
import pandas as pd
data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
data['YrSold'] = pd.to_datetime(data['YrSold'], format='%Y')

# lag_k = SalePrice from k rows earlier, for k = 1..7 (the first k rows of lag_k are NaN)
data = data.assign(
    **{'lag_{}'.format(k): data['SalePrice'].shift(k) for k in range(1, 8)}
)

# keep only the date, the seven lag columns (in order) and the target
data = data[['YrSold'] + ['lag_{}'.format(k) for k in range(1, 8)] + ['SalePrice']]
data.head()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
# Autocorrelation and partial autocorrelation of SalePrice for the first 10 lags, to
# judge which lag features carry information.
# NOTE(review): the rows are in file order, not sorted by date - confirm this is intended.
plot_acf(data['SalePrice'], lags=10)
# The trailing semicolon suppresses the cell's return value: both functions return the
# Figure, and without it Jupyter renders the last figure a second time.
plot_pacf(data['SalePrice'], lags=10);

In [None]:
import pandas as pd
data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
# YrSold holds a year only, so it is parsed with the '%Y' format
data['YrSold'] = pd.to_datetime(data['YrSold'],format='%Y')

# Rolling-window feature: mean SalePrice over the current row and the 6 rows before it.
# With the default min_periods the first 6 rows are NaN because their window is incomplete.
data['rolling_mean'] = data['SalePrice'].rolling(window=7).mean()
data = data[['YrSold', 'rolling_mean', 'SalePrice']]
data.head(10)

In [None]:
import pandas as pd
data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
# YrSold holds a year only, so it is parsed with the '%Y' format
data['YrSold'] = pd.to_datetime(data['YrSold'],format='%Y')

# Expanding-window feature: mean SalePrice over all rows up to and including the current
# one. The argument 2 is the minimum number of observations, so the first row is NaN.
data['Expanding_Mean'] = data['SalePrice'].expanding(2).mean()
data = data[['YrSold', 'SalePrice','Expanding_Mean']]
data.head(10)