# Part 2: Full Data Workflow A-Z

## Data Preparation and Feature Creation

### Arithmetic Operations

In [None]:
import pandas as pd
import numpy as np

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic.head()

In [None]:
titanic.info()

In [None]:
# Replace missing ages with the mean age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on `titanic.age`: the in-place
# call operates on an intermediate Series, which triggers a chained-assignment
# warning on recent pandas and leaves `titanic` unchanged under Copy-on-Write.
titanic["age"] = titanic.age.fillna(titanic.age.mean())

In [None]:
titanic.head(10)

#### Add/Sub/Mul/Div of Columns

In [None]:
titanic.sibsp + titanic.parch

In [None]:
titanic.sibsp.add(titanic.parch)

In [None]:
titanic["no_relat"] = titanic.sibsp.add(titanic.parch)

In [None]:
titanic.head()

In [None]:
sales = pd.read_csv("sales.csv", index_col = 0)

In [None]:
sales

In [None]:
sales.Mon + sales.Thu

In [None]:
# Same sum as `sales.Mon + sales.Thu`, but fill_value=0 substitutes 0 for a
# value that is missing on one side, so the result is NaN only where both
# columns are missing.
sales.Mon.add(sales.Thu, fill_value=0)

In [None]:
sales["perc_Bonus"] = [0.12, 0.15, 0.10, 0.20]

In [None]:
sales

In [None]:
sales.Thu * sales.perc_Bonus

In [None]:
sales.Thu.mul(sales.perc_Bonus, fill_value=0)

In [None]:
sales.iloc[:, :-1].sum(axis = 1).mul(sales.perc_Bonus)

In [None]:
# Row total of every column except the last one, multiplied by that row's bonus
# rate. The positional slice [:, :-1] relies on perc_Bonus being the last
# column at this point (it was appended just above) -- re-running this cell
# after "Bonus" exists would change which column is excluded.
sales["Bonus"] = sales.iloc[:, :-1].sum(axis = 1).mul(sales.perc_Bonus)

In [None]:
sales

#### Add/Sub/Mul/Div with Scalar Value

In [None]:
titanic.head()

In [None]:
1912 - titanic.age

In [None]:
titanic["YoB"] = titanic.age.sub(1912).mul(-1)

In [None]:
titanic.head()

In [None]:
fx_rate = 1.1

In [None]:
titanic["EUR_fare"] = titanic.fare.div(fx_rate)

In [None]:
titanic.head()

In [None]:
titanic.drop(columns = ["sibsp", "parch", "deck", "YoB", "EUR_fare"], inplace =True)

In [None]:
titanic.head()

In [None]:
sales

In [None]:
fixed_costs = 5

In [None]:
sales.iloc[:, :-2].sub(fixed_costs, fill_value = 0)

In [None]:
perc_Bonus = 0.1

In [None]:
sales.iloc[:, :-2].mul(perc_Bonus, fill_value = 0)

In [None]:
sales.iloc[:,:-2]

In [None]:
lot_size = 10
bonus_per_lot = 1.25

In [None]:
# For all columns except the last two: count the whole lots of `lot_size` in
# each cell (missing cells are filled with 0 first), pay `bonus_per_lot` for
# each lot, and add the amounts up per row.
sales.iloc[:, :-2].floordiv(lot_size, fill_value = 0).mul(bonus_per_lot).sum(axis = 1)

### Transformation / Mapping

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head()

In [None]:
sample = summer.sample(n = 7, random_state = 123).sort_values(by = "Year")

In [None]:
sample

In [None]:
# Lookup table from a city name to its country; used below with Series.map
# to derive a country column from the City column.
city_country = {
    "Paris": "France",
    "Mexico": "Mexico",
    "Montreal": "Canada",
    "Moscow": "Russia",
    "Barcelona": "Spain",
    "Athens": "Greece",
}

In [None]:
city_country

In [None]:
sample.City.map(city_country)

In [None]:
sample["Host_Country"] = sample.City.map(city_country)

In [None]:
sample

In [None]:
titanic.head()

In [None]:
mapper = {1:"First", 2:"Second", 3:"Third"}

In [None]:
titanic.pclass.map(mapper)

In [None]:
# Overwrite the pclass codes with the labels from `mapper`; any value that is
# not a key of `mapper` becomes NaN.
titanic["pclass"] = titanic["pclass"].map(mapper)

In [None]:
titanic.head()

### Conditional Transformation

In [None]:
titanic.head(10)

In [None]:
titanic.no_relat == 0

In [None]:
pd.Series(np.where(titanic.no_relat == 0, "Yes", "No"))

In [None]:
# Flag passengers travelling without relatives ("Yes" when no_relat is 0).
# The np.where result is assigned directly: an array is matched to the rows by
# position, whereas wrapping it in pd.Series creates a fresh 0..n-1 index and
# aligns on labels, which would silently produce NaN if `titanic` ever had a
# non-default index.
titanic["alone"] = np.where(titanic.no_relat == 0, "Yes", "No")

In [None]:
titanic.head(10)

In [None]:
# Flag passengers younger than 18. The np.where result is assigned directly:
# an array is matched to the rows by position, whereas wrapping it in pd.Series
# creates a fresh 0..n-1 index and aligns on labels, which would silently
# produce NaN if `titanic` ever had a non-default index.
titanic["child"] = np.where(titanic.age < 18, "Yes", "No")

In [None]:
titanic.head(10)

### Discretization and Binning with pd.cut() (Part 1)

In [None]:
titanic.head(10)

In [None]:
age_bins = [0, 10, 18, 30, 55, 100]

In [None]:
# Bin ages using the edges in `age_bins`. right=False makes every interval
# closed on the left and open on the right, e.g. [0, 10), [10, 18), ...
cats = pd.cut(titanic.age, age_bins, right = False)

In [None]:
cats

In [None]:
cats.value_counts()

In [None]:
titanic["age_cat"] = cats

In [None]:
titanic.head()

In [None]:
# Survival rate per age bin. age_cat is categorical (it comes from pd.cut), so
# `observed` is passed explicitly: observed=False keeps every bin in the result
# even if it is empty, and avoids the FutureWarning newer pandas raises when
# the argument is omitted for categorical keys.
titanic.groupby("age_cat", observed=False).survived.mean()

In [None]:
group_names = ["child", "teenager", "young_adult", "adult", "elderly"]

In [None]:
pd.cut(titanic.age, age_bins, right = False, labels = group_names)

In [None]:
titanic["age_cat"] = pd.cut(titanic.age, age_bins, right = False, labels = group_names)

In [None]:
titanic.head(10)

In [None]:
titanic.age_cat

### Discretization and Binning with pd.cut() (Part 2)

In [None]:
titanic.fare

In [None]:
pd.cut(titanic.fare, 5, precision= 3)

In [None]:
# Passing an integer asks pd.cut for that many equal-width bins over the range
# of fare; precision=0 controls how many decimals the bin labels carry.
titanic["fare_cat"] = pd.cut(titanic.fare, 5, precision= 0)

In [None]:
titanic.head(10)

In [None]:
titanic.fare_cat.value_counts()

### Discretization and Binning with pd.qcut() 

In [None]:
titanic.head()

In [None]:
pd.qcut(titanic.fare, 5) 

In [None]:
titanic["fare_cat"] = pd.qcut(titanic.fare, 5) 

In [None]:
titanic.head()

In [None]:
titanic.fare_cat.value_counts()

In [None]:
pd.qcut(titanic.fare, [0, 0.1, 0.25, 0.5, 0.9, 1], precision = 0) 

In [None]:
fare_labels =["very_cheap", "cheap", "moderate", "exp", "very_exp"]

In [None]:
# Quantile-based bins: the edges sit at the 0, 10, 25, 50, 90 and 100 percent
# quantiles of fare, and the five resulting bins are named by `fare_labels`.
titanic["fare_cat"] =  pd.qcut(titanic.fare, [0, 0.1, 0.25, 0.5, 0.9, 1], precision = 0, labels = fare_labels)

In [None]:
titanic.head()

In [None]:
titanic.fare_cat.value_counts()

In [None]:
# Survival rate for every age bin / fare bin combination, pivoted so that the
# fare bins become columns. Both keys are categorical, so `observed` is passed
# explicitly: observed=False keeps combinations without passengers in the table
# (as NaN) and avoids the FutureWarning newer pandas raises when the argument
# is omitted for categorical keys.
titanic.groupby(["age_cat", "fare_cat"], observed=False).survived.mean().unstack()

### Caps and Floors

In [None]:
titanic.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
titanic.fare.plot(figsize = (12,8))
plt.show()

In [None]:
titanic.fare.describe()

In [None]:
titanic.fare.sort_values(ascending = False)

In [None]:
fare_cap = 250

In [None]:
# Cap the fares: every value above `fare_cap` is replaced by the cap itself.
titanic["fare"] = titanic.fare.clip(upper=fare_cap)

In [None]:
fare_floor = 5

In [None]:
# Floor the fares: every value below `fare_floor` is raised to the floor.
titanic["fare"] = titanic.fare.clip(lower=fare_floor)

In [None]:
titanic.head()

### Scaling / Standardization

In [None]:
titanic.head()

In [None]:
titanic.describe()

In [None]:
import matplotlib.pyplot as plt

In [None]:
titanic.fare.plot(figsize = (12,8))
titanic.age.plot(figsize = (12,8))
plt.show()

In [None]:
mean_age = titanic.age.mean()
mean_fare = titanic.fare.mean()

In [None]:
std_age = titanic.age.std()
std_fare = titanic.fare.std()

In [None]:
# Standardise age and fare: subtract the column mean, divide by the column
# standard deviation, and keep two decimals.
titanic["age_z"] = titanic.age.sub(mean_age).div(std_age).round(2)
titanic["fare_z"] = titanic.fare.sub(mean_fare).div(std_fare).round(2)

In [None]:
titanic.head(10)

In [None]:
# Summary statistics of the numeric columns, shown with two decimals.
titanic.describe().round(2)

In [None]:
titanic.fare_z.plot(figsize = (12,8))
titanic.age_z.plot(figsize = (12,8))
plt.show()

In [None]:
#titanic.to_csv("titanic_prep.csv", index = False)

In [None]:
titanic.head()

In [None]:
# Preview only: with inplace=False, drop() returns a new DataFrame without the
# listed columns and `titanic` itself is left unchanged.
titanic.drop(labels = ["age", "alone", "child", "age_z", "fare_z", "fare_cat"], axis = 1, inplace = False)

### Creating Dummy Variables

In [None]:
titanic.head()

In [None]:
# Remove the listed columns from `titanic` itself before the dummy encoding
# below.
titanic.drop(
    columns=["age", "alone", "child", "age_z", "fare_z", "fare_cat"],
    inplace=True,
)

In [None]:
titanic.head()

In [None]:
# One-hot encode the four listed columns; the remaining columns are carried
# over unchanged. drop_first=True omits the first level of each encoded column,
# so a column with k categories yields k-1 indicator columns.
titanic_d = pd.get_dummies(titanic, columns = ["sex", "pclass", "embarked", "age_cat"], drop_first=True)

In [None]:
titanic_d.head()

In [None]:
titanic_d.info()

### String Operations

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head()

In [None]:
# Convert the athlete names to title case (first letter of each word
# capitalised, the rest lower-cased).
summer["Athlete"] = summer["Athlete"].str.title()

In [None]:
summer.Athlete.str.split(", ", n = 1, expand = True)

In [None]:
# Split "Surname, First name" at the first ", " only (n=1); expand=True returns
# the two parts as separate columns, which are stored as Surname / First_Name.
summer[["Surname", "First_Name"]] = summer.Athlete.str.split(", ", n = 1, expand = True)

In [None]:
summer.head()

In [None]:
summer["Surname"] = summer.Surname.str.strip()

In [None]:
summer["First_Name"] = summer.First_Name.str.strip()

In [None]:
# drop() returns a new DataFrame without "Athlete"; the result is not assigned
# here, so `summer` itself still contains the column after this cell.
summer.drop(columns = "Athlete")