## Implementations

In [None]:
import pandas
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## Generating synthetic dataset

In [None]:
def func(X):
    """Return the element-wise square of ``X`` (the noise-free target curve)."""
    return X ** 2


# A private, seeded generator makes the noise identical on every
# "Restart & Run All" without touching NumPy's global random state.
_rng = np.random.RandomState(42)

X = np.arange(0, 100)
# Noise model: one integer in [0, 10) per sample, all scaled by a SINGLE scalar
# drawn from a normal distribution centred on 100 (`normal(100)` has no `size`).
y = func(X) + _rng.randint(10, size=(100)) * _rng.normal(100)
# Materialise the (X, y) pairs: a bare `zip` is a one-shot iterator, so any cell
# that consumed it a second time would silently see zero rows.
data = list(zip(X, y))

In [None]:
# Turn the (X, y) pairs into a two-column frame and preview its first rows.
syn_data = pandas.DataFrame(data=data, columns=['X', 'y'])
print(syn_data.head(n=5))

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
syn_data.info()

In [None]:
# Summary statistics of the synthetic columns, printed as plain text.
syn_summary = syn_data.describe()
print(syn_summary)

## Line Plot

In [None]:
# Matplotlib version: hand the raw column arrays straight to plt.plot.
syn_x = syn_data['X'].values
syn_y = syn_data['y'].values
plt.plot(syn_x, syn_y)

In [None]:
# Seaborn version of the same plot: columns are selected by name from the frame.
sns.lineplot(data=syn_data, x='X', y='y')

# Titanic dataset

In [None]:
# Load seaborn's bundled "titanic" example dataset into a DataFrame.
titanic_data = sns.load_dataset(name='titanic')

In [None]:
# Preview the first five passengers.
print(titanic_data.head(n=5))

In [None]:
# DataFrame.info() prints the column/dtype/non-null report itself and returns
# None; wrapping it in print() would add a stray "None" line to the output.
titanic_data.info()

### _Why should I get rid of None values?_

In [None]:
# Keep only the rows that have no missing value in any column.
titanic_data = titanic_data.dropna(axis=0, how='any')

In [None]:
# Summary statistics of the rows that survived the missing-value filter.
titanic_summary = titanic_data.describe()
print(titanic_summary)

### _Can I present these features with a line plot?_

In [None]:
# Matplotlib line plot of fare against age; the points are passed in row order,
# exactly as they appear in the frame.
passenger_ages = titanic_data['age'].values
passenger_fares = titanic_data['fare'].values
plt.plot(passenger_ages, passenger_fares)

In [None]:
# Seaborn version of the age-vs-fare line plot.
sns.lineplot(data=titanic_data, x='age', y='fare')

## Bar Plots / Catplots

In [None]:
# One bar per deck, with fare on the value axis.
sns.barplot(data=titanic_data, x='deck', y='fare')

In [None]:
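# Kept disabled: plt.bar on the raw `deck` column (NOTE(review): presumably
# superseded by the numerically encoded version two cells below -- confirm).
# plt.bar(titanic_data['deck'], titanic_data['fare'])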

In [None]:
# Ordinal code for every deck letter, so `deck` can be used as a numeric axis.
encoding = {'deck': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}}

# Map the letters explicitly instead of calling `replace(..., inplace=True)`:
#  * `deck` may be a categorical column (NOTE(review): believed to come that way
#    from seaborn's loader -- confirm); replacing categories through `replace`
#    is deprecated in recent pandas, so go through plain objects first.
#  * `.get(letter, letter)` leaves already-encoded values untouched, which makes
#    this cell safe to re-run.
#  * `assign` builds a new frame rather than mutating the existing one in place.
titanic_data = titanic_data.assign(
    deck=titanic_data['deck'].astype(object).map(
        lambda letter: encoding['deck'].get(letter, letter)
    )
)

In [None]:
# Matplotlib bar chart: bar positions come from `deck`, bar heights from `fare`.
deck_positions = titanic_data['deck'].values
fare_heights = titanic_data['fare'].values
plt.bar(deck_positions, fare_heights)

# Iris Dataset

In [None]:
# Read the iris measurements from the CSV next to the notebook and preview them.
iris_data = pandas.read_csv(filepath_or_buffer='iris.csv')
print(iris_data.head(n=5))

In [None]:
# include='all' also summarises the non-numeric columns, not only the numeric ones.
iris_summary = iris_data.describe(include='all')
print(iris_summary)

## Box Plot

In [None]:
# Split the frame into one sub-frame per species label.
setosa, versicolor, virginica = (
    iris_data[iris_data['species'] == species_name]
    for species_name in ('setosa', 'versicolor', 'virginica')
)

In [None]:
# Column whose distribution is compared across the three species.
feature = 'sepal_length'

# One series per species, in the same order as the labels passed to boxplot.
bp_data = [species_df[feature] for species_df in (setosa, versicolor, virginica)]
plt.boxplot(
    bp_data,
    vert=False,  # horizontal boxes
    labels=['setosa', 'versicolor', 'virginica'],
)
plt.show()

In [None]:
# Seaborn draws the same per-species box plot directly from the full frame.
sns.boxplot(data=iris_data, x='sepal_length', y='species')
plt.show()

In [None]:
# Bar plot of sepal length per species.
sns.barplot(x='sepal_length', y='species', data=iris_data)
# `plt.show` must be CALLED: the bare name only references the function, so the
# cell would echo its repr instead of explicitly rendering the figure.
plt.show()

## Scatter Plot

### _Which algorithm should I choose? Clustering or Regression?_

In [None]:
# One scatter layer per species; the order (setosa, virginica, versicolor)
# decides which colour of the default cycle each species gets.
for species_df in (setosa, virginica, versicolor):
    plt.scatter(species_df['sepal_length'].values, species_df['sepal_width'].values)
plt.show()

In [None]:
# Seaborn version: draw the three species onto the same axes, one call each,
# in the order setosa, virginica, versicolor.
for species_df in (setosa, virginica, versicolor):
    sns.scatterplot(data=species_df, x='sepal_length', y='sepal_width')
plt.show()

In [None]:
# Same data drawn as connected lines instead of points, one line per species
# (setosa, virginica, versicolor), for comparison with the scatter plots.
for species_df in (setosa, virginica, versicolor):
    plt.plot(species_df['sepal_length'].values, species_df['sepal_width'].values)
plt.show()

In [None]:
# Integer code for every species label, so `species` can be used as a numeric axis.
encoding_map = {'species': {'setosa': 0, 'versicolor': 1, 'virginica': 2}}

# Map the labels explicitly instead of calling `replace(..., inplace=True)`:
#  * the result is a real integer column without relying on `replace` silently
#    downcasting an object column (deprecated in recent pandas);
#  * `.get(label, label)` leaves already-encoded values untouched, which makes
#    this cell safe to re-run;
#  * `assign` builds a new frame rather than mutating the existing one in place.
iris_data = iris_data.assign(
    species=iris_data['species'].map(
        lambda label: encoding_map['species'].get(label, label)
    )
)

# House Prices Dataset

In [None]:
# Read the house-price table from the CSV next to the notebook and preview it.
house_data = pandas.read_csv(filepath_or_buffer='HousePrices.csv')
print(house_data.head(n=5))

In [None]:
# include='all' also summarises the non-numeric columns, not only the numeric ones.
house_summary = house_data.describe(include='all')
print(house_summary)

## Violin Plot

In [None]:
# Price distribution for each value of the FirePlace column.
sns.violinplot(data=house_data, x='FirePlace', y='Prices')

In [None]:
# Price distribution for each value of the Garage column.
sns.violinplot(data=house_data, x='Garage', y='Prices')

## Correlation

In [None]:
# Columns compared in the correlation plots below.
features = [
    'Area',
    'Garage',
    'FirePlace',
    'Garden',
    'Prices',
]

In [None]:
# Grid of pairwise plots restricted to the selected columns.
sns.pairplot(data=house_data.loc[:, features])

In [None]:
# Pairwise correlation matrix of the selected columns, drawn as a heat map.
feature_corr = house_data[features].corr()
sns.heatmap(feature_corr)

# Which dataset is better for which algorithm?

In [None]:
# House data: price as a function of area.
sns.lineplot(data=house_data, x='Area', y='Prices')

In [None]:
# Iris data: species on the value axis against sepal length.
sns.lineplot(data=iris_data, x='sepal_length', y='species')

In [None]:
# Titanic data: fare against the `alive` column.
sns.lineplot(data=titanic_data, x='alive', y='fare')