# IT Academy - Data Science with Python
## Sprint 6: Sampling Methods
### [Github Sampling Methods](https://github.com/jesussantana/Sampling)

[![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/)  
[![Made with Jupyter](https://img.shields.io/badge/Made%20with-Jupyter-orange?style=for-the-badge&logo=Jupyter)](https://jupyter.org/try)  
[![wakatime](https://wakatime.com/badge/github/jesussantana/Sampling.svg)](https://wakatime.com/badge/github/jesussantana/Sampling)

### Exercise 1:  

  - Grab a sports-themed dataset you like. Perform a sampling of the data, generating a simple random sample and a systematic sample.

In [None]:
!pip install imblearn

import pandas as pd 
import numpy as np
from numpy import where
import random as rd
import warnings

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
from numpy import where
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter

warnings.filterwarnings('ignore')
sns.set_theme(style='darkgrid', palette='deep')

In [None]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Location of the raw data file, relative to this notebook
path = '../data/'
file = 'raw/MLB.txt'

# Keep the raw load untouched and do all further work on a copy
df_raw = pd.read_table(f'{path}{file}')
df = df_raw.copy()

df.head()

In [None]:
df.tail()

In [None]:
df.describe().round(2)

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# Store the real mean in a separate variable
real_mean = round(df['salary'].mean(),3)
real_mean

### Simple Random Sample

In [None]:
simple_random_sample= df.sample(frac=0.1, replace=True, random_state=6858).sort_values(by='salary', ascending=False)

In [None]:
simple_random_sample.head()

In [None]:
simple_random_sample.shape

In [None]:
simple_random_mean = round(simple_random_sample['salary'].mean(),3)
simple_random_mean

In [None]:
real_mean

In [None]:
abs(simple_random_sample.salary.mean()-df.salary.mean())/simple_random_sample.salary.mean()*100

In [None]:
simple_random_sample.position.value_counts(normalize=True)

In [None]:
df.position.value_counts(normalize=True)

In [None]:
k = int(np.ceil(1+np.log2(len(simple_random_sample))))  # Sturges Rule

plt.hist(simple_random_sample['salary'], bins= k)
plt.title(f"SAMPLE - MLB players’ salaries - Random Sample k={len(simple_random_sample)}")
plt.ylabel('Frequency')
plt.xlabel('SAMPLE - Salary (millions of dollars)')

In [None]:
# Compare the salary distribution of the population (top) with that of the
# simple random sample (bottom) on a single figure.
# The figure has to be created in this cell: the add_subplot() calls below
# need a Figure object, and relying on a `fig` left over from another cell
# raises NameError when the notebook is run top to bottom.
fig = plt.figure(figsize=(14, 8))

# --- Population ---------------------------------------------------------
ax1 = fig.add_subplot(2, 1, 1)
k1 = int(np.ceil(1+np.log2(len(df))))  # Sturges' rule for the number of bins
# Pass ax= so pandas draws into this subplot instead of opening a new figure
df.hist(column='salary', bins=k1, ax=ax1)
ax1.set_title('POPULATION - Major League Baseball players’ salaries')
ax1.set_ylabel('Frequency')
ax1.set_xlabel('POPULATION - Salary (millions of dollars)')

# --- Simple random sample -------------------------------------------------
ax2 = fig.add_subplot(2, 1, 2)
# `k` is the Sturges bin count computed for the sample in an earlier cell
simple_random_sample.hist(column='salary', bins=k, ax=ax2)
ax2.set_title(f"SAMPLE - MLB players’ salaries - Random Sample k={len(simple_random_sample)}")
ax2.set_ylabel('Frequency')
ax2.set_xlabel('SAMPLE - Salary (millions of dollars)')

# Keep the upper x-label from colliding with the lower title
fig.tight_layout()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=simple_random_sample["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3)
ax = sns.stripplot(x=simple_random_sample["salary"], orient='v', color='darkblue', alpha=0.1)
plt.title('SAMPLE - Major League Baseball players’ salaries')
plt.xlabel('Salary (millions of dollars)')

In [None]:
fig = plt.figure(figsize=(12,4))

ax1 = fig.add_subplot(1, 2, 1)

# Plot
sns.set(style="whitegrid")
ax1 = sns.boxplot(x=df["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3, notch=True)
ax1 = sns.stripplot(x=df["salary"], orient='v', color='darkblue', alpha=0.2)
plt.title('POPULATION - Major League Baseball players’ salaries')
plt.xlabel('POPULATION - Salary (millions of dollars)')

ax2 = fig.add_subplot(1, 2, 2)

# Plot
sns.set(style="whitegrid")
ax2 = sns.boxplot(x=simple_random_sample["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3, notch=True)
ax2 = sns.stripplot(x=simple_random_sample["salary"], orient='v', color='darkblue', alpha=0.2)
plt.title('SAMPLE - Major League Baseball players’ salaries')
plt.xlabel(f'SAMPLE - Salary (millions of dollars) - Random Sample k={len(simple_random_sample)}')

In [None]:
simple_random_sample.head()

### Systematic Sampling

In [None]:
# Define systematic sampling function
def systematic_sampling(df, step, start=0):
    """Return every ``step``-th row of ``df``, beginning at position ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from. Rows are selected by position, not label.
    step : int
        Sampling interval; must be at least 1.
    start : int, optional
        Zero-based position of the first selected row. The default of 0
        reproduces the fixed-start behaviour; pass a random value in
        ``range(step)`` for a randomised systematic sample.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in their original order. Empty when ``start`` is
        at or beyond the end of ``df``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 or ``start`` is negative.
    """
    # Fail early with a clear message: a zero step makes np.arange error out
    # obscurely and a negative one would silently yield an empty sample.
    if step < 1:
        raise ValueError("step must be a positive integer")
    if start < 0:
        raise ValueError("start must be non-negative")

    indexes = np.arange(start, len(df), step=step)
    return df.iloc[indexes]

In [None]:
# Obtain a systematic sample and save it in a new variable
systematic_sample = systematic_sampling(df, 10).sort_values(by='salary', ascending=False)

In [None]:
# Save the sample mean in a separate variable
systematic_mean = round(systematic_sample['salary'].mean(),3)
systematic_mean

In [None]:
real_mean

In [None]:
# View sampled data frame
systematic_sample

In [None]:
systematic_sample.head()

### Cluster Sampling

In [None]:
def cluster_sampling(df, number_of_clusters):
    """Split ``df`` into equal consecutive clusters and keep the even ones.

    Rows are assigned, in positional order, to ``number_of_clusters`` blocks
    of equal size numbered from 1. The sample consists of every row whose
    cluster number is even.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from. It is not modified.
    number_of_clusters : int
        Number of equally sized clusters to form.

    Returns
    -------
    pandas.DataFrame or None
        The rows of the even-numbered clusters, with an added ``cluster_id``
        column. ``None`` (after printing a message) when the population
        cannot be split into ``number_of_clusters`` clusters of equal size.
    """
    # Check divisibility explicitly rather than relying on a bare ``except``,
    # which also hid unrelated errors (e.g. the float repeat count that made
    # np.repeat fail even for populations that divide evenly).
    if number_of_clusters < 1 or len(df) % number_of_clusters != 0:
        print("The population cannot be divided into clusters of equal size!")
        return None

    # np.repeat needs an integer repeat count, hence floor division
    cluster_size = len(df) // number_of_clusters

    # Work on a copy so the caller's frame does not gain a 'cluster_id' column
    clustered = df.copy()
    clustered['cluster_id'] = np.repeat(
        np.arange(1, number_of_clusters + 1), cluster_size
    )

    # Positional boolean mask: keep rows belonging to even-numbered clusters
    in_even_cluster = (clustered['cluster_id'] % 2 == 0).to_numpy()
    return clustered[in_even_cluster]

In [None]:
# Obtain a cluster sample and save it in a new variable
cluster_sample = cluster_sampling(df, 6)

In [None]:
# Save the sample mean in a separate variable
cluster_mean = round(cluster_sample['salary'].mean(),3)

In [None]:
# View sampled data frame
cluster_sample

### Exercise 2: 
  - Continue with the sports-themed dataset and generate a stratified sample and a sample using SMOTE (Synthetic Minority Oversampling Technique).

In [None]:
systematic_sample.shape

### Stratified Random Sampling

In [None]:
# Set the split criteria: a single split whose "test" part (100 rows) is the sample.
# The seed is fixed, as for the other random steps in this notebook, so the
# stratified sample is the same on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=100, random_state=6858)

In [None]:
# The splitter is configured with n_splits=1, so it yields exactly one
# (train, test) pair of positional indices; the test part is the sample,
# stratified on 'position' and ordered by salary.
_, sample_positions = next(split.split(df, df['position']))
stratified_random_sample = df.iloc[sample_positions].sort_values(by='salary')

In [None]:
type(stratified_random_sample)

In [None]:
# View sampled data frame
stratified_random_sample

In [None]:
# Average only the numeric columns per position: on pandas >= 2.0 a plain
# .mean() raises TypeError if the frame also holds text columns
# (presumably player/team names here - verify against the dataset).
stratified_random_sample.groupby('position').mean(numeric_only=True)

In [None]:
stratified_random_sample.describe().round()


In [None]:
stratified_random_sample.position.unique()

In [None]:
stratified_random_sample.shape

In [None]:
# Save the sample mean, rounded to 3 decimals like the other sample means so
# that the absolute errors in the summary table are compared at equal precision
stratified_mean = round(stratified_random_sample['salary'].mean(), 3)
stratified_mean

In [None]:
real_mean

In [None]:
abs(stratified_random_sample.salary.mean()-df.salary.mean())/stratified_random_sample.salary.mean()*100


In [None]:
stratified_random_sample.position.value_counts(normalize=True)

### SMOTE (Synthetic Minority Oversampling Technique)

In [None]:
# define dataset
X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=6858)

In [None]:
# summarize class distribution
counter = Counter(y)
print(counter)

In [None]:
# Scatter plot of the examples, one series per class label
for label in counter:
    # Boolean mask selecting the rows that belong to this class
    class_mask = y == label
    pyplot.scatter(X[class_mask, 0], X[class_mask, 1], label=str(label))

pyplot.legend()
pyplot.show()

## Data Summary

In [None]:
# Row labels: one per sampling method, in the same order as the means below
method_names = [
    'Simple Random Sampling',
    'Systematic Sampling',
    'Stratified Sampling',
    'Cluster Sampling',
]

# One row per method: its sample mean next to the (constant) population mean
outcomes = pd.DataFrame(
    {
        'sample_mean': [simple_random_mean, systematic_mean, stratified_mean, cluster_mean],
        'real_mean': real_mean,
    },
    index=method_names,
)

# Absolute error of each sample mean with respect to the population mean
outcomes['abs_error'] = (outcomes['real_mean'] - outcomes['sample_mean']).abs()

# Display the methods from most to least accurate
outcomes.sort_values(by='abs_error')

In [None]:
# Summary figure: a 2x2 grid of salary box plots (with the individual
# observations overlaid as a strip plot) for the population and for the
# simple random, systematic and stratified samples.
# The cluster sample is not drawn in this figure.
# Each panel is created with fig.add_subplot(), and the seaborn/pyplot calls
# that follow are given no explicit axes, so statement order matters here.
fig = plt.figure(figsize=(20,8))
#fig = plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)

# Panel 1 (top-left): whole population
ax1 = fig.add_subplot(2, 2, 1)

# Box plot plus semi-transparent strip plot of the same column;
# ax1 ends up bound to the value returned by sns.stripplot
sns.set(style="whitegrid")
ax1 = sns.boxplot(x=df["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3, notch=True)
ax1 = sns.stripplot(x=df["salary"], orient='v', color='darkblue', alpha=0.2)
plt.title('POPULATION - Major League Baseball players’ salaries')
#plt.xlabel('POPULATION - Salary (millions of dollars)')

# Panel 2 (top-right): simple random sample
ax2 = fig.add_subplot(2, 2, 2)

# Same box + strip plot; the title reports the sample size
sns.set(style="whitegrid")
ax2 = sns.boxplot(x=simple_random_sample["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3, notch=True)
ax2 = sns.stripplot(x=simple_random_sample["salary"], orient='v', color='darkblue', alpha=0.2)
plt.title(f'SAMPLE - MLB players’ salaries - Random Sample k={len(simple_random_sample)}')
#plt.xlabel(f'SAMPLE - Salary (millions of dollars) - Random Sample k={len(simple_random_sample)}')

# Panel 3 (bottom-left): systematic sample
ax3 = fig.add_subplot(2, 2, 3)

# Same box + strip plot; the title reports the sample size
sns.set(style="whitegrid")
ax3 = sns.boxplot(x=systematic_sample["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3, notch=True)
ax3 = sns.stripplot(x=systematic_sample["salary"], orient='v', color='darkblue', alpha=0.2)
plt.title(f'Systematic - MLB players salaries Systematic Sample k={len(systematic_sample)}')
#plt.xlabel(f'Systematic - Salary (millions of dollars) - Systematic Sample k={len(systematic_sample)}')

# Panel 4 (bottom-right): stratified random sample
ax4 = fig.add_subplot(2, 2, 4)

# Same box + strip plot; the title reports the sample size
sns.set(style="whitegrid")
ax4 = sns.boxplot(x=stratified_random_sample["salary"], color='lightblue', fliersize=5,  orient='v', linewidth=1 , width=0.3, notch=True)
ax4 = sns.stripplot(x=stratified_random_sample["salary"], orient='v', color='darkblue', alpha=0.2)
plt.title(f'Stratified - MLB players salaries - Stratified Sample k={len(stratified_random_sample)}')
#plt.xlabel(f'Stratified - Salary (millions of dollars) - Stratified Sample k={len(stratified_random_sample)}')