# END TO END MACHINE LEARNING PROJECT EXAMPLE

## Most of the time there are 8 steps you'll usually go through in ML
## 1. Look at the big Picture
## 2. Get the Data
## 3. Discover and Visualise the data to get insights
## 4. Prepare the data for machine learning algorithm
## 5. Select a Model and train it
## 6. Fine Tune Your model
## 7. Present Your solution
## 8. Launch, monitor, and maintain your system

#### 

## Popular open data repositories
UC Irvine Machine Learning Repository

Kaggle datasets

Amazon’s AWS datasets

## Meta portals (they list open data repositories)
Data Portals

OpenDataMonitor

Quandl

## Other pages listing many popular open data repositories
Wikipedia’s list of Machine Learning datasets

Quora.com

The datasets subreddit

# Before You begin any data science project you need to understand what the objective is

# 2. Get the Data

In [None]:
# For file manipulation 
from pathlib import Path
import pandas as pd
# For .tar files
import tarfile
# For getting information from the web through HTTP (downloading files)
import urllib.request 

In [None]:
def housing_data(url='https://github.com/ageron/data/raw/main/housing.tgz',
                 data_dir='datasets'):
    """Download (if needed), extract and load the housing dataset.

    The tarball is stored as ``<data_dir>/housing.tgz`` and is expected to
    contain ``housing/housing.csv``. The download is skipped when the tarball
    is already present, so repeated calls do not hit the network.

    Args:
        url: Location of the ``housing.tgz`` archive.
        data_dir: Directory in which the archive is stored and extracted.

    Returns:
        A pandas DataFrame read from ``<data_dir>/housing/housing.csv``.
    """
    data_dir = Path(data_dir)
    tarball_path = data_dir / 'housing.tgz'
    csv_path = data_dir / 'housing' / 'housing.csv'

    # Also extract when the tarball is present but the CSV is not (for
    # example after an interrupted run), instead of failing in read_csv.
    needs_extract = not csv_path.is_file()

    if not tarball_path.is_file():
        data_dir.mkdir(parents=True, exist_ok=True)
        # The function is `urlretrieve`; the previous spelling `urlretrive`
        # raised AttributeError on the first download.
        urllib.request.urlretrieve(url, tarball_path)
        needs_extract = True

    if needs_extract:
        with tarfile.open(tarball_path) as housing_tar:
            housing_tar.extractall(path=data_dir)

    return pd.read_csv(csv_path)

housing_d = housing_data()
"""
The function checks if the file datasets/housing.tgz exists.
If the file doesn't exist, it downloads it from the provided URL.
It then extracts the contents of the .tgz file into the datasets/housing/ directory.
After extraction, the function reads the CSV file (housing.csv) into a pandas DataFrame and returns it.
"""

In [None]:
housing_d.head()

In [None]:
housing_d.info()
"""
The 
info()
method is useful to get a quick description of the data, in
particular the total number of rows, each attribute’s type, and the number of
nonnull values 
"""

In [None]:
housing_d['ocean_proximity'].value_counts()

In [None]:
housing_d.describe() # For numerical description of the data

In [None]:
# Directory (relative to the current working directory) where figures are saved;
# it is created, together with any missing parents, if it does not exist yet.
IMAGE_PATH = Path() / 'images' / 'end_to_end project'
IMAGE_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current matplotlib figure under IMAGE_PATH.

    Args:
        fig_id: File name (without extension) for the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to savefig as the format.
        resolution: Resolution in dots per inch passed to savefig.
    """
    path = IMAGE_PATH / f'{fig_id}.{fig_extension}'
    if tight_layout:
        plt.tight_layout()
    # Saving must not depend on tight_layout: previously savefig was nested
    # under the `if`, so tight_layout=False silently saved nothing.
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
# IPython magic: display matplotlib figures inline in the notebook output
%matplotlib inline
import matplotlib.pyplot as plt
# Optional, purely cosmetic: default font sizes for text, axes, legend and ticks
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# The actual plot: histograms of the housing_d columns (50 bins each),
# saved through save_fig and then shown
housing_d.hist(figsize=(20, 15), bins=50)
save_fig('Histogram of housing california')
plt.show()

# It's important to understand how the data was computed; for instance, the median income attribute was scaled (it is not expressed in raw US dollars).
# Many histograms are tail-heavy: they extend much farther to the right of the median than to the left.

In [None]:
# Creating a test set of the data. You can use train_test_split from sklearn.model_selection instead if you are familiar with it.

import numpy as np

def train_test(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Row positions are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining ones the training set.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` with ``iloc``.
    """
    permuted_rows = np.random.permutation(len(data))
    n_test = int(len(data) * test_ratio)
    test_rows, train_rows = permuted_rows[:n_test], permuted_rows[n_test:]
    return data.iloc[train_rows], data.iloc[test_rows]

In [None]:
train, test = train_test(housing_d, 0.2)

In [None]:
print(f'No of rows of the train_set: {len(train)}')
train.head()

In [None]:
print(f'No of rows of the test_set: {len(test)}')
test.head()

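## You do not want your machine learning algorithm to see the entire data set, but every time you run the function you get a different test set, so over many runs it would end up seeing all of the data; that is why we will also try a new splitting algorithm below.
Calling np.random.seed() before calling np.random.permutation() is the solution: the shuffled indices are then the same on every run.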
Sadly, this won't guarantee that this notebook will output exactly the same results as in the notebook, since there are other possible sources of variation. The most important is the fact that algorithms get tweaked over time when libraries evolve. So please tolerate some minor differences: hopefully, most of the outputs should be the same, or at least in the right ballpark.

Note: another source of randomness is the order of Python sets: it is based on Python's `hash()` function, which is randomly "salted" when Python starts up (this started in Python 3.3, to prevent some denial-of-service attacks). To remove this randomness, the solution is to set the `PYTHONHASHSEED` environment variable to `"0"` _before_ Python even starts up. Nothing will happen if you do it after that. Luckily, if you're running this notebook on Colab, the variable is already set for you.


In [None]:
from zlib import crc32
def is_id_in_test_set(identifier, test_ratio):
    """Return whether ``identifier`` hashes into the test set.

    The identifier is converted with ``np.int64`` and hashed with CRC32; it
    belongs to the test set when the checksum is strictly below
    ``test_ratio * 2 ** 32``.
    """
    checksum = crc32(np.int64(identifier))
    threshold = test_ratio * 2 ** 32
    return checksum < threshold


def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into (train, test) based on a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``is_id_in_test_set``;
    rows for which it returns true form the test set, all others the
    training set.
    """
    def _goes_to_test(row_id):
        return is_id_in_test_set(row_id, test_ratio)

    test_mask = data[id_column].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
housing_with_id = housing_d.reset_index() # adds a new column called index

In [None]:
housing_with_id.head()

In [None]:
train_set, test_set = split_data_with_id_hash(id_column='index', data=housing_with_id, test_ratio=0.2)


In [None]:
train_set.head()

In [None]:
test_set.head()

In [None]:
# Build an identifier from each district's coordinates (longitude * 1000 + latitude)
# and redo the hash-based split on that column instead of the row index.
# NOTE(review): these ids are presumably non-integer floats, and is_id_in_test_set
# converts them with np.int64 before hashing, which drops the fractional part —
# confirm this coarsening is acceptable.
housing_with_id['id'] = housing_d['longitude'] * 1000 + housing_d['latitude']
train_set, test_set = split_data_with_id_hash(id_column='id', data=housing_with_id, test_ratio=0.2)

In [None]:
from sklearn.model_selection import train_test_split
# Random 80/20 split with a fixed seed so the result is reproducible.
# The training part is bound to `train_set`: binding it to `train_test`
# overwrote the train_test() function defined earlier and left `train_set`
# pointing at the previous hash-based split.
train_set, test_set = train_test_split(housing_d, test_size=0.2, random_state=101)

In [None]:
test_set['total_bedrooms'].isnull().sum()

In [None]:
train_set['total_bedrooms'].isnull().sum()

In [None]:
train_set

In [None]:
housing_d.isnull().sum()

In [None]:
from scipy.stats import binom

In [None]:
# extra code – shows how to compute the 10.7% proba of getting a bad sample

from scipy.stats import binom

sample_size = 1000
ratio_female = 0.511
# Binomial distribution with sample_size trials and success probability
# ratio_female; built once and reused for both tails.
sample_distribution = binom(sample_size, ratio_female)
# Probability of a count below 485, and of a count above 535
proba_too_small = sample_distribution.cdf(485 - 1)
proba_too_large = 1 - sample_distribution.cdf(535)
print(proba_too_small + proba_too_large)

In [None]:
# Bucket median_income into five categories labelled 1-5, with bin edges
# at 0, 1.5, 3.0, 4.5, 6 and infinity
housing_d['Income Category'] = pd.cut(housing_d['median_income'], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])

In [None]:
# Bar chart of the number of rows per income category, ordered by category label
housing_d['Income Category'].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
save_fig("housing_income_cat_bar_plot")  # writes the figure under IMAGE_PATH
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Generate 10 stratified train/test splits (20% test each), stratifying on the
# 'Income Category' column; random_state fixes the shuffling.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
stratified_split = []
for train_index, test_index in split.split(housing_d, housing_d['Income Category']):
    # NOTE: train_set / test_set are rebound on every iteration, overwriting any
    # earlier values of these names; after the loop they hold the last split.
    train_set = housing_d.iloc[train_index]
    test_set = housing_d.iloc[test_index]
    stratified_split.append([train_set, test_set])
    

In [None]:
strat_train_set, strat_test_set = stratified_split[0]

In [None]:
# Income-category proportions in the stratified test set. Counts and length
# must come from the same set: previously the training-set counts were divided
# by the test-set length, so the values were not proportions at all.
strat_test_set['Income Category'].value_counts() / len(strat_test_set)

In [None]:
strat_train_set['Income Category'].value_counts()

In [None]:
# Remove the helper 'Income Category' column from both stratified sets, in place
for set_ in (strat_train_set, strat_test_set):
    set_.drop('Income Category', axis=1, inplace=True)

In [None]:
strat_train_set

# 3. Discover and Visualise the data to gain insight


In [None]:
housing = strat_train_set.copy()

In [None]:
# 1) Plain scatter plot of the coordinates
housing.plot(kind='scatter',  x='longitude', y='latitude')
save_fig('Bad visualisation plot')
# 2) Same plot with a grid and alpha=0.2, so overlapping points show up as darker areas
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
save_fig('Good visualisation plot')
# 3) Marker size from population / 100, colour from median_house_value ('jet' colormap)
housing.plot(kind='scatter', x='longitude', y='latitude',
             grid=True, s=housing['population'] / 100, label='population',
            c='median_house_value', cmap='jet', colorbar=True, legend=True,
            sharex=False, figsize=[10, 7])
save_fig('scatter plot of housing prices')
plt.show()

In [None]:
# Download the California map image into IMAGE_PATH (skipped when already present)
file_name = 'california.png'
if not (IMAGE_PATH / file_name).is_file():
    github_root = 'https://github.com/ageron/handson-ml3/raw/main/'
    # The image lives under images/end_to_end_project/ in that repository; the
    # previous URL pointed at a different folder and lacked the '/' before the
    # file name.
    url = github_root + 'images/end_to_end_project/' + file_name
    print(f'Downloading: {file_name}')
    # urlretrieve takes (url, destination); previously only the destination path
    # was passed, so it was treated as the URL and the call failed.
    urllib.request.urlretrieve(url, IMAGE_PATH / file_name)
    