# {{cookiecutter.project}} - General Notebook

## Goals

The main goal of the project is to...

## Imports

In [None]:
import os
import re

import time
import datetime
import webbrowser
from pathlib import Path

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import altair as at

## Helper Methods

In [None]:
def timestamp():
    """Return the current local date and time as a filename-safe string.

    The result has the form ``YYYY-MM-DD--HH-MM-SS`` (colons are replaced by
    dashes so the value can be embedded in file names).

    Returns:
        str: the formatted timestamp, truncated to whole seconds.
    """
    # Read the clock exactly once: two separate now() calls could straddle
    # midnight and pair yesterday's date with today's time.
    now = datetime.datetime.now().replace(microsecond=0)
    stamp = f"{now.date()}--{now.time()}"
    return stamp.replace(":", "-")

## Data Import

### Variables Meaning
| Name        | Meaning           | Note  |
| ------------- |-------------| -----|
| name     | meaning | note |

In [None]:
# Load the raw data into `df`. The file names are placeholders to be filled in.
# NOTE(review): both reads run in sequence, so the Excel result replaces the
# CSV result - presumably these are alternatives; keep only the one you need.
df = pd.read_csv("../data/.csv")
df = pd.read_excel("../data/.xlsx", header=0, index_col=None)

## Data Export

In [None]:
# Write the working frame to an Excel file.
# `to_excel` is a DataFrame method (there is no `pd.to_excel` function), so it
# has to be called on `df`. Use `df.to_csv(...)` instead for CSV output.
df.to_excel("file.xlsx")

## Data Exploration

In [None]:
# Print a concise summary of `df` (columns, non-null counts, dtypes, memory).
df.info()

In [None]:
# Preview the first rows of the frame.
# (DataFrame has no `header()` method; `head()` is the row-preview call.)
df.head()

In [None]:
# Show the dtype pandas assigned to each column, to spot mis-parsed columns.
df.dtypes

In [None]:
# Descriptive statistics for the columns of `df`.
df.describe()

In [None]:
# Frequency of each distinct value in column "a" ("a" is a placeholder name).
df["a"].value_counts()

In [None]:
# Contingency table of column "a" against column "b" (placeholder names).
pd.crosstab(df["a"], df["b"])

In [None]:
# Build a profiling report for `df` and write it out as an HTML file.
profile = ProfileReport(
    df,
    explorative=True,
    title="Pandas Profiling Report",
)
profile.to_file("../reports/profile.html")

In [None]:
# Open the generated report in the default browser.
# `!start` only exists in the Windows shell; `webbrowser` works on every
# platform. A file:// URI is passed because opening a bare file path with
# `webbrowser.open` is documented as unsupported and non-portable.
webbrowser.open(Path("../reports/profile.html").resolve().as_uri());

## Data Preparation

In [None]:
# Catalogue of common clean-up snippets. "a", "b", "c" and "ColumnA" are
# placeholder column names - keep the lines the project needs, drop the rest.

# --- Missing values ---
df = df.dropna()                    # drop rows with any missing value
df = df[df["a"].notnull()]          # keep rows where column "a" is present
df = df.dropna(axis="columns")      # drop columns with any missing value

# --- Renaming ---
df = df.rename(columns={'ColumnA': 'SomeOtherName'})

# --- Replacing values / filtering rows ---
# Assign back to the column: `df = df["a"].replace(...)` would replace the
# whole frame with a single Series and break every later line.
df["a"] = df["a"].replace(1, 2)
df = df[df["a"] == 1]
df = df[(df["a"] >= 1) & (df["b"] >= 1)]

# --- Derived columns ---
df["a"] = df.apply(lambda x: 1 if x["b"] > 0 else 0, axis=1)
df["a"] = np.where(df["a"].isin(["a", "b"]), 1, 0)
# Inside `assign` the lambda receives the frame itself, so the source column
# has to be read from its argument.
df = df.assign(b=lambda x: x["a"] * 2)

# --- Splitting a string column into two ---
# Take the first and second "-"-separated piece of column "c" explicitly;
# tuple-unpacking the `.str` accessor is no longer supported by pandas.
parts = df["c"].str.split("-")
df["a"] = parts.str[0]
df["b"] = parts.str[1]

## Data Visualization

In [None]:
# Correlation matrix rendered as a colour-graded table.
# Restrict to numeric/boolean columns first: since pandas 2.0 `DataFrame.corr`
# no longer drops non-numeric columns silently and raises on them instead.
df.select_dtypes(include=["number", "bool"]).corr().style.background_gradient(cmap='coolwarm')

In [None]:
# https://seaborn.pydata.org/examples/index.html

# Box plot of "b" grouped by "a" (placeholder column names).
fig, ax = plt.subplots(figsize=(16, 9))
sns.boxplot(x="a", y="b", data=df, ax=ax)
# Rotate the category labels *after* plotting so the setting is applied to the
# ticks seaborn actually created, instead of to the empty default axis.
ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Scatter plot of "b" against "a" with a fitted regression line; the low
# alpha keeps dense regions readable. Column names are placeholders.
plt.figure(figsize=(16, 9))
sns.regplot(data=df, x="a", y="b", scatter_kws=dict(alpha=0.05))

In [None]:
# Small-multiples scatter plots: one panel per "manufacturer", wrapped after
# five panels per row, each showing "price" against "year".
# NOTE(review): these column names look carried over from another dataset -
# replace them with the project's own columns.
facegrid = sns.FacetGrid(data=df, col="manufacturer", col_wrap=5)
facegrid.map(plt.scatter, "year", "price")

In [None]:
# https://altair-viz.github.io/gallery/index.html

# Interactive scatter chart. The notebook imports altair under the alias `at`
# (see the imports cell), so that is the name that must be used here.
# Field names are placeholders to be replaced with real columns.
at.Chart(df).mark_circle(size=60).encode(
    x='x:O',
    y='y:Q',
    color='c',
    tooltip=['a', 'x', 'y', 'Miles_per_Gallon']  # each field listed once
).interactive()

## Modeling: Statsmodels

## Modeling: scikit-learn

#### Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Build the modelling frame: drop unused columns (fill in the list) and
# one-hot encode the categorical ones.
df_model = df.drop(columns=[])
df_model = pd.get_dummies(df_model)

# Hold out 20% of the rows for testing; "y" is the placeholder target column
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.drop("y", axis=1),
    df_model["y"],
    test_size=0.2,
    random_state=42)

# Show the shapes of each split (using the names actually returned above).
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error

#### Training and Hyperparameter Tuning

In [None]:
# Baseline: ordinary least squares fitted on the training split.
sklearnOLS = LinearRegression()
sklearnOLS.fit(X_train, y_train)

y_train_pred = sklearnOLS.predict(X_train)
y_test_pred = sklearnOLS.predict(X_test)

# Report the root mean squared error on both splits. `np.sqrt` is used because
# numpy is imported by the notebook, whereas `math` never was.
print(f'Training set error:\n{np.sqrt(mean_squared_error(y_train, y_train_pred))}')
print(f'Test set error:\n{np.sqrt(mean_squared_error(y_test, y_test_pred))}')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge no longer accepts `normalize` (removed in scikit-learn 1.2), so feature
# scaling is done by an explicit StandardScaler step in a pipeline. Grid keys
# are therefore prefixed with the pipeline step name ("ridge__").
parameters = {
    "ridge__alpha": list(np.append(np.arange(0, 1, 0.05), np.arange(1, 21))),
    "ridge__fit_intercept": [True, False],
}

searchCV = GridSearchCV(make_pipeline(StandardScaler(), Ridge()),
                        parameters,
                        cv=2,
                        n_jobs=-1,
                        verbose=10)

searchCV = searchCV.fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out split.
y_test_pred = searchCV.best_estimator_.predict(X_test)

print("The best model: ", searchCV.best_estimator_)
# `np.sqrt` is used because numpy is imported by the notebook; `math` is not.
print(f'Test set error:\n{np.sqrt(mean_squared_error(y_test, y_test_pred))}')

#### Save Model

## Modeling: TensorFlow 2