In [3]:
# Everything you might want to import... Remove what you don't use!
# import calendar
# import datetime as dt
# from dotenv import dotenv_values
# import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# import os
import pandas as pd
# import requests
# from simpletsmodel import *
from sklearn.linear_model import LinearRegression  # used by the Model Fitting cell
# from sklearn.preprocessing import PolynomialFeatures
# from statsmodels.tsa.seasonal import seasonal_decompose
# import statsmodels.api as sm
# from utils import *

# Data Analysis Skeleton (TITLE)

This project is broken into 6 steps: 
1. [Problem Definition](#problem_definition)
2. [Gathering Data / Information](#get_data)
3. [Data Cleaning / Imputation](#clean)
4. [Exploratory Analysis](#explore)
5. [Model Fitting](#fit)
6. [Interpret Results](#interpret)

# 1. Problem Definition <a id='problem_definition'></a>

# 2. Gathering Data / Information <a id='get_data'></a>

In [None]:
# Load the raw dataset and bind it to `df`, the name the later cells read from.
# Replace '/path' with the real file location before running.
df = pd.read_csv('/path', sep=",")
# For a JSON source, use this loader instead:
# df = pd.read_json('/path')
df.head()


Connecting to SQL databases in Python: 

https://medium.com/@devartimahakalkar/connecting-sql-datasets-with-pandas-105f8eb68f1a

(Also saved to Desktop)

quick preview of the data: 
- `df.head()`
- `df.shape` #dataset composed of X rows and X columns
- `df.info()` #find data types and if there are any nulls
- `df.describe()` #get various summary statistics   
    "From the above descriptive statistics..."
- `df.column.unique()` #find if variable is discrete and categorical
- `df.column.value_counts()` #shows how many times each distinct value occurs in the column


Merging data together: 
- `left.merge(right, how='inner', left_on='lkey', right_on='rkey')`

Creating column based on condition: 
- `np.where(condition, value_if_true, value_if_false)`
- `df['hasimage'] = np.where(df['photos']!= '[]', True, False)`

Merge reference [here](https://documentation.mindsphere.io/resources/html/predictive-learning/en-US/Images/Joins_Diagram.png). 

# 3. Data Cleaning / Imputation <a id='clean'></a>


**Data cleaning** 
Check for:
- Duplicates `duplicateRows = df[df.duplicated()]`,`df.drop_duplicates()`
- Data types `df.info()`  
  `df["column"] = pd.to_numeric(df["column"])`  
  `df["column"] = pd.to_datetime(df["column"])`  
  `df["column"] = df["column"].astype("string")`  

- Make timezone aware `Series.dt.tz_localize()`
- Meaningful column names 
- Outliers 

```
df[['Col_1','Col_2']].plot(subplots=True)

plt.tight_layout()
plt.show()
```
- Filter the outliers using expressions: 
  `df.loc[df['Col_1'] == 2]`
- Drop by index value: `df.drop([0, 1], inplace=True)` (here `0` and `1` are index labels)
- Drop by condition:  
```
indexAge = df[ (df['Age'] >= 20) & (df['Age'] <= 25) ].index
df.drop(indexAge , inplace=True)
```
- Drop unneeded columns `df = df.drop(columns=['B', 'C'])` (`drop` returns a new DataFrame, so assign the result)

Data imputation steps: 
- Check for nulls
- Check for missing data 
- Impute if possible / necessary 

# 4. Exploratory Analysis <a id='explore'></a>

Plot the data: 
- Identify any trends / patterns 
- Is seasonality important? 
- Are there outliers that need explaining? 
- Is there evidence of a business cycle? 

Quick correlation matrix. 
- `df.corr()`
- `sns.heatmap(df.corr(), cmap='YlGnBu',annot=True)`
- If two variables are highly correlated, it may make sense to remove one from the model before performing a linear regression. 

Box plot
- `df.boxplot(column=['Col1', 'Col2', 'Col3'])`
- shows the distribution of quantitative data 
- the box shows the quartiles of the dataset, while the whiskers extend to show the rest of the distribution 

In [None]:
# Quality correlation matrix: heatmap of the k variables most correlated with 'quality'.
k = 12  # number of variables for heatmap
# Restrict to numeric columns first: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, where numeric_only defaults to False.
numeric_df = df.select_dtypes(include='number')
# Rank variables by their correlation with 'quality' and keep the top k labels.
cols = numeric_df.corr().nlargest(k, 'quality')['quality'].index
cm = df[cols].corr()
plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, cmap = 'viridis')
# Trailing semicolon keeps the returned Text object out of the cell output.
plt.title("Correlation matrix: variables most correlated with 'quality'");

In [None]:
# Box plots: one vertical box plot per numeric column, laid out on a grid.
# Only numeric columns are plotted; a box plot is not defined for text columns.
columns_to_plot = df.select_dtypes(include='number').columns
number_of_columns = 12  # plots per grid row
# Ceiling division gives enough rows for every plot (at least one row).
# The original `len(l)-1/number_of_columns` evaluated as `len(l) - (1/12)`,
# a float, which plt.subplot rejects because it needs integer grid dimensions.
number_of_rows = max(1, -(-len(columns_to_plot) // number_of_columns))

# Set the style before any axes are created so every subplot picks it up.
sns.set_style('whitegrid')
plt.figure(figsize=(number_of_columns, 5 * number_of_rows))
for i, col in enumerate(columns_to_plot):
    plt.subplot(number_of_rows, number_of_columns, i + 1)
    # Passing the data as `y` makes the vertical orientation explicit.
    sns.boxplot(y=df[col], color='green')
# Adjust the layout once, after all subplots exist, instead of on every iteration.
plt.tight_layout()

Some summary statistics 

# 5. Model Fitting <a id='fit'></a>

Linear Regression

In [None]:
# Fit a linear regression of the PGE forecast on demand and report R^2.
# The feature array must be 2D (n_samples, n_features); reshape(-1, 1) turns
# the 1D column into a single-feature 2D array.
# reshape is a NumPy operation, so convert each pandas Series to a NumPy array first.
demand_x = demand_forecast['demand'].to_numpy().reshape(-1, 1)
forecast_y = demand_forecast['PGE_forecast'].to_numpy().reshape(-1, 1)
model = LinearRegression().fit(demand_x, forecast_y)
r_sq = model.score(demand_x, forecast_y)
# score() may return a plain Python float, which has no .round() method;
# the built-in round() works for both Python and NumPy floats.
print(f"R^2 value is {round(r_sq, 3)}")

# 6. Interpret Results <a id='interpret'></a>