# Data Science template to explore data and relationships

In [None]:
# You need to run this block of code to import the libraries that are needed for the data manipulations, visualizations, etc. 

from pydoc import help  # can type in the python console `help(name of function)` to get the documentation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy import stats
from IPython.display import display, HTML

## Import your data and start exploring
#### Note: the hash (pound) sign makes the computer skip the rest of that line of code, so we also use it to write comments inside a code block

In [None]:
# Read the csv file into a pandas DataFrame named df.
# Replace "PutYourFileNameHere.csv" with the name (or path) of your own file.
# The later cells in this template refer to the dataset as df.

df = pd.read_csv("PutYourFileNameHere.csv")

In [None]:
# Run this block of code to view the first rows of data. With empty parentheses, as here,
# head() shows five rows. If you want more rows, put the number in the parentheses:
# for example, df.head(10) will give you 10 rows.
# In Python the first row and column are numbered as zero.

df.head()

In [None]:
# Create a function that we can re-use
# This block defines show_distribution(), which prints summary statistics and draws a
# histogram plus a boxplot. Run it once; after that you only need to call the function.
def show_distribution(var_data):
    """Print summary statistics and plot the distribution of one variable.

    Prints the minimum, mean, median, mode and maximum, then draws one figure
    holding a histogram (top, with a dashed vertical line at each of the five
    statistics) and a horizontal boxplot (bottom).

    Parameters
    ----------
    var_data : pandas.Series
        Numeric column to examine, for example ``df['VarName']``. Missing
        values (NaN) are dropped before anything is computed or plotted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``var_data`` holds no non-missing values.
    """
    # pandas skips NaN when it computes the statistics below, but matplotlib
    # does not: a boxplot of data that contains NaN comes out empty. Dropping
    # NaN first keeps the printed numbers and both plots in agreement.
    var_data = var_data.dropna()
    if var_data.empty:
        raise ValueError('show_distribution needs at least one non-missing value')

    from matplotlib import pyplot as plt

    # Get statistics
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    # mode() returns every value tied for most frequent; report the first one
    mod_val = var_data.mode().iloc[0]

    print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(
        min_val, mean_val, med_val, mod_val, max_val))

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize=(10, 4))

    # Plot the histogram
    ax[0].hist(var_data)
    ax[0].set_ylabel('Frequency')

    # Add dashed lines for the minimum, mean, median, mode, and maximum
    marker_lines = (
        (min_val, 'gray'),
        (mean_val, 'cyan'),
        (med_val, 'red'),
        (mod_val, 'yellow'),
        (max_val, 'gray'),
    )
    for position, colour in marker_lines:
        ax[0].axvline(x=position, color=colour, linestyle='dashed', linewidth=2)

    # Plot the boxplot
    ax[1].boxplot(var_data, vert=False)
    ax[1].set_xlabel('Value')

    # Add a title to the Figure
    fig.suptitle('Data Distribution')

    # Show the figure. plt.show() is used instead of fig.show() because
    # fig.show() only works with a GUI backend and emits a warning otherwise.
    plt.show()


# For future histograms, just rerun the lines below after changing 'VarName' to the
# name of the variable (column) you want to examine.
# If you are using a new dataset with a different name, change df to whatever the name is for your dataset
# Get the variable to examine
col = df['VarName']
# Call the function: it prints the statistics and draws the histogram and boxplot
show_distribution(col)

In [None]:
# You only need this part of the code above for new histograms
# (show_distribution has to be defined already, by running the code that contains its def).
# Get the variable to examine - change 'VarName' to the name of your column
col = df['VarName']
# Call the function to print the statistics and draw the plots
show_distribution(col)

In [None]:
# Exploring relationships - this code draws a grid of pairwise scatter plots.
# Replace Var1, Var2, ... with your actual variable names; add or remove names as needed.
# If your dataset is called something other than df, change df to that name.

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

# numerical columns to compare with each other
features = [
    'Var1',
    'Var2',
    'Var3',
    'Var4',
]

# draw the scatter matrix for just those columns
scatter_matrix(frame=df[features], figsize=(12, 12))
plt.xticks(rotation=90)
plt.show()

In [None]:
# If you want to compare groups based on one or more quantitative variables, use this code.
# The first name in col_list is the grouping variable; the remaining names are the
# quantitative variables that get one set of boxes each.

# In the short_df = df[col_list] line of code, be sure to change df to whatever your dataset name is

col_list = ['GroupingVarName', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6']
short_df = df[col_list]

import seaborn as sns

# fixed seed so the same random rows are drawn every time the cell is run
rs=1999

# Plot a random sample of at most 1000 rows. The min(...) cap keeps this working on
# datasets with fewer than 1000 rows, where asking for 1000 rows without replacement
# raises a ValueError. melt() reshapes the sample to long form: one row per
# (group, variable name, value) combination, which is the layout sns.boxplot expects here.
df_long = pd.melt(short_df.sample(min(1000, len(short_df)), random_state=rs), "GroupingVarName", var_name="Columns", value_name="Values")
f,ax = plt.subplots(figsize=(16,8))
#plt.xticks(rotation=90) - use if you want to rotate the x axis tick labels
#plt.ylim(0, 10) - use if you want to limit the y axis to a min and/or max value
#plt.xlim(0, None) - use if you want to limit the x axis to a min and/or max value
sns.boxplot(x="Columns", y="Values", hue="GroupingVarName", data=df_long)

In [None]:
# Removing outliers - use this if you want to get rid of outliers and rerun the matrix.
# Add more names to the tuple below for additional variables.
# Typically we use 3 standard deviations (e.g., Six Sigma - 3 SDs above and below mean).

from scipy import stats

# df2 starts out as df. Each filter below builds a new DataFrame and stores it in df2,
# so no rows are removed from df itself.
df2 = df

# Filter on one variable at a time, keeping the rows whose absolute z score for that
# variable is below 3. The z scores are recomputed on the rows left by the previous filter.
for outlier_var in ('Var1', 'Var2', 'Var3', 'Var4'):
    df2 = df2[np.abs(stats.zscore(df2[outlier_var])) < 3]


In [None]:
# Use this if doing normalization - setting the scale to be between 0 and 1
# You would only do this if you want to standardize your variables and have all of them on same scale
# This would be preferred over creating z scores if your distribution is not normal

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Work on an independent copy so the original dataset keeps its values.
# A plain df2 = df would only give df a second name, and the assignment below
# would then overwrite the original columns in df as well.
df2 = df.copy()

# Apply the scaler only to the numeric columns listed here; leave out
# 'yes-no' and 'dummy' variables.
num_vars = ['Var1', 'Var2', 'Var3', 'Var4']
df2[num_vars] = scaler.fit_transform(df2[num_vars])

# Display the scaled dataset
df2

In [None]:
# Use this if doing standardization (z scores) - you would do this or the normalization above, but not both
# A z score is (value - mean) / standard deviation, so the result has zero as the mean.
# It is different than normalization, which rescales the values to lie between 0 and 1.
# std(ddof=0) divides by N (population standard deviation) instead of N - 1.
# Note: this line overwrites df['Var1'] with its z scores. Repeat it for each variable you want to standardize.

df['Var1'] = (df['Var1'] - df['Var1'].mean())/df['Var1'].std(ddof=0)

In [None]:
# Decomposition model to get seasonality, trend and noise
# This code splits a time series into 3 components (seasonal, trend, and noise/residual)
# Change Var1 with your variable of interest. Change df to your dataset name if it is different

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose

# period is the number of observations in one seasonal cycle (12 would suit, for example,
# monthly data with a yearly pattern) - change it to match your data.
# The keyword is period=, not freq=: statsmodels deprecated freq= and later removed it,
# so passing freq= fails on current versions.
# extrapolate_trend=12 asks statsmodels to extend the trend at both ends of the series
# (see the seasonal_decompose documentation for details).
result = seasonal_decompose(df['Var1'], model='additive', period=12, extrapolate_trend=12)

# Draw the observed series together with its trend, seasonal and residual components.
# A notebook shows the figure automatically; in a plain script call plt.show() afterwards.
result.plot()


In [None]:
# Run this code to add the three components of the decomposition to df as new columns,
# so that they are included when you save the dataset to a csv file.
# result is created by the seasonal_decompose code, so run that first.

# Trend component
df['trend'] = result.trend

# Residual (noise) component
df['residual'] = result.resid

# Seasonal component
df['seasonal'] = result.seasonal

# Show the first rows to check the new columns
df.head()

In [None]:
# Run this code if you get errors about infinity values or missing values, then rerun the time series.
# Be careful: this can leave you with no rows at all, so you may need to transform the data
# (or handle the problem values another way) instead of simply deleting them.

# Turn +/- infinity into NaN, then drop every row that contains a NaN
df = df.replace([np.inf, -np.inf], np.nan).dropna()
# Double check these are all gone - this should show False
df.isnull().any().any()

In [None]:
# Use this code to save your dataset (df) to a csv file named dataset.csv.
# The name is a relative path, so the file is written into the current working folder.
# NOTE: to_csv is called with its default options; per the pandas documentation this also
# writes the row index as the first column - use index=False if you do not want that.

df.to_csv('dataset.csv')

## This section provides examples for multiple linear regression

In [None]:
# This is code to run a (multiple) linear regression analysis
# Change df to be your dataset name if it is different
# Change Target to be your dependent variable
# Change Var1, Var2, Var3, Var4 to be the name of your predictor variables
# You can add more predictors if you want: add another "+ VarName" line to the formula

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std


# The formula reads "Target is modelled by Var1 + Var2 + Var3 + Var4".
# ols(...) builds the model from the columns of df and .fit() estimates it.
new_model = ols("""Target ~ Var1 
                                     + Var2 
                                     + Var3
                                     + Var4""", data=df).fit()
# summarize our model
new_model_summary = new_model.summary()
# render the summary table as HTML so it displays nicely in the notebook
HTML(new_model_summary.as_html())

In [None]:
# This code block draws regression diagnostic visualizations (partial regression plots)

# Create a large empty figure, then let statsmodels fill it with a grid of
# partial regression plots for new_model (the fitted regression model).
fig = plt.figure(figsize=(20,12))
fig = sm.graphics.plot_partregress_grid(new_model, fig=fig)

In [None]:
# This produces our four regression plots for a specified feature variable (e.g., one of your predictor variables)
# You would run this code once for each predictor variable, changing "Var1" each time

# empty figure that statsmodels will draw the plots into
fig = plt.figure(figsize=(15,8))

# pass in the model as the first parameter, then specify the 
# predictor variable we want to analyze
fig = sm.graphics.plot_regress_exog(new_model, "Var1", fig=fig)

## This section provides examples for decision trees 

In [None]:
# List of features (predictor variables) for the decision tree.
# Put every predictor column that you want the tree to use in this list.

features = ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6']

In [None]:
# This block of code runs the decision tree analysis (a regression tree)
# Replace TargetVar with the name of the target or dependent variable of interest
# max_depth and max_leaf_nodes are hyper parameters that are set by the researcher, they limit the size of the tree
# min_samples_split and min_samples_leaf are hyper parameters too; random_state fixes the seed so results repeat
# NOTE(review): the value 1000 for min_samples_split / min_samples_leaf looks sized for a large dataset -
# on a small dataset the tree may not split at all; confirm against the scikit-learn docs and lower if needed

from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn import datasets, tree

# y is the target column; X holds the predictor columns named in features, converted to float
y = df["TargetVar"]
X = df[features].astype(float)
dt = DecisionTreeRegressor(min_samples_split=1000, min_samples_leaf=1000, random_state=99, max_depth=5, max_leaf_nodes=7)
# fit the tree to the data
dt.fit(X, y)

In [None]:
# Creating a visual of the tree
# dt and X come from the decision tree code, so run that first

# use the predictor column names as labels in the drawing
feature_names = list(X.columns)

import graphviz

# export the fitted tree as Graphviz "dot" text (out_file=None returns it as a string
# instead of writing a file); filled=True colours the nodes
dot_data = tree.export_graphviz(dt, out_file=None, 
                                feature_names=feature_names,  
                                filled=True)
# build a Graphviz object from the dot text; as the last line of the cell it is displayed
graphviz.Source(dot_data, format="png") 