In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.axes_grid.anchored_artists import AnchoredText
import re
import folium
import progressbar
%matplotlib inline

In [None]:
# Graph formatting: a larger base font, with titles, axis labels, tick labels
# and legend text sized relative to it.
for option, setting in (
    ('font.size', 20.0),
    ('axes.titlesize', 'large'),
    ('axes.labelsize', 'medium'),
    ('xtick.labelsize', 'small'),
    ('ytick.labelsize', 'small'),
    ('legend.fontsize', 'small'),
):
    mpl.rcParams[option] = setting

In [None]:
# Force pandas & numpy to display all data
import sys

# Fully-qualified option names, so nothing depends on pandas resolving a
# partial key to a single option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 100000)
pd.set_option('display.max_seq_items', None)
# numpy rejects NaN as a print threshold; sys.maxsize is the supported way to
# switch off summarisation of long arrays.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Load the pickled source dataframe that every later cell works on.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
original_df_path = '../data/all-datasets/original_df'
df = pd.read_pickle(original_df_path)

In [None]:
# Mapping customers: one marker per row that has usable coordinates.
locations = df[['ga:latitude', 'ga:longitude']]
# Drop rows where EITHER coordinate is missing (the previous check looked at
# latitude only, so a missing longitude still reached folium.Marker).
valid_locations = locations.dropna()
# A latitude of exactly 0.0 is treated as "no location recorded" and skipped.
valid_locations = valid_locations[valid_locations['ga:latitude'] != 0.0]
locationlist = valid_locations.values.tolist()

cust_map = folium.Map()
for lat_lon in locationlist:
    folium.Marker(lat_lon).add_to(cust_map)
# Last expression of the cell, so the notebook renders the map.
cust_map

In [None]:
# Fraction of missing entries in each column of df
null_counts = df.isnull().sum()
nulls = pd.DataFrame(null_counts / float(df.shape[0]))

# Bar graph inputs: feature names for the y axis, null fractions for the x axis
# (column 0 is the single unnamed column created by the DataFrame conversion)
values = list(nulls[0])
categories = list(nulls.index)

# Tall horizontal seaborn bar chart, one bar per feature, saved to disk
fig = plt.figure(figsize=(15, 200))
ax = fig.add_subplot(111)
ax.set_title('Proportion of Null Values by Feature')
ax.set_xlabel('Proportion of Null Values')
sns.barplot(x=values, y=categories, orient='h', palette='pastel', ax=ax)
plt.savefig('../images/null_values.png', dpi=300)

In [None]:
# Checking for identifier columns to drop.
# This is a plain substring match on the column name, so it also flags names
# that merely contain one of the tokens.
id_tokens = ('id', 'email', 'name')
lst = [x for x in df.columns if any(token in x for token in id_tokens)]

for column in lst:
    try:
        uniques = df[column].unique()
    except Exception as err:
        # Still best effort, but say which column was skipped and why instead
        # of hiding the failure.
        print('Skipping {}: {}'.format(column, err))
        continue
    # Column name, number of distinct values, and a sample of up to 20 of them
    print(column, len(uniques))
    print(uniques[:20])
    print('\n\n')

In [None]:
# Identify (and later drop) columns with only one value OR one value with a nan
one_value = []
for col in df.columns:
    try:
        uniques = df[col].unique()
        # `v != v` is only true for NaN-like values, so the second case is
        # "exactly two distinct entries, one of which is a nan".
        is_single_valued = len(uniques) == 1 or (
            len(uniques) == 2 and any(v != v for v in uniques)
        )
    except Exception as err:
        # Best effort: report columns that cannot be checked rather than
        # dropping them from the scan without a trace.
        print('Skipping {}: {}'.format(col, err))
        continue
    if is_single_valued:
        one_value.append(col)

# Show what each flagged column actually contains. unique() has already
# succeeded for every column in this list, so no error handling is needed here.
for column in one_value:
    print(column)
    print(df[column].unique())
    print('\n\n')

In [None]:
# Set numerical & categorical values for use later
numeric_candidates = df.select_dtypes(include=['float64', 'int64'])
# Distinct values per column with NaN counted as a value, i.e. the same number
# as len(column.unique()).
distinct_counts = numeric_candidates.nunique(dropna=False)
# float/int columns with three or fewer distinct values are not kept as
# numerical; they fall into the categorical set below.
numerical_vals = numeric_candidates.loc[:, distinct_counts > 3]
# Everything that was not kept as numerical is treated as categorical.
categorical_vals = df.drop(numerical_vals.columns, axis=1)

In [None]:
# Review numeric columns that might really be categorical: those with fewer
# than 10 distinct values are printed with their values, and the remaining
# ones are listed by name when the name contains 'id'.
for x in numerical_vals.columns:
    distinct = df[x].unique()
    if len(distinct) >= 10:
        if 'id' in x:
            print(x)
        continue
    print(x)
    print(distinct)
    print('\n\n')

In [None]:
# Reset types: rebuild the numerical / categorical split from the current df.
# NOTE(review): no visible cell changes df's dtypes since the split was first
# computed, so as written this yields the same result - confirm it is needed.
numeric_candidates = df.select_dtypes(include=['float64', 'int64'])
# Distinct values per column with NaN counted as a value, i.e. the same number
# as len(column.unique()).
distinct_counts = numeric_candidates.nunique(dropna=False)
# float/int columns with three or fewer distinct values are not kept as
# numerical; they fall into the categorical set below.
numerical_vals = numeric_candidates.loc[:, distinct_counts > 3]
# Everything that was not kept as numerical is treated as categorical.
categorical_vals = df.drop(numerical_vals.columns, axis=1)

In [None]:
# Replace missing values in every numerical column with that column's mean so
# the violinplots below have no NaNs to deal with. This overwrites df itself.
for column in numerical_vals.columns:
    current = df[column]
    mean = current.mean()
    df[column] = current.fillna(mean)

In [None]:
# Violinplots of individual columns, each annotated with its quartiles and
# saved under ../images.
bar = progressbar.ProgressBar()
# Wrap the column labels, not the frame: len() of a DataFrame is its row count,
# so wrapping the frame gives the progress bar the wrong total.
for col in bar(numerical_vals.columns):
    series = df[col]
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.set_title(col)
    sns.violinplot(x=series, orient='v', ax=ax, palette='pastel')
    text = '75th Percentile: {}\nMedian: {}\n25th Percentile: {}'.format(
        np.percentile(series, 75), np.median(series), np.percentile(series, 25))
    # loc=1 anchors the summary box in the upper-right corner of the axes.
    at = AnchoredText(text, prop=dict(size=15), frameon=True, loc=1)
    ax.add_artist(at)
    # Explicit extension, so a '.' inside a column name is never taken as the
    # output format.
    plt.savefig('../images/violinplot_{}.png'.format(col))

In [None]:
# Bar graphs of individual columns: one count plot per categorical column with
# at most 100 distinct values, saved under ../images.
bar = progressbar.ProgressBar()
# Wrap the column labels, not the frame: len() of a DataFrame is its row count,
# so wrapping the frame gives the progress bar the wrong total.
for col in bar(categorical_vals.columns):
    fig = None
    try:
        n_levels = len(df[col].unique())
        # Columns with more than 100 categories are skipped.
        if n_levels > 100:
            continue
        # Figure height grows with the number of categories (one bar each).
        height = n_levels + 10
        fig = plt.figure(figsize=(20, height))
        ax = fig.add_subplot(111)
        ax.set_title(col)
        sns.countplot(y=df[col], ax=ax, palette='pastel')
        # Explicit extension, so a '.' inside a column name is never taken as
        # the output format.
        plt.savefig('../images/bargraph_{}.png'.format(col))
    except Exception as err:
        # Best effort per column: report the failure instead of hiding it, and
        # discard the unfinished figure so it is not left open.
        print('Skipping {}: {}'.format(col, err))
        if fig is not None:
            plt.close(fig)
        continue

In [None]:
# Violinplots of target by all object columns: distribution of the target for
# each value of every categorical column with at most 100 distinct values.
target = 'revenue:purchase_value'

bar = progressbar.ProgressBar()
# Wrap the column labels, not the frame: len() of a DataFrame is its row count,
# so wrapping the frame gives the progress bar the wrong total.
for col in bar(categorical_vals.columns):
    fig = None
    try:
        n_levels = len(df[col].unique())
        # Columns with more than 100 categories are skipped.
        if n_levels > 100:
            continue
        # Figure height grows with the number of categories (one violin each).
        height = n_levels + 10
        fig = plt.figure(figsize=(20, height))
        ax = fig.add_subplot(111)
        ax.set_title(col)
        # Both axes come from df. The previous version read the categories from
        # `train_df`, which no visible cell defines, and the resulting error
        # was swallowed so no plot was produced.
        sns.violinplot(y=df[col], x=df[target], orient='h', ax=ax, palette='pastel')
        # Explicit extension, so a '.' inside a column name is never taken as
        # the output format.
        plt.savefig('../images/{}_violinplot_{}.png'.format(target, col))
    except Exception as err:
        # Best effort per column: report the failure instead of hiding it, and
        # discard the unfinished figure so it is not left open.
        print('Skipping {}: {}'.format(col, err))
        if fig is not None:
            plt.close(fig)
        continue